# Pairwise regression statistics between rows of a CSV file.
#
# Every input row has the form "ID,value,value,...".  The rows whose ID is in
# sample_list, and an equally sized random control set, are each compared
# against all remaining rows; one result line per pair is written to
# sample_output and random_output respectively.

import math
import random

datafile = 'normalized_GSE2021.csv'
sample_output = 'NF_correlation.csv'
random_output = 'random_correlation.csv'
sample_list = ['02n12', '13d19', '05p15', '02g16', '03o23',
               '03j13', '13d08', '28h07', '13o12', '12p20']

# Column names written as the first line of both output files.
HEADER = ['sample_x', 'sample_y', 'covariance', 'sd_x', 'sd_y',
          'gradient', 'intercept']


def _report(*parts):
    """Print *parts* separated by single spaces.

    Produces the same text as the Python 2 statement ``print a, b`` while
    remaining valid (and identical in output) on Python 3.
    """
    print(' '.join(str(part) for part in parts))


def process(x, y):
    """Return least-squares statistics for the paired series *x* and *y*.

    Args:
        x: sequence of numbers (the independent variable).
        y: sequence of numbers, same length as *x*.

    Returns:
        A 5-tuple of floats
        ``(covariance, sd_x, sd_y, gradient, intercept)`` where

        * ``covariance`` is the sum of cross-products of the deviations from
          the means (it is NOT divided by the number of points),
        * ``sd_x`` / ``sd_y`` are the square roots of the sums of squared
          deviations (likewise not divided by the number of points), so
          ``covariance / (sd_x * sd_y)`` is Pearson's r,
        * ``gradient`` and ``intercept`` describe the least-squares line
          ``y = gradient * x + intercept``.

    Raises:
        ValueError: if the series are empty or differ in length.
        ZeroDivisionError: if every value of *x* is identical, because the
            gradient is then undefined.
    """
    n = len(x)
    if n == 0 or n != len(y):
        raise ValueError(
            'x and y must be non-empty and of equal length '
            '(got %d and %d)' % (n, len(y)))

    mean_x = float(sum(x)) / n
    mean_y = float(sum(y)) / n
    error_x = [value - mean_x for value in x]
    # Deviations of y are taken from the mean of y (the earlier code
    # subtracted mean_x here, which corrupted sd_y).
    error_y = [value - mean_y for value in y]

    covariance = sum(ex * ey for ex, ey in zip(error_x, error_y))
    squares_x = sum(ex * ex for ex in error_x)
    squares_y = sum(ey * ey for ey in error_y)

    gradient = covariance / squares_x
    intercept = mean_y - (gradient * mean_x)
    return (float(covariance), math.sqrt(squares_x), math.sqrt(squares_y),
            float(gradient), float(intercept))


def load_rows(path):
    """Read the comma-separated file at *path* into a list of field lists.

    Only the line terminator is stripped, so a final line without a trailing
    newline keeps its last character.  Blank lines are skipped.
    """
    with open(path, 'r') as handle:
        lines = (line.rstrip('\r\n') for line in handle)
        return [line.split(',') for line in lines if line.strip()]


def partition_rows(rows, sample_ids):
    """Sort the full data into three parts.

    Args:
        rows: list of field lists; element 0 of each row is its ID.
        sample_ids: list of IDs that make up the sample set.

    Returns:
        ``(sdata, rdata, data)`` where

        * ``sdata`` contains the rows whose ID is in *sample_ids*,
        * ``rdata`` contains n randomly chosen rows whose ID is not in
          *sample_ids* (n = number of IDs in *sample_ids*, capped at the
          number of such rows available),
        * ``data`` contains the rows whose ID is in neither of the above.
    """
    wanted = set(sample_ids)
    sdata = [row for row in rows if row[0] in wanted]
    candidates = [row for row in rows if row[0] not in wanted]

    # random.sample draws without replacement, so the control set never
    # contains the same row twice.
    n_random = min(len(sample_ids), len(candidates))
    rdata = random.sample(candidates, n_random)

    random_ids = {row[0] for row in rdata}
    data = [row for row in candidates if row[0] not in random_ids]
    return sdata, rdata, data


def run_correlation(sample_data, other_data, outfile):
    """Write one line of statistics for every (sample, other) pair of rows.

    Args:
        sample_data: rows (ID followed by numeric strings) used as x.
        other_data: rows (ID followed by numeric strings) used as y.
        outfile: writable text file object.  It is closed before this
            function returns, whether or not an error occurred.

    Each output line is ``ID1,ID2,covariance,sd_x,sd_y,gradient,intercept``.
    A progress message is printed after each sample row is finished.
    """
    try:
        # Convert the comparison rows once instead of once per sample row.
        others = [(row[0], [float(value) for value in row[1:]])
                  for row in other_data]
        for count, sample in enumerate(sample_data, 1):
            id1 = sample[0]
            d1 = [float(value) for value in sample[1:]]
            for id2, d2 in others:
                stats = process(d1, d2)
                fields = [id1, id2] + [str(value) for value in stats]
                outfile.write(','.join(fields) + '\n')
            # count is the number of sample rows completed so far.
            _report(count, ' processed. ID = ', id1)
    finally:
        outfile.close()


def main():
    """Load the data file and write both correlation tables."""
    sdata, rdata, data = partition_rows(load_rows(datafile), sample_list)

    _report(len(sdata), ' number of samples in sample list')
    _report(len(rdata), ' number of samples in random list')
    _report(len(data), ' number of samples in data list')

    header = ','.join(HEADER) + '\n'
    with open(sample_output, 'w') as sout, open(random_output, 'w') as rout:
        sout.write(header)
        rout.write(header)

        _report('Processing sample list')
        run_correlation(sdata, data, sout)
        _report()
        _report('Processing random list')
        run_correlation(rdata, data, rout)


if __name__ == '__main__':
    main()