Commit 02a4ef98 authored by Nicole Bussola
Browse files

compute borda on all splits for rSNFi

parent d3e96589
#%%
import os
import numpy as np
import pandas as pd
import operator
from input_output import load_data
#%%
# --- Run configuration ------------------------------------------------------
# DATA_PATH is not referenced in the visible part of this script.
DATA_PATH = 'data/tcga_breast/ER/'
# Root of the results tree; per-split files are read from <PATH>/<split>/<mode>/.
PATH = 'results/tcga_breast/ER/'
# Name of the sub-directory (inside each split) whose outputs are aggregated.
mode = 'rSNFi'
# How many of the layer names below are joined into the file-name prefix.
N_LAYERS = 3
layer1 = 'gene'
layer2 = 'cnv'
layer3 = 'prot'
# Number of split directories (0 .. N_SPLITS-1) that are visited.
N_SPLITS = 10
# CV_K * CV_N ranking rows are consumed per split
# (presumably K folds x N repetitions -- confirm against the pipeline).
CV_K = 5
CV_N = 10

if N_LAYERS == 3:
    layers = f'{layer1}_{layer2}_{layer3}'
    # Fails if a layer name itself contains '_', which would make the
    # joined prefix ambiguous.
    assert len(layers.split('_')) == 3
elif N_LAYERS == 2:
    layers = f'{layer1}_{layer2}'
    assert len(layers.split('_')) == 2
else:
    # Stop here: without `layers` every later file path would be undefined.
    raise ValueError(f'number of layers must be 2 or 3, got {N_LAYERS}')
#%%
# Union of the feature names listed in any split's feature-list file.
unique_feats = set()
for i in range(N_SPLITS):
    file_featureList = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_featurelist.txt')
    feats = pd.read_csv(file_featureList, sep='\t')
    unique_feats.update(feats.FEATURE_NAME)
# Sort so the feature order (and therefore the order of tied features in the
# final table) is the same on every run; plain set iteration order over
# strings changes between interpreter invocations. key=str keeps the sort
# well-defined even if a name was parsed as a non-string value.
all_feats = sorted(unique_feats, key=str)
#%%
# One rank slot per (split, ranking row) pair for every feature. Each slot
# starts at len(all_feats) - 1 and keeps that value unless the ranking loop
# further down overwrites it.
x = np.full((1, N_SPLITS * CV_K * CV_N), float(len(all_feats) - 1))
default_slots = x.tolist()[0]
#%%
# Every feature gets its own copy of the default slots, so updating one
# feature never touches another.
positions = {name: list(default_slots) for name in all_feats}
means = dict.fromkeys(all_feats, 0.0)
#%%
# Record, for every split and every ranking row, the column index at which
# each feature appears in that row.
runs_per_split = CV_K * CV_N
for i in range(N_SPLITS):
    file_featureList = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_featurelist.txt')
    feats = pd.read_csv(file_featureList, sep='\t')
    # FEATURE_ID -> FEATURE_NAME for this split. A dict does not require the
    # IDs to be exactly 0..len(feats)-1, unlike a positional list.
    id_to_name = dict(zip(feats.FEATURE_ID.tolist(), feats.FEATURE_NAME.tolist()))

    file_ranking = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_ranking.csv.gz')
    rankings = pd.read_csv(file_ranking, header=None, sep='\t')
    if rankings.shape[0] < runs_per_split:
        raise ValueError(
            f'{file_ranking}: expected at least {runs_per_split} ranking rows, '
            f'found {rankings.shape[0]}'
        )
    # Convert the needed rows to plain Python lists once, instead of building
    # a row Series for every single cell access.
    ranking_rows = rankings.iloc[:runs_per_split].values.tolist()

    # Slots of split i occupy [i * runs_per_split, (i + 1) * runs_per_split).
    offset = i * runs_per_split
    for j, row in enumerate(ranking_rows):
        for k, feat_id in enumerate(row):
            # k is the feature's column index in row j of the ranking file.
            positions[id_to_name[feat_id]][offset + j] = float(k)
#%%
#%%
# Mean recorded position of each feature over all of its slots.
means.update((name, np.mean(positions[name])) for name in all_feats)
# best_feat_steps = []
#%%
# Order features by ascending mean position; sorted() is stable, so features
# with equal means keep the order in which they were inserted into `means`.
sorted_means = sorted(means.items(), key=lambda item: item[1])
# Bare expression: only displays a value when run as an interactive cell.
len(sorted_means)
#%%
# Write the ranked table as tab-separated text with 3-decimal mean positions.
out_file = f"{PATH}/Borda_allSpilts_{mode}_{layers}.txt"
borda_df = pd.DataFrame(sorted_means, columns=['FEATURE_NAME', 'MEAN_POS'])
borda_df.to_csv(out_file, sep='\t', index=False, float_format="%.3f")
# %%
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment