Commit 2203df57 authored by Nicole Bussola's avatar Nicole Bussola
Browse files

Borda on all splits for rSNFi mode

parent 851220ad
#%%
import os
import numpy as np
import pandas as pd
import operator
from input_output import load_data
#%%
# Locations and experiment settings used by the Borda aggregation below.
DATA_PATH = 'data/tcga_breast/ER/'
PATH = 'results/tcga_breast/ER/'  # root folder holding one subfolder per split
mode = 'rSNFi'
N_LAYERS = 3
layer1 = 'gene'
layer2 = 'cnv'
layer3 = 'prot'
N_SPLITS = 10  # split subfolders 0 .. N_SPLITS-1 are read
# CV_K * CV_N ranking rows are consumed per split
# (presumably K folds x N repetitions -- confirm against the ranking producer).
CV_K = 5
CV_N = 10

# Build the layer tag that appears in the input/output file names.
if N_LAYERS == 3:
    layers = f'{layer1}_{layer2}_{layer3}'
elif N_LAYERS == 2:
    layers = f'{layer1}_{layer2}'
else:
    # Fail here instead of printing and leaving `layers` undefined, which
    # would only surface later as a NameError when the paths are built.
    raise ValueError(f'number of layers must be 2 or 3, got {N_LAYERS}')
# A layer name containing '_' would make the tag ambiguous.
assert len(layers.split('_')) == N_LAYERS
#%%
# Gather the union of the feature names listed in any split's feature list.
unique_feats = set()
for split_idx in range(N_SPLITS):
    file_featureList = os.path.join(
        PATH,
        f'{split_idx}/{mode}/{layers}_tr_RandomForest_KBest_featurelist.txt',
    )
    feats = pd.read_csv(file_featureList, sep='\t')
    unique_feats.update(feats.FEATURE_NAME)
all_feats = list(unique_feats)
#%%
# One slot per (split, run); every slot starts at len(all_feats) - 1
# (presumably the worst possible rank, kept when a feature is never ranked).
n_runs = N_SPLITS * CV_K * CV_N
x = np.ones((1, n_runs)) * (len(all_feats) - 1)
#%%
default_positions = x.tolist()[0]
# Each feature gets its own copy of the default list so updates do not alias.
positions = {feat: list(default_positions) for feat in all_feats}
means = dict.fromkeys(all_feats, 0.0)
#%%
# For every split and every run, record the rank (column index in the ranking
# file) at which each feature appears.
runs_per_split = CV_K * CV_N
for i in range(N_SPLITS):
    file_featureList = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_featurelist.txt')
    feats = pd.read_csv(file_featureList, sep='\t')
    # z[FEATURE_ID] -> FEATURE_NAME for this split.
    z = [None] * len(feats)
    for feat_id, feat_name in zip(feats.FEATURE_ID, feats.FEATURE_NAME):
        z[feat_id] = feat_name
    file_ranking = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_ranking.csv.gz')
    rankings = pd.read_csv(file_ranking, header=None, sep='\t')
    # Convert once per split: rankings.iloc[j][k] would build a new row Series
    # for every single cell.
    ranks = rankings.to_numpy()
    first_run = i * runs_per_split  # offset of this split's runs in each list
    for j in range(runs_per_split):
        for k, feat_id in enumerate(ranks[j]):
            positions[z[feat_id]][first_run + j] = float(k)
#%%
#%%
# Replace each feature's placeholder with the mean of its recorded positions.
means.update((feat, np.mean(positions[feat])) for feat in all_feats)
# best_feat_steps = []
#%%
# Order features by ascending mean position (lowest mean first).
sorted_means = list(means.items())
sorted_means.sort(key=operator.itemgetter(1))
len(sorted_means)
#%%
# Write the ranked list as a tab-separated table with 3-decimal means.
borda_df = pd.DataFrame(sorted_means, columns=['FEATURE_NAME', 'MEAN_POS'])
borda_file = f"{PATH}/Borda_allSpilts_{mode}_{layers}.txt"
borda_df.to_csv(borda_file, sep='\t', index=False, float_format="%.3f")
# %%
#%%
import os
import numpy as np
import pandas as pd
import operator
import argparse
from itertools import combinations
#%%
class myArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that accepts several arguments and ``#`` comments per
    line when arguments are read from a file."""

    def convert_arg_line_to_args(self, line):
        """Yield the arguments contained in one line of an argument file.

        The line is split on whitespace; a token starting with ``#`` ends the
        line, so it and everything after it are ignored.

        Args:
            line: One raw line of text.

        Yields:
            Each whitespace-separated token that precedes the first token
            starting with ``#``.
        """
        # str.split() with no separator never returns empty tokens, so no
        # blank-token check is needed before looking at the first character.
        for arg in line.split():
            if arg.startswith('#'):
                break
            yield arg
# Command-line interface; arguments may also be supplied from a file via
# the '@' prefix (fromfile_prefix_chars).
parser = myArgumentParser(
    description='Borda ranking of features over all splits (rSNFi mode)',
    fromfile_prefix_chars='@',
)
parser.add_argument('--datafolder', type=str, help='Main data folder')
# The arguments below are all needed to build the input/output paths or to
# drive the loops, so a missing one is reported by argparse up front instead
# of failing later with a TypeError on None.
parser.add_argument('--outfolder', type=str, required=True, help='Results folder')
parser.add_argument('--dataset', type=str, required=True, help='Dataset name')
parser.add_argument('--target', type=str, required=True, help='Clinical endpoint')
parser.add_argument(
    '--layers', type=str, nargs='+', required=True,
    help='Omic layer names; every combination of two or more is processed',
)
parser.add_argument(
    '--n_splits', type=int, required=True,
    help='Number of splits (subfolders 0 .. n_splits-1 are read)',
)
parser.add_argument('--mode', type=str, required=True, help='rSNFi custom Borda')
args = parser.parse_args()

DATAFOLDER = args.datafolder
DATASET = args.dataset
OUTFOLDER = args.outfolder
TARGET = args.target
LAYERS = args.layers
N_SPLITS = args.n_splits
MODE = args.mode
# Explicit check rather than `assert`, which is stripped under `python -O`.
if MODE != 'rSNFi':
    parser.error(f"--mode must be 'rSNFi', got {MODE!r}")
N_LAYERS = len(LAYERS)
# CV_K * CV_N ranking rows are consumed per split
# (presumably K folds x N repetitions -- confirm against the ranking producer).
CV_K = 5
CV_N = 10
#%%
# Borda aggregation for every combination of two or more layers: each feature
# is scored by its mean rank position over all runs of all splits, and the
# features are written out sorted by that mean.
runs_per_split = CV_K * CV_N
results_dir = os.path.join(OUTFOLDER, DATASET, TARGET)
# The combination size gets its own name; the original reused `k` for the
# inner loops, shadowing the outer loop variable.
for n_comb in range(2, N_LAYERS + 1):
    for comb in combinations(LAYERS, n_comb):
        layers_concat = '_'.join(comb)

        # Read each split's feature list once and keep it for the second pass.
        feature_tables = []
        all_feats = []
        for i in range(N_SPLITS):
            file_featureList = os.path.join(results_dir, f'{i}/{MODE}/{layers_concat}_tr_RandomForest_KBest_featurelist.txt')
            feats = pd.read_csv(file_featureList, sep='\t')
            feature_tables.append(feats)
            all_feats.extend(feats.FEATURE_NAME)
        all_feats = list(set(all_feats))

        # One slot per (split, run), starting at len(all_feats) - 1; a slot is
        # only overwritten when the feature appears in that run's ranking.
        default_position = float(len(all_feats) - 1)
        n_runs = N_SPLITS * runs_per_split
        positions = {feat: [default_position] * n_runs for feat in all_feats}

        for i, feats in enumerate(feature_tables):
            # z[FEATURE_ID] -> FEATURE_NAME for this split.
            z = [None] * len(feats)
            for feat_id, feat_name in zip(feats.FEATURE_ID, feats.FEATURE_NAME):
                z[feat_id] = feat_name
            file_ranking = os.path.join(results_dir, f'{i}/{MODE}/{layers_concat}_tr_RandomForest_KBest_ranking.csv.gz')
            rankings = pd.read_csv(file_ranking, header=None, sep='\t')
            # Convert once per split: rankings.iloc[j][pos] would build a new
            # row Series for every single cell.
            ranks = rankings.to_numpy()
            first_run = i * runs_per_split
            for j in range(runs_per_split):
                for pos, feat_id in enumerate(ranks[j]):
                    positions[z[feat_id]][first_run + j] = float(pos)

        means = {feat: np.mean(positions[feat]) for feat in all_feats}
        sorted_means = sorted(means.items(), key=operator.itemgetter(1))
        borda_df = pd.DataFrame(sorted_means, columns=['FEATURE_NAME', 'MEAN_POS'])
        borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/__Borda_allSpilts_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment