# Commit d3e96589, authored by Nicole Bussola (parent 736968b3)
#
# Compute Borda on all splits for juxt and rSNF
#%%
import os
import numpy as np
import pandas as pd
from mlpy import borda_count
from input_output import load_data
#%%
# ---------------------------------------------------------------------------
# Run configuration.
# Edit the constants in this cell to choose which results are aggregated.
# ---------------------------------------------------------------------------
DATA_PATH = 'data/tcga_breast/subtypes'    # root of the per-split input data
PATH = 'results/tcga_breast/subtypes'      # root of the per-split results

# `mode` names the results sub-directory that is read for every split.
mode = 'rSNF'
if mode not in ('juxt', 'rSNF', 'single'):
    # Explicit raise instead of `assert`: asserts vanish under `python -O`.
    raise ValueError(f"mode must be 'juxt', 'rSNF' or 'single', got {mode!r}")

# Number of omics layers that were combined (1, 2 or 3).
N_LAYERS = 3
if N_LAYERS == 1:
    if mode != 'single':
        raise ValueError("N_LAYERS == 1 requires mode == 'single'")
    single_layer = 'prot'
    layers = f'{single_layer}'
elif N_LAYERS in (2, 3):
    layer1 = 'gene'
    layer2 = 'cnv'
    layer3 = 'prot'
    # Keep the first N_LAYERS names, joined the way the file names expect.
    layers = '_'.join((layer1, layer2, layer3)[:N_LAYERS])
else:
    # Previously this case fell through to code reading the undefined
    # `single_layer` and failed with a NameError.
    raise ValueError(f'N_LAYERS must be 1, 2 or 3, got {N_LAYERS}')

# A layer name containing '_' would make the combined label ambiguous.
if len(layers.split('_')) != N_LAYERS:
    raise ValueError(
        f"layer label {layers!r} does not split into {N_LAYERS} names; "
        "layer names must not contain '_'"
    )

N_SPLIT = 10    # number of split directories (0 .. N_SPLIT-1) that are read
CV_K = 5        # NOTE(review): presumably the number of CV folds - unused here
CV_N = 10       # NOTE(review): presumably the number of CV repeats - unused here
# Feature names are taken from the `_tr` file of split 0 only.
# NOTE(review): assumes load_data returns the feature names as its second
# element and that they are the same for every split - confirm in input_output.
_, var_names, _ = load_data(os.path.join(DATA_PATH, f'0/{layers}_tr.txt'))
rankings = []
#%%
# Collect the ranking table of every split. The rSNF runs store theirs under
# a `rankList` file name, every other mode under `KBest`.
for i in range(N_SPLIT):
    file_ranking = os.path.join(
        PATH,
        f'{i}/{mode}/{layers}_tr_RandomForest_rankList_ranking.csv.gz'
        if mode == 'rSNF'
        else f'{i}/{mode}/{layers}_tr_RandomForest_KBest_ranking.csv.gz',
    )
    # Tab-separated, no header row; keep only the raw values.
    rank = pd.read_csv(file_ranking, header=None, sep='\t').values
    rankings.append(rank)

# Stack row-wise: every row of every split file becomes one row of `rankings`.
rankings = np.vstack(rankings)
# %%
# Aggregate all stacked rankings into a single consensus ordering.
# NOTE(review): per the mlpy documentation borda_count returns
# (sorted ids, number of extractions, mean positions) - confirm against the
# installed mlpy version.
BORDA_ID, _, BORDA_POS = borda_count(rankings)
len(rankings), BORDA_ID  # bare expression: only shown when run as an interactive cell
# %%
# Build the whole table in one step. DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0, and appending row by row re-copies
# the frame on every iteration.
borda_rows = [
    {
        'FEATURE_ID': feature_id,
        'FEATURE_NAME': var_names[feature_id],
        # NOTE(review): +1 presumably turns a 0-based mean position into a
        # 1-based one - confirm against borda_count's convention.
        'MEAN_POS': mean_pos + 1,
    }
    for feature_id, mean_pos in zip(BORDA_ID, BORDA_POS)
]
# Passing `columns` keeps the header and column order even if there are no rows.
borda_df = pd.DataFrame(borda_rows, columns=["FEATURE_ID", "FEATURE_NAME", "MEAN_POS"])

# The "allSpilts" spelling is kept on purpose: other tooling may already read
# this exact path.
borda_df.to_csv(f"{PATH}/Borda_allSpilts_{mode}_{layers}.txt", sep='\t', index=False, float_format="%.3f")
# %%
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment