Commit 6bc146e9 authored by Alessia Marcolini
Browse files

Fix split management + black formatting

parent 1b06cc55
......@@ -37,7 +37,9 @@ parser.add_argument('--datafolder', type=str, help='Main data folder')
parser.add_argument('--outfolder', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--model', type=str, help='Classifiers implemented, randomForest or LSVM')
parser.add_argument(
'--model', type=str, help='Classifiers implemented, randomForest or LSVM'
)
parser.add_argument('--layers', type=str, nargs='+', help='')
parser.add_argument('--n_splits_end', type=int, help='')
parser.add_argument('--n_splits_start', type=int, help='')
......@@ -65,15 +67,31 @@ for k in range(2, N_LAYERS + 1):
for comb in combinations(LAYERS, k):
layers_concat = '_'.join(comb)
_, var_names, _ = load_data(os.path.join(DATAFOLDER, DATASET, TARGET, f'0/{layers_concat}_tr.txt') )
rankings = []
for i in range(N_SPLITS_START, N_SPLITS_END-N_SPLITS_START):
if MODE == 'rSNF':
file_ranking = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_rankList_ranking.csv.gz')
_, var_names, _ = load_data(
os.path.join(
DATAFOLDER, DATASET, TARGET, f'{N_SPLITS_START}/{layers_concat}_tr.txt'
)
)
rankings = []
for split_id in range(N_SPLITS_START, N_SPLITS_END):
if MODE == 'rSNF':
file_ranking = os.path.join(
OUTFOLDER,
DATASET,
TARGET,
MODEL,
f'{split_id}/{MODE}/{layers_concat}_tr_{MODEL}_rankList_ranking.csv.gz',
)
else:
file_ranking = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_ranking.csv.gz')
file_ranking = os.path.join(
OUTFOLDER,
DATASET,
TARGET,
MODEL,
f'{split_id}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_ranking.csv.gz',
)
rank = pd.read_csv(file_ranking, header=None, sep='\t').values
rankings.append(rank)
......@@ -83,6 +101,14 @@ for k in range(2, N_LAYERS + 1):
borda_df = pd.DataFrame(columns=["FEATURE_ID", "FEATURE_NAME", "MEAN_POS"])
for i, pos in zip(BORDA_ID, BORDA_POS):
borda_df = borda_df.append({'FEATURE_ID': i, 'FEATURE_NAME': var_names[i], 'MEAN_POS': pos+1}, ignore_index=True)
borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")
borda_df = borda_df.append(
{'FEATURE_ID': i, 'FEATURE_NAME': var_names[i], 'MEAN_POS': pos + 1},
ignore_index=True,
)
borda_df.to_csv(
f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}_{layers_concat}.txt",
sep='\t',
index=False,
float_format="%.3f",
)
#%%
import argparse
import operator
import os
from itertools import combinations
import numpy as np
import pandas as pd
import operator
import argparse
from itertools import combinations
#%%
class myArgumentParser(argparse.ArgumentParser):
......@@ -27,7 +29,9 @@ parser.add_argument('--datafolder', type=str, help='Main data folder')
parser.add_argument('--outfolder', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--model', type=str, help='Classifiers implemented, randomForest or LSVM')
parser.add_argument(
'--model', type=str, help='Classifiers implemented, randomForest or LSVM'
)
parser.add_argument('--layers', type=str, nargs='+', help='')
parser.add_argument('--n_splits_end', type=int, help='')
parser.add_argument('--n_splits_start', type=int, help='')
......@@ -52,17 +56,23 @@ N_LAYERS = len(LAYERS)
CV_K = 5
CV_N = 10
#%%
#%%
for k in range(2, N_LAYERS + 1):
for comb in combinations(LAYERS, k):
layers_concat = '_'.join(comb)
all_feats=[]
all_feats = []
for i in range(N_SPLITS_START, N_SPLITS_END-N_SPLITS_START):
file_featureList = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_featurelist.txt')
for i in range(N_SPLITS_START, N_SPLITS_END):
file_featureList = os.path.join(
OUTFOLDER,
DATASET,
TARGET,
MODEL,
f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_featurelist.txt',
)
feats = pd.read_csv(file_featureList, sep='\t')
all_feats.extend(list(feats.FEATURE_NAME))
......@@ -70,32 +80,51 @@ for k in range(2, N_LAYERS + 1):
positions = dict()
means = dict()
x=((len(all_feats)-1)*np.ones((1,(N_SPLITS_END-N_SPLITS_START)*CV_K*CV_N)))
x = (len(all_feats) - 1) * np.ones(
(1, (N_SPLITS_END - N_SPLITS_START) * CV_K * CV_N)
)
for i in all_feats:
positions[i]=x.tolist()[0]
means[i]=0.0
for i in range(N_SPLITS_START, N_SPLITS_END-N_SPLITS_START):
file_featureList = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_featurelist.txt')
positions[i] = x.tolist()[0]
means[i] = 0.0
for i, split_id in enumerate(range(N_SPLITS_START, N_SPLITS_END)):
file_featureList = os.path.join(
OUTFOLDER,
DATASET,
TARGET,
MODEL,
f'{split_id}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_featurelist.txt',
)
feats = pd.read_csv(file_featureList, sep='\t')
z=[None]*len(feats)
for k in range(len(feats)):
z[feats.FEATURE_ID[k]]=feats.FEATURE_NAME[k]
z = [None] * len(feats)
file_ranking = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_ranking.csv.gz')
for k in range(len(feats)):
z[feats.FEATURE_ID[k]] = feats.FEATURE_NAME[k]
file_ranking = os.path.join(
OUTFOLDER,
DATASET,
TARGET,
MODEL,
f'{split_id}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_ranking.csv.gz',
)
rankings = pd.read_csv(file_ranking, header=None, sep='\t')
for j in range(CV_K*CV_N):
for j in range(CV_K * CV_N):
for k in range(rankings.shape[1]):
positions[z[rankings.iloc[j][k]]][i*(CV_K*CV_N)+j]=1.0*k
positions[z[rankings.iloc[j][k]]][i * (CV_K * CV_N) + j] = 1.0 * k
for i in all_feats:
means[i]=np.mean(positions[i])
means[i] = np.mean(positions[i])
sorted_means = sorted(means.items(), key=operator.itemgetter(1))
borda_df = pd.DataFrame(sorted_means, columns=['FEATURE_NAME', 'MEAN_POS'])
borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")
\ No newline at end of file
borda_df.to_csv(
f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}_{layers_concat}.txt",
sep='\t',
index=False,
float_format="%.3f",
)
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment