Commit 55d7a92d authored by Nicole Bussola
Browse files

manage custom splits

parent 610a7d3f
......@@ -39,7 +39,8 @@ parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--model', type=str, help='Classifiers implemented, randomForest or LSVM')
parser.add_argument('--layers', type=str, nargs='+', help='')
parser.add_argument('--n_splits', type=int, help='')
# The splits to aggregate are given as a half-open range of split ids:
# --n_splits_start is the first id, --n_splits_end is one past the last id
# (the driver script loops with `i < N_SPLITS_END`).
parser.add_argument('--n_splits_end', type=int, help='Split id at which to stop (exclusive)')
parser.add_argument('--n_splits_start', type=int, help='First split id to process (inclusive)')
# Keep the help text in sync with the modes accepted by the assert on MODE below.
parser.add_argument('--mode', type=str, help='juxt, rSNF, single')
args = parser.parse_args()
......@@ -51,7 +52,8 @@ OUTFOLDER = args.outfolder
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS = args.n_splits
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
MODE = args.mode
assert MODE in ['juxt', 'rSNF', 'single']
......@@ -66,7 +68,7 @@ for k in range(2, N_LAYERS + 1):
_, var_names, _ = load_data(os.path.join(DATAFOLDER, DATASET, TARGET, f'0/{layers_concat}_tr.txt') )
rankings = []
for i in range(N_SPLITS):
for i in range(N_SPLITS_END-N_SPLITS_START):
if MODE == 'rSNF':
file_ranking = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_rankList_ranking.csv.gz')
......@@ -83,4 +85,4 @@ for k in range(2, N_LAYERS + 1):
# Add one row per feature, pairing each id in BORDA_ID with its entry in
# BORDA_POS; the stored MEAN_POS is pos+1 (presumably to make it 1-based —
# confirm against how BORDA_POS is computed).
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; on newer pandas this loop has to be ported to pd.concat (check
# that column dtypes / float_format output stay the same when doing so).
for i, pos in zip(BORDA_ID, BORDA_POS):
borda_df = borda_df.append({'FEATURE_ID': i, 'FEATURE_NAME': var_names[i], 'MEAN_POS': pos+1}, ignore_index=True)
borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_allSplits_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")
borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")
......@@ -29,7 +29,8 @@ parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--model', type=str, help='Classifiers implemented, randomForest or LSVM')
parser.add_argument('--layers', type=str, nargs='+', help='')
parser.add_argument('--n_splits', type=int, help='')
# The splits to aggregate are given as a half-open range of split ids:
# --n_splits_start is the first id, --n_splits_end is one past the last id
# (the driver script loops with `i < N_SPLITS_END`).
parser.add_argument('--n_splits_end', type=int, help='Split id at which to stop (exclusive)')
parser.add_argument('--n_splits_start', type=int, help='First split id to process (inclusive)')
parser.add_argument('--mode', type=str, help='rSNFi custom Borda')
args = parser.parse_args()
......@@ -41,7 +42,8 @@ OUTFOLDER = args.outfolder
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS = args.n_splits
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
MODE = args.mode
assert MODE == 'rSNFi'
......@@ -59,7 +61,7 @@ for k in range(2, N_LAYERS + 1):
all_feats=[]
for i in range(N_SPLITS):
# Gather the FEATURE_NAME column of the KBest feature list of every split in
# the requested range. `i` names the per-split folder, so it must run over the
# real split ids (start inclusive, end exclusive) used by the driver script,
# not over 0..(END-START)-1, otherwise the wrong folders are read whenever
# N_SPLITS_START != 0.
for i in range(N_SPLITS_START, N_SPLITS_END):
    file_featureList = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_featurelist.txt')
    feats = pd.read_csv(file_featureList, sep='\t')
    all_feats.extend(list(feats.FEATURE_NAME))
......@@ -68,13 +70,13 @@ for k in range(2, N_LAYERS + 1):
positions = dict()
means = dict()
x=((len(all_feats)-1)*np.ones((1,N_SPLITS*CV_K*CV_N)))
x=((len(all_feats)-1)*np.ones((1,(N_SPLITS_END-N_SPLITS_START)*CV_K*CV_N)))
for i in all_feats:
positions[i]=x.tolist()[0]
means[i]=0.0
for i in range(N_SPLITS):
# Second pass over the splits: re-read each split's KBest feature list.
# NOTE(review): `i` counts from 0, but the driver script runs split ids
# N_SPLITS_START..N_SPLITS_END-1, so the `{i}` folder below presumably points
# at the wrong split whenever N_SPLITS_START != 0 — confirm. The position
# buffer above is sized by the number of splits, so any indexing by `i` further
# down likely needs to stay 0-based; offset only the path components by
# N_SPLITS_START.
for i in range(N_SPLITS_END-N_SPLITS_START):
file_featureList = os.path.join(OUTFOLDER, DATASET, TARGET, MODEL, f'{i}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_featurelist.txt')
feats = pd.read_csv(file_featureList, sep='\t')
......@@ -96,4 +98,4 @@ for k in range(2, N_LAYERS + 1):
sorted_means = sorted(means.items(), key=operator.itemgetter(1))
borda_df = pd.DataFrame(sorted_means, columns=['FEATURE_NAME', 'MEAN_POS'])
borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_allSplits_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")
\ No newline at end of file
borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")
\ No newline at end of file
......@@ -35,7 +35,8 @@ parser.add_argument(
'--model', type=str, help='classifiers implemented, randomForest or LSVM'
)
parser.add_argument('--layers', type=str, nargs='+', help='')
parser.add_argument('--n_splits', type=int, help='')
# The splits to aggregate are given as a half-open range of split ids:
# --n_splits_start is the first id, --n_splits_end is one past the last id
# (the driver script loops with `i < N_SPLITS_END`).
parser.add_argument('--n_splits_end', type=int, help='Split id at which to stop (exclusive)')
parser.add_argument('--n_splits_start', type=int, help='First split id to process (inclusive)')
parser.add_argument('--mode', type=str, help='juxt, rSNF, rSNFi, single')
args = parser.parse_args()
......@@ -46,7 +47,8 @@ DATASET = args.dataset
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS = args.n_splits
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
MODE = args.mode
assert MODE in ['juxt', 'rSNF', 'rSNFi', 'single']
......@@ -98,7 +100,7 @@ for k in range_combinations:
all_test_mccs = []
best_feat_steps = []
for split_id in range(N_SPLITS):
for split_id in range(N_SPLITS_END-N_SPLITS_START):
PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/{split_id}'
......@@ -255,7 +257,7 @@ for k in range_combinations:
df_results = df_results.append(row, ignore_index=True)
df_results.to_csv(
f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_allSplits_{MODE}.txt',
f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}.txt',
sep='\t',
index=False,
)
......
......@@ -5,33 +5,34 @@ THREADS=12
OUTFOLDER=results
DATAFOLDER=data
DATASET=tcga_aml
LAYER1=gene
LAYER2=meth
LAYER3=mirna
TARGET=OS
DATASET=tcga_breast
LAYER1=cnv
LAYER2=prot
LAYER3=prot
TARGET=ER
MODEL=LSVM
N_SPLITS=10
N_SPLITS_START=0
N_SPLITS_END=10
# go!
for (( i=0; i<$N_SPLITS; i++ ))
for (( i=$N_SPLITS_START; i<$N_SPLITS_END; i++ ))
do
snakemake -s Snakefile_split --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 layer3=$LAYER3 model=$MODEL split_id=$i -p
snakemake -s Snakefile_split --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 model=$MODEL split_id=$i -p
done
for MODE in juxt rSNF rSNFi single
do
python postprocessing/compute_all_metrics.py --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 $LAYER3 --model $MODEL --n_splits $N_SPLITS --mode $MODE
# The script no longer has --n_splits; the split range is passed as two
# separate options (start inclusive, end exclusive).
python postprocessing/compute_all_metrics.py --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 --model $MODEL --n_splits_start $N_SPLITS_START --n_splits_end $N_SPLITS_END --mode $MODE
done
for MODE in juxt rSNF
do
python postprocessing/borda_global_juxt_rSNF.py --datafolder $DATAFOLDER --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 $LAYER3 --model $MODEL --n_splits $N_SPLITS --mode $MODE
# The script no longer has --n_splits; the split range is passed as two
# separate options (start inclusive, end exclusive).
python postprocessing/borda_global_juxt_rSNF.py --datafolder $DATAFOLDER --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 --model $MODEL --n_splits_start $N_SPLITS_START --n_splits_end $N_SPLITS_END --mode $MODE
done
python postprocessing/borda_global_rSNFi.py --datafolder $DATAFOLDER --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 $LAYER3 --model $MODEL --n_splits $N_SPLITS --mode rSNFi
# The script no longer has --n_splits; the split range is passed as two
# separate options (start inclusive, end exclusive).
python postprocessing/borda_global_rSNFi.py --datafolder $DATAFOLDER --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 --model $MODEL --n_splits_start $N_SPLITS_START --n_splits_end $N_SPLITS_END --mode rSNFi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment