Commit 9bcb72d0 authored by Nicole Bussola

fix concat_layers and add compute_all_metrics to pipeline

parent afbfd650
#%%
import pandas as pd
import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
from collections import Counter
from pathlib import Path
import os
#%%
PATH = 'results/tcga_breast/ER/'
mode = 'rSNFi'
N_LAYERS = 3
if N_LAYERS == 1:
    assert mode == 'single'
    single_layer = 'prot'
else:
    layer1 = 'gene'
    layer2 = 'cnv'
    layer3 = 'prot'
N_SPLIT = 10
all_mccs = []
all_sens = []
all_spec = []
all_test_mccs = []
#%%
best_feat_steps = []
for i in range(N_SPLIT):
    assert mode in ['juxt', 'rSNF', 'rSNFi', 'single']
    if N_LAYERS == 3:
        layers = f'{layer1}_{layer2}_{layer3}'
        assert len(layers.split('_')) == 3
    elif N_LAYERS == 2:
        layers = f'{layer1}_{layer2}'
        assert len(layers.split('_')) == 2
    else:
        layers = f'{single_layer}'
        assert len(layers.split('_')) == 1
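    # e.g., with N_LAYERS == 3 and the defaults above, layers == 'gene_cnv_prot'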
    if mode == 'rSNF':
        file_log = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_rankList.log')
        file_metrics = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_rankList_allmetrics.txt')
    else:
        file_log = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest.log')
        file_metrics = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_allmetrics.txt')
    with open(file_log) as f:
        log_content = f.readlines()
    for line in log_content:
        if 'mcc' in line:
            mcc_test_line = line
        if 'n_feats' in line:
            best_feat_line = line
            break
    best_feat = int(best_feat_line.split(' = ')[1][:-1])
    best_feat_steps.append(best_feat)
    mcc_test = float(mcc_test_line.split(' = ')[1][:-1])
    all_test_mccs.append(mcc_test)
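    # Parsing assumption: the log reports lines of the form 'mcc = <float>'
    # and 'n_feats = <int>'; the last 'mcc' line seen before the first
    # 'n_feats' line wins, and [:-1] strips the trailing newline.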
    all_metrics = pd.read_csv(file_metrics, sep='\t')
    best_idxs = np.where(all_metrics['nf'] == best_feat)[0]
    MCC = np.where(all_metrics.columns == 'mcc')[0][0]
    best_mccs = all_metrics.iloc[best_idxs, MCC]
    all_mccs.extend(best_mccs)
    SENS = np.where(all_metrics.columns == 'sens')[0][0]
    best_sens = all_metrics.iloc[best_idxs, SENS]
    all_sens.extend(best_sens)
    SPEC = np.where(all_metrics.columns == 'spec')[0][0]
    best_spec = all_metrics.iloc[best_idxs, SPEC]
    all_spec.extend(best_spec)

all_mccs = np.array(all_mccs)
all_sens = np.array(all_sens)
all_spec = np.array(all_spec)
all_test_mccs = np.array(all_test_mccs)
# %%
if N_LAYERS == 1:
    print(single_layer, 'best_feats =', Counter(best_feat_steps))
else:
    print(layers, 'best_feats =', Counter(best_feat_steps))
MCC_CI = bs.bootstrap(all_mccs, stat_func=bs_stats.mean)
print('MCC train =', round(np.mean(all_mccs), 3), (round(MCC_CI.lower_bound, 3), round(MCC_CI.upper_bound, 3)))
SENS_CI = bs.bootstrap(all_sens, stat_func=bs_stats.mean)
print('SENS =', round(np.mean(all_sens), 3), (round(SENS_CI.lower_bound, 3), round(SENS_CI.upper_bound, 3)))
SPEC_CI = bs.bootstrap(all_spec, stat_func=bs_stats.mean)
print('SPEC =', round(np.mean(all_spec), 3), (round(SPEC_CI.lower_bound, 3), round(SPEC_CI.upper_bound, 3)))
MCC_TEST = bs.bootstrap(all_test_mccs, stat_func=bs_stats.mean)
print('MCC test =', round(np.mean(all_test_mccs), 3), (round(MCC_TEST.lower_bound, 3), round(MCC_TEST.upper_bound, 3)))
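# Note: bs.bootstrap returns a BootstrapResults object whose .lower_bound and
# .upper_bound delimit the (default 95%) bootstrap confidence interval around
# the statistic computed by stat_func (here, the mean via bs_stats.mean).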
# %%
#%%
import pandas as pd
import argparse
from itertools import combinations
import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
from collections import Counter, OrderedDict
from pathlib import Path
import os
#%%
class myArgumentParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        super(myArgumentParser, self).__init__(*args, **kwargs)

    def convert_arg_line_to_args(self, line):
        for arg in line.split():
            if not arg.strip():
                continue
            if arg[0] == '#':
                break
            yield arg


parser = myArgumentParser(
    description='Compute metrics on all splits', fromfile_prefix_chars='@'
)
parser.add_argument('--outfolder', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--layers', type=str, nargs='+', help='Omics layers to combine')
parser.add_argument('--n_splits', type=int, help='Number of train/test splits')
parser.add_argument('--mode', type=str, help='juxt, rSNF, rSNFi, single')
args = parser.parse_args()
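# Thanks to fromfile_prefix_chars='@', arguments can also be read from a file,
# e.g. `python compute_all_metrics.py @args.txt` (hypothetical file name).
# convert_arg_line_to_args above splits each line into whitespace-separated
# tokens, skips blank lines, and drops everything from a '#' token on, so
#   --outfolder results  # output root
#   --dataset tcga_breast
# is parsed into individual arguments.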
#%%
OUTFOLDER = args.outfolder
DATASET = args.dataset
TARGET = args.target
LAYERS = args.layers
N_SPLITS = args.n_splits
MODE = args.mode
assert MODE in ['juxt', 'rSNF', 'rSNFi', 'single']
#%%
PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}'
N_LAYERS = len(LAYERS)
#%%
df_results = pd.DataFrame(
    columns=['layers', 'mcc_train', 'mcc_train_min', 'mcc_train_max',
             'auc_train', 'auc_train_min', 'auc_train_max',
             'sens_train', 'sens_train_min', 'sens_train_max',
             'spec_train', 'spec_train_min', 'spec_train_max',
             'mcc_test', 'mcc_test_min', 'mcc_test_max', 'best_feat'])
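# combinations(LAYERS, k) enumerates every unordered k-subset of the layers:
# e.g. for LAYERS = ['gene', 'cnv', 'prot'] and k = 2 it yields
# ('gene', 'cnv'), ('gene', 'prot') and ('cnv', 'prot'), so each multi-omics
# juxtaposition is evaluated exactly once.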
for k in range(2, N_LAYERS + 1):
    for comb in combinations(LAYERS, k):
        layers_concat = '_'.join(comb)
        all_mccs = []
        all_sens = []
        all_spec = []
        all_aucs = []
        all_test_mccs = []
        best_feat_steps = []
        for split_id in range(N_SPLITS):
            PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}/{split_id}'
            if MODE == 'rSNF':
                file_log = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_RandomForest_rankList.log')
                file_metrics = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_RandomForest_rankList_allmetrics.txt')
            else:
                file_log = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_RandomForest_KBest.log')
                file_metrics = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_RandomForest_KBest_allmetrics.txt')
            with open(file_log) as f:
                log_content = f.readlines()
            for line in log_content:
                if 'mcc' in line:
                    mcc_test_line = line
                if 'n_feats' in line:
                    best_feat_line = line
                    break
            best_feat = int(best_feat_line.split(' = ')[1][:-1])
            best_feat_steps.append(best_feat)
            mcc_test = float(mcc_test_line.split(' = ')[1][:-1])
            all_test_mccs.append(mcc_test)
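            # Assumption: *_allmetrics.txt holds one row per CV repetition and
            # feature step, with columns including 'nf', 'mcc', 'auc', 'sens'
            # and 'spec'; the rows at the best feature step are pooled across
            # all splits before bootstrapping.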
            all_metrics = pd.read_csv(file_metrics, sep='\t')
            best_idxs = np.where(all_metrics['nf'] == best_feat)[0]
            MCC = np.where(all_metrics.columns == 'mcc')[0][0]
            best_mccs = all_metrics.iloc[best_idxs, MCC]
            # print(np.mean(best_mccs), best_feat)
            all_mccs.extend(best_mccs)
            AUC = np.where(all_metrics.columns == 'auc')[0][0]
            best_auc = all_metrics.iloc[best_idxs, AUC]
            all_aucs.extend(best_auc)
            if TARGET != 'subtypes':
                SENS = np.where(all_metrics.columns == 'sens')[0][0]
                best_sens = all_metrics.iloc[best_idxs, SENS]
                all_sens.extend(best_sens)
                SPEC = np.where(all_metrics.columns == 'spec')[0][0]
                best_spec = all_metrics.iloc[best_idxs, SPEC]
                all_spec.extend(best_spec)
        all_mccs = np.array(all_mccs)
        MCC_CI = bs.bootstrap(all_mccs, stat_func=bs_stats.mean)
        print('MCC train =', round(np.mean(all_mccs), 3), (round(MCC_CI.lower_bound, 3), round(MCC_CI.upper_bound, 3)))
        all_aucs = np.array(all_aucs)
        AUC_CI = bs.bootstrap(all_aucs, stat_func=bs_stats.mean)
        print('AUC train =', round(np.mean(all_aucs), 3), (round(AUC_CI.lower_bound, 3), round(AUC_CI.upper_bound, 3)))
        all_test_mccs = np.array(all_test_mccs)
        MCC_TEST = bs.bootstrap(all_test_mccs, stat_func=bs_stats.mean)
        print('MCC test =', round(np.mean(all_test_mccs), 3), (round(MCC_TEST.lower_bound, 3), round(MCC_TEST.upper_bound, 3)))
        if TARGET != 'subtypes':
            all_sens = np.array(all_sens)
            all_spec = np.array(all_spec)
            SENS_CI = bs.bootstrap(all_sens, stat_func=bs_stats.mean)
            SPEC_CI = bs.bootstrap(all_spec, stat_func=bs_stats.mean)
            print('SENS =', round(np.mean(all_sens), 3), (round(SENS_CI.lower_bound, 3), round(SENS_CI.upper_bound, 3)))
            print('SPEC =', round(np.mean(all_spec), 3), (round(SPEC_CI.lower_bound, 3), round(SPEC_CI.upper_bound, 3)))
            row = OrderedDict({
                'layers': layers_concat,
                'mcc_train': round(np.mean(all_mccs), 3), 'mcc_train_min': round(MCC_CI.lower_bound, 3), 'mcc_train_max': round(MCC_CI.upper_bound, 3),
                'auc_train': round(np.mean(all_aucs), 3), 'auc_train_min': round(AUC_CI.lower_bound, 3), 'auc_train_max': round(AUC_CI.upper_bound, 3),
                'sens_train': round(np.mean(all_sens), 3), 'sens_train_min': round(SENS_CI.lower_bound, 3), 'sens_train_max': round(SENS_CI.upper_bound, 3),
                'spec_train': round(np.mean(all_spec), 3), 'spec_train_min': round(SPEC_CI.lower_bound, 3), 'spec_train_max': round(SPEC_CI.upper_bound, 3),
                'mcc_test': round(np.mean(all_test_mccs), 3), 'mcc_test_min': round(MCC_TEST.lower_bound, 3), 'mcc_test_max': round(MCC_TEST.upper_bound, 3),
                'best_feat': best_feat_steps})
        else:
            # no sens/spec columns for the multiclass 'subtypes' endpoint
            row = OrderedDict({
                'layers': layers_concat,
                'mcc_train': round(np.mean(all_mccs), 3), 'mcc_train_min': round(MCC_CI.lower_bound, 3), 'mcc_train_max': round(MCC_CI.upper_bound, 3),
                'auc_train': round(np.mean(all_aucs), 3), 'auc_train_min': round(AUC_CI.lower_bound, 3), 'auc_train_max': round(AUC_CI.upper_bound, 3),
                'sens_train': np.nan, 'sens_train_min': np.nan, 'sens_train_max': np.nan,
                'spec_train': np.nan, 'spec_train_min': np.nan, 'spec_train_max': np.nan,
                'mcc_test': round(np.mean(all_test_mccs), 3), 'mcc_test_min': round(MCC_TEST.lower_bound, 3), 'mcc_test_max': round(MCC_TEST.upper_bound, 3),
                'best_feat': best_feat_steps})
        print(layers_concat, MODE, 'best_feats =', Counter(best_feat_steps))
        print('\n')
        df_results = df_results.append(row, ignore_index=True)

df_results.to_csv(f'{OUTFOLDER}/{DATASET}/{TARGET}/metrics_allSplits_{MODE}.txt', sep='\t', index=False)
# %%
--- a/preprocessing/concat_layers.py
+++ b/preprocessing/concat_layers.py
@@ -43,7 +43,7 @@ print(LAYERS)
 for split_id in range(N_SPLITS):
     PATH = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_id}'
-    for k in range(2, N_SPLITS + 1):
+    for k in range(2, len(LAYERS) + 1):
         for comb in combinations(LAYERS, k):
             single_dfs_tr = []
@@ -57,8 +57,7 @@
             merged_ts = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_ts)
             layers_concat = '_'.join(comb)
-            merged_tr.to_csv(f'{PATH}/__{layers_concat}_tr.txt', sep='\t', index=False)
-            merged_ts.to_csv(f'{PATH}/__{layers_concat}_ts.txt', sep='\t', index=False)
+            merged_tr.to_csv(f'{PATH}/{layers_concat}_tr.txt', sep='\t', index=False)
+            merged_ts.to_csv(f'{PATH}/{layers_concat}_ts.txt', sep='\t', index=False)
 # %%
@@ -8,6 +8,7 @@ DATAFOLDER=data
 DATASET=tcga_breast
 LAYER1=gene
 LAYER2=cnv
+LAYER3=prot
 TARGET=subtypes
 N_SPLITS=10
@@ -15,10 +16,14 @@ N_SPLITS=10
 python preprocessing/concat_layers.py --datafolder $DATAFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 $LAYER3 --n_splits $N_SPLITS
-for i in {0..$N_SPLITS}
+for (( i=0; i<=$N_SPLITS; i++ ))
 do
-    snakemake -f Snakefile_split --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 split_id=$i -p
+    snakemake -s Snakefile_split --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 split_id=$i -p
 done
+for MODE in juxt rSNF rSNFi
+do
+    python postprocessing/compute_all_metrics.py --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 $LAYER3 --n_splits $N_SPLITS --mode $MODE
+done