Commit 910072b4 authored by Alessia Marcolini
Browse files

Compute mean and median number of features

parent 0a61bbaa
#%%
import argparse
import os
from collections import Counter, OrderedDict
from itertools import combinations
from pathlib import Path

import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import numpy as np
import pandas as pd
#%%
class myArgumentParser(argparse.ArgumentParser):
def __init__(self, *args, **kwargs):
# (diff hunk @@ -21,13 +24,16 @@ — intervening lines of class myArgumentParser omitted by the diff view)
break
yield arg
# Command-line interface: locates the per-split results on disk.
# Arguments can also be read from a file via the '@file' prefix.
parser = myArgumentParser(
    description='Compute metrics on all splits', fromfile_prefix_chars='@'
)
parser.add_argument('--outfolder', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
# NOTE: the diff residue registered '--model' twice, which raises
# argparse.ArgumentError ("conflicting option string"); keep one.
parser.add_argument(
    '--model', type=str, help='classifiers implemented, randomForest or LSVM'
)
parser.add_argument('--layers', type=str, nargs='+', help='')
parser.add_argument('--n_splits', type=int, help='')
parser.add_argument('--mode', type=str, help='juxt, rSNF, rSNFi, single')
# (diff hunk @@ -51,12 +57,30 @@ — omitted context; enclosing section: N_LAYERS = len(LAYERS))
#%%
# Summary table: one row per layer combination, holding the bootstrap mean
# and confidence-interval bounds of each training metric, the test MCC, the
# per-split best feature counts, and (added by this commit) their mean and
# median. 'mean_features'/'median_features' are declared here so the column
# order is stable when rows are appended below.
df_results = pd.DataFrame(
    columns=[
        'layers',
        'mcc_train',
        'mcc_train_min',
        'mcc_train_max',
        'auc_train',
        'auc_train_min',
        'auc_train_max',
        'sens_train',
        'sens_train_min',
        'sens_train_max',
        'spec_train',
        'spec_train_min',
        'spec_train_max',
        'mcc_test',
        'mcc_test_min',
        'mcc_test_max',
        'best_feat',
        'mean_features',
        'median_features',
    ]
)

# 'single' mode evaluates each omics layer on its own (combination size 1);
# every other mode evaluates all combinations of 2..N_LAYERS layers.
if MODE == 'single':
    range_combinations = range(1, 2)
else:
    range_combinations = range(2, N_LAYERS + 1)
# (diff hunk @@ -77,14 +101,22 @@ — omitted context; enclosing section: for k in range_combinations:)
# Loop over cross-validation splits: locate each split's feature-ranking
# log and per-feature-count metrics table for the current layer combination.
# NOTE(review): this file is a rendered diff — several statements below
# appear twice (pre- and post-reformat form); the parenthesised multi-line
# form is the post-commit version. Indentation was flattened by the scrape.
for split_id in range(N_SPLITS):
# Per-split results directory for this dataset/endpoint/classifier.
PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/{split_id}'
if MODE == 'rSNF':
file_log = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_rankList.log')
file_metrics = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_rankList_allmetrics.txt')
if MODE == 'rSNF':
# rSNF mode reads the '*_rankList' log and metrics files.
file_log = os.path.join(
PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_rankList.log'
)
file_metrics = os.path.join(
PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_rankList_allmetrics.txt'
)
else:
file_log = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_KBest.log')
file_metrics = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_KBest_allmetrics.txt')
# All other modes read the '*_KBest' log and metrics files.
file_log = os.path.join(
PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_KBest.log'
)
file_metrics = os.path.join(
PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_KBest_allmetrics.txt'
)
# Read the whole training log; it is scanned further down for the
# selected number of features.
with open(file_log) as f:
log_content = f.readlines()
# (diff hunk @@ -97,7 +129,6 @@ — omitted context; enclosing section: for k in range_combinations:)
# Fragment of a scan over the log lines (the loop header is elided by the
# diff view): remember the first matching line, then stop.
best_feat_line = line
break
# The matched log line ends in ' = <N>\n'; take the value after ' = ' and
# strip the trailing newline with [:-1] before converting to int.
best_feat = int(best_feat_line.split(' = ')[1][:-1])
best_feat_steps.append(best_feat)
# (diff hunk @@ -105,65 +136,127 @@ — omitted context; enclosing section: for k in range_combinations:)
# Collect this split's test MCC (mcc_test is computed in lines elided by
# the diff view).
# NOTE(review): duplicated old/new diff lines are interleaved below; the
# reformatted (spaced / multi-line) variant is the post-commit version.
all_test_mccs.append(mcc_test)
# %%
# Per-feature-count metrics table for this split (tab-separated).
all_metrics = pd.read_csv(file_metrics, sep='\t')
# Row indices where 'nf' (number of features) equals the selected best_feat.
best_idxs = np.where(all_metrics["nf"]==best_feat)[0]
best_idxs = np.where(all_metrics["nf"] == best_feat)[0]
# Positional index of the 'mcc' column, used with iloc below.
MCC = np.where(all_metrics.columns=='mcc')[0][0]
MCC = np.where(all_metrics.columns == 'mcc')[0][0]
best_mccs = all_metrics.iloc[best_idxs, MCC]
# print(np.mean(best_mccs), best_feat)
all_mccs.extend(best_mccs)
# Same positional lookup for the 'auc' column.
AUC = np.where(all_metrics.columns=='auc')[0][0]
AUC = np.where(all_metrics.columns == 'auc')[0][0]
best_auc = all_metrics.iloc[best_idxs, AUC]
all_aucs.extend(best_auc)
# NOTE(review): sens/spec are skipped for the 'subtypes' endpoint —
# presumably a multi-class target where they are undefined; confirm.
if TARGET!='subtypes':
SENS = np.where(all_metrics.columns=='sens')[0][0]
if TARGET != 'subtypes':
SENS = np.where(all_metrics.columns == 'sens')[0][0]
best_sens = all_metrics.iloc[best_idxs, SENS]
all_sens.extend(best_sens)
SPEC = np.where(all_metrics.columns=='spec')[0][0]
SPEC = np.where(all_metrics.columns == 'spec')[0][0]
best_spec = all_metrics.iloc[best_idxs, SPEC]
all_spec.extend(best_spec)
# Bootstrap confidence interval of the mean training MCC across all splits.
all_mccs = np.array(all_mccs)
MCC_CI = bs.bootstrap(all_mccs, stat_func=bs_stats.mean)
print('MCC train =', round(np.mean(all_mccs),3), (round(MCC_CI.lower_bound,3), round(MCC_CI.upper_bound,3)))
print(
'MCC train =',
round(np.mean(all_mccs), 3),
(round(MCC_CI.lower_bound, 3), round(MCC_CI.upper_bound, 3)),
)
# Same bootstrap CI for the training AUC.
all_aucs = np.array(all_aucs)
AUC_CI = bs.bootstrap(all_aucs, stat_func=bs_stats.mean)
print('AUC train =', round(np.mean(all_aucs),3), (round(AUC_CI.lower_bound,3), round(AUC_CI.upper_bound,3)))
AUC_CI = bs.bootstrap(all_aucs, stat_func=bs_stats.mean)
print(
'AUC train =',
round(np.mean(all_aucs), 3),
(round(AUC_CI.lower_bound, 3), round(AUC_CI.upper_bound, 3)),
)
# And for the test MCC.
all_test_mccs = np.array(all_test_mccs)
MCC_TEST = bs.bootstrap(all_test_mccs, stat_func=bs_stats.mean)
print('MCC test =', round(np.mean(all_test_mccs),3), (round(MCC_TEST.lower_bound,3), round(MCC_TEST.upper_bound,3)))
MCC_TEST = bs.bootstrap(all_test_mccs, stat_func=bs_stats.mean)
print(
'MCC test =',
round(np.mean(all_test_mccs), 3),
(round(MCC_TEST.lower_bound, 3), round(MCC_TEST.upper_bound, 3)),
)
# Mean and median of the per-split best feature counts — the quantities
# this commit adds to the report (see commit message).
mean_features = np.mean(best_feat_steps)
median_features = np.median(best_feat_steps)
# NOTE(review): duplicated old/new diff lines are interleaved below.
if TARGET!='subtypes':
if TARGET != 'subtypes':
all_sens = np.array(all_sens)
all_spec = np.array(all_spec)
# Bootstrap CIs of mean sensitivity/specificity across splits.
SENS_CI = bs.bootstrap(all_sens, stat_func=bs_stats.mean)
SPEC_CI = bs.bootstrap(all_spec, stat_func=bs_stats.mean)
print('SENS =', round(np.mean(all_sens),3), (round(SENS_CI.lower_bound,3), round(SENS_CI.upper_bound,3)))
print('SPEC =', round(np.mean(all_spec),3), (round(SPEC_CI.lower_bound,3), round(SPEC_CI.upper_bound,3)))
# Pre-commit form of the result row (no mean/median feature counts).
row = OrderedDict({'layers':layers_concat, 'mcc_train':round(np.mean(all_mccs),3), 'mcc_train_min':round(MCC_CI.lower_bound,3), 'mcc_train_max':round(MCC_CI.upper_bound,3),
'auc_train':round(np.mean(all_aucs),3), 'auc_train_min':round(AUC_CI.lower_bound,3), 'auc_train_max':round(AUC_CI.upper_bound,3),
'sens_train':round(np.mean(all_sens),3), 'sens_train_min':round(SENS_CI.lower_bound,3), 'sens_train_max':round(SENS_CI.upper_bound,3),
'spec_train':round(np.mean(all_spec),3), 'spec_train_min':round(SPEC_CI.lower_bound,3), 'spec_train_max':round(SPEC_CI.upper_bound,3),
'mcc_test':round(np.mean(all_test_mccs),3), 'mcc_test_min':round(MCC_TEST.lower_bound,3), 'mcc_test_max':round(MCC_TEST.upper_bound,3),
'best_feat':best_feat_steps})
print(
'SENS =',
round(np.mean(all_sens), 3),
(round(SENS_CI.lower_bound, 3), round(SENS_CI.upper_bound, 3)),
)
print(
'SPEC =',
round(np.mean(all_spec), 3),
(round(SPEC_CI.lower_bound, 3), round(SPEC_CI.upper_bound, 3)),
)
# Post-commit result row for binary endpoints: all metrics plus the new
# 'mean_features'/'median_features' summary columns.
row = OrderedDict(
{
'layers': layers_concat,
'mcc_train': round(np.mean(all_mccs), 3),
'mcc_train_min': round(MCC_CI.lower_bound, 3),
'mcc_train_max': round(MCC_CI.upper_bound, 3),
'auc_train': round(np.mean(all_aucs), 3),
'auc_train_min': round(AUC_CI.lower_bound, 3),
'auc_train_max': round(AUC_CI.upper_bound, 3),
'sens_train': round(np.mean(all_sens), 3),
'sens_train_min': round(SENS_CI.lower_bound, 3),
'sens_train_max': round(SENS_CI.upper_bound, 3),
'spec_train': round(np.mean(all_spec), 3),
'spec_train_min': round(SPEC_CI.lower_bound, 3),
'spec_train_max': round(SPEC_CI.upper_bound, 3),
'mcc_test': round(np.mean(all_test_mccs), 3),
'mcc_test_min': round(MCC_TEST.lower_bound, 3),
'mcc_test_max': round(MCC_TEST.upper_bound, 3),
'best_feat': best_feat_steps,
'mean_features': mean_features,
'median_features': median_features,
}
)
else:
# Pre-commit form of the 'subtypes' row (diff residue; superseded below).
row = OrderedDict({'layers':layers_concat, 'mcc_train':round(np.mean(all_mccs),3), 'mcc_train_min':round(MCC_CI.lower_bound,3), 'mcc_train_max':round(MCC_CI.upper_bound,3),
'auc_train':round(np.mean(all_aucs),3), 'auc_train_min':round(AUC_CI.lower_bound,3), 'auc_train_max':round(AUC_CI.upper_bound,3),
'sens_train':np.nan, 'sens_train_min':np.nan, 'sens_train_max':np.nan,
'spec_train':np.nan, 'spec_train_min':np.nan, 'spec_train_max':np.nan,
'mcc_test':round(np.mean(all_test_mccs),3), 'mcc_test_min':round(MCC_TEST.lower_bound,3), 'mcc_test_max':round(MCC_TEST.upper_bound,3),
'best_feat':best_feat_steps})
# Post-commit row for the 'subtypes' endpoint: sens/spec are NaN because
# they are not collected for this target (see the TARGET check above).
row = OrderedDict(
{
'layers': layers_concat,
'mcc_train': round(np.mean(all_mccs), 3),
'mcc_train_min': round(MCC_CI.lower_bound, 3),
'mcc_train_max': round(MCC_CI.upper_bound, 3),
'auc_train': round(np.mean(all_aucs), 3),
'auc_train_min': round(AUC_CI.lower_bound, 3),
'auc_train_max': round(AUC_CI.upper_bound, 3),
'sens_train': np.nan,
'sens_train_min': np.nan,
'sens_train_max': np.nan,
'spec_train': np.nan,
'spec_train_min': np.nan,
'spec_train_max': np.nan,
'mcc_test': round(np.mean(all_test_mccs), 3),
'mcc_test_min': round(MCC_TEST.lower_bound, 3),
'mcc_test_max': round(MCC_TEST.upper_bound, 3),
'best_feat': best_feat_steps,
'mean_features': mean_features,
'median_features': median_features,
}
)
# Console summary: distribution of selected feature counts plus the new
# mean/median report added by this commit.
print(layers_concat, MODE, 'best_feats =', Counter(best_feat_steps))
print("Mean features: ", mean_features)
print("Median features: ", median_features)
print('\n')
# Accumulate the row; NOTE(review): DataFrame.append is deprecated in
# modern pandas (use pd.concat) — fine for the pandas version in use here.
df_results = df_results.append(row, ignore_index=True)
# Write the aggregated metrics for all splits (old one-line form kept
# above the post-commit multi-line form by the diff rendering).
df_results.to_csv(f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_allSplits_{MODE}.txt', sep='\t', index=False)
df_results.to_csv(
f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_allSplits_{MODE}.txt',
sep='\t',
index=False,
)
# %%
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.