Commit 8c7a7978 authored by Marco Chierici's avatar Marco Chierici
Browse files

Add precision, recall on test set; Black formatting

parent d4dc5aa8
"""Extract train and test classification metrics across CV splits.

Reconstructed post-commit version of the script captured in this diff
(commit "Add precision, recall on test set; Black formatting").  For each
combination of omic layers and each train/test split it reads the feature-
selection log, the per-step metrics table, and the held-out test scores,
bootstraps confidence intervals for the mean of each metric, and appends one
summary row per layer combination to a tab-separated results file.
"""
# NOTE(review): the import block falls between diff hunks and is not visible;
# reconstructed from usage (argparse, os, Counter/OrderedDict, combinations,
# bootstrapped as bs/bs_stats, numpy, pandas) — confirm against the repo.
import argparse
import os
from collections import Counter, OrderedDict
from itertools import combinations

import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import numpy as np
import pandas as pd

# %%
parser = argparse.ArgumentParser(description="Extract train and test metrics")
parser.add_argument("--outfolder", type=str, help="Results folder")
parser.add_argument("--dataset", type=str, help="Dataset name")
parser.add_argument("--target", type=str, help="Clinical endpoint")
parser.add_argument(
    "--model", type=str, help="classifiers implemented, randomForest or LSVM"
)
parser.add_argument("--layers", type=str, nargs="+", help="List of omic layers")
parser.add_argument(
    "--n_splits_end", type=int, help="Upper end of splits range - not inclusive"
)
parser.add_argument("--n_splits_start", type=int, help="Lower end of splits range")
parser.add_argument("--mode", type=str, help="juXT, rSNF, rSNFi or single")
args = parser.parse_args()

# NOTE(review): these assignments fall between diff hunks; reconstructed from
# the ALL-CAPS names used below — verify against the original file.
OUTFOLDER = args.outfolder
DATASET = args.dataset
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
MODE = args.mode
assert MODE in ["juxt", "rSNF", "rSNFi", "single"]

# %%
PATH = f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}"
N_LAYERS = len(LAYERS)

# %%
# Summary table: train metrics with bootstrap CI bounds, plus the test-set
# MCC/sensitivity/PPV columns added by this commit.
df_results = pd.DataFrame(
    columns=[
        "layers",
        "mcc_train",
        "mcc_train_min",
        "mcc_train_max",
        "auc_train",
        "auc_train_min",
        "auc_train_max",
        "sens_train",
        "sens_train_min",
        "sens_train_max",
        "spec_train",
        "spec_train_min",
        "spec_train_max",
        "ppv_train",
        "ppv_train_min",
        "ppv_train_max",
        "mcc_test",
        "mcc_test_min",
        "mcc_test_max",
        "sens_test",
        "sens_test_min",
        "sens_test_max",
        "ppv_test",
        "ppv_test_min",
        "ppv_test_max",
        "best_feat",
    ]
)

if MODE == "single":
    # Single-layer mode: only size-1 "combinations" (each layer alone).
    range_combinations = range(1, 2)
else:
    range_combinations = range(2, N_LAYERS + 1)

for k in range_combinations:
    for comb in combinations(LAYERS, k):
        layers_concat = "_".join(comb)
        # Per-split accumulators for train (CV) and test metrics.
        all_mccs = []
        all_sens = []
        all_spec = []
        all_ppv = []
        all_aucs = []
        all_test_mccs = []
        all_test_senss = []
        all_test_ppvs = []
        best_feat_steps = []
        for split_id in range(N_SPLITS_START, N_SPLITS_END):
            PATH = f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/{split_id}"

            # File layout differs per integration mode: rSNF ranks on train,
            # rSNFi selects on test, juXT/single use KBest on train.
            if MODE == "rSNF":
                file_log = os.path.join(
                    PATH, f"{MODE}/{layers_concat}_tr_{MODEL}_rankList.log"
                )
                file_metrics = os.path.join(
                    PATH, f"{MODE}/{layers_concat}_tr_{MODEL}_rankList_allmetrics.txt"
                )
                file_MCC_test = os.path.join(
                    PATH, MODE, f"{layers_concat}_tr_MCC_scores.txt"
                )
            elif MODE == "rSNFi":
                file_log = os.path.join(
                    PATH, f"{MODE}/{layers_concat}_ts_{MODEL}_KBest.log"
                )
                file_metrics = os.path.join(
                    PATH, f"{MODE}/{layers_concat}_ts_{MODEL}_KBest_allmetrics.txt"
                )
                file_MCC_test = os.path.join(
                    PATH, MODE, f"{layers_concat}_ts_MCC_scores.txt"
                )
            else:
                file_log = os.path.join(
                    PATH, f"{MODE}/{layers_concat}_tr_{MODEL}_KBest.log"
                )
                file_metrics = os.path.join(
                    PATH, f"{MODE}/{layers_concat}_tr_{MODEL}_KBest_allmetrics.txt"
                )
                file_MCC_test = os.path.join(
                    PATH, MODE, f"{layers_concat}_tr_MCC_scores.txt"
                )

            # NOTE(review): the read of file_log falls between diff hunks;
            # reconstructed as a plain readlines() — confirm.
            with open(file_log) as f:
                log_content = f.readlines()

            # Scan the log for the best-model lines; stop at "n_feats".
            # (mcc_test_line is captured but unused — MCC comes from
            # file_MCC_test below; kept for fidelity to the original.)
            for line in log_content:
                if "mcc" in line:
                    mcc_test_line = line
                if "n_feats" in line:
                    best_feat_line = line
                    break

            # Line looks like "n_feats = <int>\n"; [:-1] drops the newline.
            best_feat = int(best_feat_line.split(" = ")[1][:-1])
            best_feat_steps.append(best_feat)

            # Test-set scores file: tab-separated "name\tvalue" rows at fixed
            # positions — 1: MCC, 3: sensitivity, 5: PPV (new in this commit).
            with open(file_MCC_test, "r") as f:
                metrics_raw = f.readlines()
            mcc_test = float(metrics_raw[1].split("\t")[1])
            sens_test = float(metrics_raw[3].split("\t")[1])
            ppv_test = float(metrics_raw[5].split("\t")[1])
            all_test_mccs.append(mcc_test)
            all_test_senss.append(sens_test)
            all_test_ppvs.append(ppv_test)

            # Train metrics table: keep only the rows at the selected n_feats.
            all_metrics = pd.read_csv(file_metrics, sep="\t")
            best_idxs = np.where(all_metrics["nf"] == best_feat)[0]

            MCC = np.where(all_metrics.columns == "mcc")[0][0]
            best_mccs = all_metrics.iloc[best_idxs, MCC]
            all_mccs.extend(best_mccs)

            AUC = np.where(all_metrics.columns == "auc")[0][0]
            best_auc = all_metrics.iloc[best_idxs, AUC]
            all_aucs.extend(best_auc)

            # Binary-only metrics are absent for the multiclass "subtypes"
            # endpoint.
            if TARGET != "subtypes":
                SENS = np.where(all_metrics.columns == "sens")[0][0]
                best_sens = all_metrics.iloc[best_idxs, SENS]
                all_sens.extend(best_sens)

                SPEC = np.where(all_metrics.columns == "spec")[0][0]
                best_spec = all_metrics.iloc[best_idxs, SPEC]
                all_spec.extend(best_spec)

                PPV = np.where(all_metrics.columns == "ppv")[0][0]
                best_ppv = all_metrics.iloc[best_idxs, PPV]
                all_ppv.extend(best_ppv)

        # Bootstrap 95% CI of the mean for each accumulated metric.
        all_mccs = np.array(all_mccs)
        MCC_CI = bs.bootstrap(all_mccs, stat_func=bs_stats.mean)
        print(
            "MCC train =",
            round(np.mean(all_mccs), 3),
            (round(MCC_CI.lower_bound, 3), round(MCC_CI.upper_bound, 3)),
        )

        all_aucs = np.array(all_aucs)
        AUC_CI = bs.bootstrap(all_aucs, stat_func=bs_stats.mean)
        print(
            "AUC train =",
            round(np.mean(all_aucs), 3),
            (round(AUC_CI.lower_bound, 3), round(AUC_CI.upper_bound, 3)),
        )

        all_test_mccs = np.array(all_test_mccs)
        MCC_TEST = bs.bootstrap(all_test_mccs, stat_func=bs_stats.mean)
        print(
            "MCC test =",
            round(np.mean(all_test_mccs), 3),
            (round(MCC_TEST.lower_bound, 3), round(MCC_TEST.upper_bound, 3)),
        )

        all_test_senss = np.array(all_test_senss)
        SENS_TEST = bs.bootstrap(all_test_senss, stat_func=bs_stats.mean)
        print(
            "SENS test =",
            round(np.mean(all_test_senss), 3),
            (round(SENS_TEST.lower_bound, 3), round(SENS_TEST.upper_bound, 3)),
        )

        all_test_ppvs = np.array(all_test_ppvs)
        PPV_TEST = bs.bootstrap(all_test_ppvs, stat_func=bs_stats.mean)
        print(
            "PPV test =",
            round(np.mean(all_test_ppvs), 3),
            (round(PPV_TEST.lower_bound, 3), round(PPV_TEST.upper_bound, 3)),
        )

        mean_features = np.mean(best_feat_steps)
        median_features = np.median(best_feat_steps)

        if TARGET != "subtypes":
            all_sens = np.array(all_sens)
            all_spec = np.array(all_spec)
            all_ppv = np.array(all_ppv)
            # NOTE(review): the SENS_CI line falls between diff hunks;
            # reconstructed to match SPEC_CI/PPV_CI — confirm.
            SENS_CI = bs.bootstrap(all_sens, stat_func=bs_stats.mean)
            SPEC_CI = bs.bootstrap(all_spec, stat_func=bs_stats.mean)
            PPV_CI = bs.bootstrap(all_ppv, stat_func=bs_stats.mean)
            print(
                "SENS =",
                round(np.mean(all_sens), 3),
                (round(SENS_CI.lower_bound, 3), round(SENS_CI.upper_bound, 3)),
            )
            print(
                "SPEC =",
                round(np.mean(all_spec), 3),
                (round(SPEC_CI.lower_bound, 3), round(SPEC_CI.upper_bound, 3)),
            )
            print(
                "PPV =",
                round(np.mean(all_ppv), 3),
                (round(PPV_CI.lower_bound, 3), round(PPV_CI.upper_bound, 3)),
            )

            row = OrderedDict(
                {
                    "layers": layers_concat,
                    "mcc_train": round(np.mean(all_mccs), 3),
                    "mcc_train_min": round(MCC_CI.lower_bound, 3),
                    "mcc_train_max": round(MCC_CI.upper_bound, 3),
                    "auc_train": round(np.mean(all_aucs), 3),
                    "auc_train_min": round(AUC_CI.lower_bound, 3),
                    "auc_train_max": round(AUC_CI.upper_bound, 3),
                    "sens_train": round(np.mean(all_sens), 3),
                    "sens_train_min": round(SENS_CI.lower_bound, 3),
                    "sens_train_max": round(SENS_CI.upper_bound, 3),
                    "spec_train": round(np.mean(all_spec), 3),
                    "spec_train_min": round(SPEC_CI.lower_bound, 3),
                    "spec_train_max": round(SPEC_CI.upper_bound, 3),
                    "ppv_train": round(np.mean(all_ppv), 3),
                    "ppv_train_min": round(PPV_CI.lower_bound, 3),
                    "ppv_train_max": round(PPV_CI.upper_bound, 3),
                    "mcc_test": round(np.mean(all_test_mccs), 3),
                    "mcc_test_min": round(MCC_TEST.lower_bound, 3),
                    "mcc_test_max": round(MCC_TEST.upper_bound, 3),
                    "sens_test": round(np.mean(all_test_senss), 3),
                    "sens_test_min": round(SENS_TEST.lower_bound, 3),
                    "sens_test_max": round(SENS_TEST.upper_bound, 3),
                    "ppv_test": round(np.mean(all_test_ppvs), 3),
                    "ppv_test_min": round(PPV_TEST.lower_bound, 3),
                    "ppv_test_max": round(PPV_TEST.upper_bound, 3),
                    "best_feat": best_feat_steps,
                    "mean_features": mean_features,
                    "median_features": median_features,
                }
            )
        else:
            # Multiclass endpoint: binary-only metrics recorded as NaN.
            row = OrderedDict(
                {
                    "layers": layers_concat,
                    "mcc_train": round(np.mean(all_mccs), 3),
                    "mcc_train_min": round(MCC_CI.lower_bound, 3),
                    "mcc_train_max": round(MCC_CI.upper_bound, 3),
                    "auc_train": round(np.mean(all_aucs), 3),
                    "auc_train_min": round(AUC_CI.lower_bound, 3),
                    "auc_train_max": round(AUC_CI.upper_bound, 3),
                    "sens_train": np.nan,
                    "sens_train_min": np.nan,
                    "sens_train_max": np.nan,
                    "spec_train": np.nan,
                    "spec_train_min": np.nan,
                    "spec_train_max": np.nan,
                    "ppv_train": np.nan,
                    "ppv_train_min": np.nan,
                    "ppv_train_max": np.nan,
                    "mcc_test": round(np.mean(all_test_mccs), 3),
                    "mcc_test_min": round(MCC_TEST.lower_bound, 3),
                    "mcc_test_max": round(MCC_TEST.upper_bound, 3),
                    "sens_test": np.nan,
                    "sens_test_min": np.nan,
                    "sens_test_max": np.nan,
                    "ppv_test": np.nan,
                    "ppv_test_min": np.nan,
                    "ppv_test_max": np.nan,
                    "best_feat": best_feat_steps,
                    "mean_features": mean_features,
                    "median_features": median_features,
                }
            )

        print(layers_concat, MODE, "best_feats =", Counter(best_feat_steps))
        print("Mean features: ", mean_features)
        print("Median features: ", median_features)
        print("\n")
        # NOTE(review): DataFrame.append is deprecated/removed in pandas >= 2.0;
        # kept for fidelity — migrate to pd.concat when upgrading pandas.
        df_results = df_results.append(row, ignore_index=True)

df_results.to_csv(
    f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}.txt",
    sep="\t",
    index=False,
)
# %%
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.