Commit 72fea010 authored by Marco Chierici's avatar Marco Chierici
Browse files

Updated: metrics file format, confidence intervals calculations, etc.

parent adf3a331
......@@ -5,10 +5,11 @@
import numpy as np
import pandas as pd
import csv
import os.path
from scaling import norm_l2
from mlpy import bootstrap_ci, borda_count, canberra_stability
from mlpy import borda_count, canberra_stability
from input_output import load_data
import performance as perf
import sys
......@@ -17,9 +18,11 @@ import glob
import argparse
import configparser as ConfigParser
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
__author__ = 'Marco Chierici, Alessandro Zandona'
__version__ = '2.0'
......@@ -113,26 +116,15 @@ for percentage in feature_ranges:
# prepare output files
metricsf = open(OUTFILE + "_metrics.txt", 'w')
metrics_w = csv.writer(metricsf, delimiter='\t', lineterminator='\n')
rankingf = open(OUTFILE + "_featurelist.txt", 'w')
ranking_w = csv.writer(rankingf, delimiter='\t', lineterminator='\n')
#rankingf = open(OUTFILE + "_featurelist.txt", 'w')
#ranking_w = csv.writer(rankingf, delimiter='\t', lineterminator='\n')
stabilityf = open(OUTFILE + "_stability.txt", 'w')
stability_w = csv.writer(stabilityf, delimiter='\t', lineterminator='\n')
# prepare output arrays
RANKING = np.empty((CV_K*CV_N, x.shape[1]),
NPV = np.empty((CV_K*CV_N, len(FSTEPS)))
PPV = np.empty_like(NPV)
SENS = np.empty_like(NPV)
SPEC = np.empty_like(NPV)
MCC = np.empty_like(NPV)
AUC = np.empty_like(NPV)
DOR = np.empty_like(NPV)
ACC = np.empty_like(NPV)
......@@ -146,6 +138,7 @@ else:
for i in range(CV_N):
cvmetrics_df = pd.DataFrame(columns=['Iteration', 'Fold', 'nf', 'mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv'])
for n in range(CV_N):
skf = StratifiedKFold(CV_K, shuffle=True, random_state=n)
......@@ -182,78 +175,49 @@ for n in range(CV_N):
RANKING[(n * CV_K) + i] = ranking_tmp
#forest = RandomForestClassifier(n_estimators=500, criterion='gini')
for j, s in enumerate(FSTEPS):
#print j
v = RANKING[(n * CV_K) + i][:s]
x_tr_fs, x_ts_fs = x_tr[:, v], x_ts[:, v], y_tr)
p = forest.predict(x_ts_fs)
NPV[(n * CV_K) + i, j] = perf.npv(y_ts, p)
PPV[(n * CV_K) + i, j] = perf.ppv(y_ts, p)
SENS[(n * CV_K) + i, j] = perf.sensitivity(y_ts, p)
SPEC[(n * CV_K) + i, j] = perf.specificity(y_ts, p)
MCC[(n * CV_K) + i, j] = perf.KCCC_discrete(y_ts, p)
AUC[(n * CV_K) + i, j] = roc_auc_score(y_ts, p)
DOR[(n * CV_K) + i, j] = perf.dor(y_ts, p)
ACC[(n * CV_K) + i, j] = perf.accuracy(y_ts, p)
# write metrics for all CV iterations
np.savetxt(OUTFILE + "_allmetrics_MCC.txt", MCC, fmt='%.4f', delimiter='\t')
np.savetxt(OUTFILE + "_allmetrics_SENS.txt", SENS, fmt='%.4f', delimiter='\t')
np.savetxt(OUTFILE + "_allmetrics_SPEC.txt", SPEC, fmt='%.4f', delimiter='\t')
np.savetxt(OUTFILE + "_allmetrics_PPV.txt", PPV, fmt='%.4f', delimiter='\t')
np.savetxt(OUTFILE + "_allmetrics_NPV.txt", NPV, fmt='%.4f', delimiter='\t')
np.savetxt(OUTFILE + "_allmetrics_AUC.txt", AUC, fmt='%.4f', delimiter='\t')
np.savetxt(OUTFILE + "_allmetrics_ACC.txt", ACC, fmt='%.4f', delimiter='\t')
np.savetxt(OUTFILE + "_allmetrics_DOR.txt", DOR, fmt='%.4f', delimiter='\t')
tar = + "_allmetrics.tar.gz", "w:gz")
for metricFile in glob.glob(OUTFILE + "_allmetrics_*txt"):
tar.add(metricFile, arcname=os.path.basename(metricFile))
yp = forest.predict(x_ts_fs)
cvmetrics_df = cvmetrics_df.append({'Iteration': n, 'Fold': i, 'nf': s,
'mcc': matthews_corrcoef(y_ts, yp),
'sens': perf.sensitivity(y_ts, yp),
'spec': perf.specificity(y_ts, yp),
'acc': accuracy_score(y_ts, yp),
'auc': roc_auc_score(y_ts, yp),
'npv': perf.npv(y_ts, yp),
'ppv': perf.ppv(y_ts, yp)}, ignore_index=True)
cvmetrics_df[['Iteration', 'Fold', 'nf']] = cvmetrics_df[['Iteration', 'Fold', 'nf']].astype(int)
cvmetrics_df.to_csv(f"{OUTFILE}_allmetrics.txt", sep='\t', index=False, float_format="%.3f")
# write all rankings
np.savetxt(OUTFILE + "_ranking.csv.gz", RANKING, fmt='%d', delimiter='\t')
# average values
AMCC = np.mean(MCC, axis=0)
ASENS = np.mean(SENS, axis=0)
ASPEC = np.mean(SPEC, axis=0)
APPV = np.mean(PPV, axis=0)
ANPV = np.mean(NPV, axis=0)
AAUC = np.mean(AUC, axis=0)
AACC = np.mean(ACC, axis=0)
ADOR = np.mean(DOR, axis=0)
# approximated Odds Ratio, computed from ASENS and ASPEC (to avoid inf and nan values)
avg_df = cvmetrics_df.groupby(['nf']).mean()
# confidence intervals
NPVCI = []
for i in range(NPV.shape[1]):
NPVCI.append(bootstrap_ci(NPV[:, i]))
PPVCI = []
for i in range(PPV.shape[1]):
PPVCI.append(bootstrap_ci(PPV[:, i]))
for i in range(SENS.shape[1]):
SENSCI.append(bootstrap_ci(SENS[:, i]))
for i in range(SPEC.shape[1]):
SPECCI.append(bootstrap_ci(SPEC[:, i]))
MCCCI = []
for i in range(MCC.shape[1]):
MCCCI.append(bootstrap_ci(MCC[:, i]))
AUCCI = []
for i in range(AUC.shape[1]):
AUCCI.append(bootstrap_ci(AUC[:, i]))
DORCI = []
for i in range(DOR.shape[1]):
DORCI.append(bootstrap_ci(DOR[:, i]))
ACCCI = []
for i in range(ACC.shape[1]):
ACCCI.append(bootstrap_ci(ACC[:, i]))
AMCC, ASENS, ASPEC, AACC, AAUC, ANPV, APPV = [avg_df[metric].values for metric in ['mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv']]
MCC_CI, SENS_CI, SPEC_CI, ACC_CI, AUC_CI, NPV_CI, PPV_CI = ([] for i in range(7))
for nf in FSTEPS:
res = bs.bootstrap(cvmetrics_df['mcc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
MCC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['sens'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
SENS_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['spec'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
SPEC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['acc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
ACC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['auc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
AUC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['npv'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
NPV_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['ppv'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
PPV_CI.append([res.lower_bound, res.upper_bound])
# Borda list
BORDA_ID, _, BORDA_POS = borda_count(RANKING)
......@@ -266,48 +230,29 @@ PR = np.argsort( RANKING )
for ss in FSTEPS:
STABILITY.append( canberra_stability(PR, ss) )
metrics_w.writerow(["FS_WITH_BEST_MCC", opt_feats])
metrics_array = np.empty((len(FSTEPS), 22))
for j,s in enumerate(FSTEPS):
metrics_array[j] = np.array([s, AMCC[j], MCC_CI[j][0], MCC_CI[j][1], ASENS[j], SENS_CI[j][0], SENS_CI[j][1],
ASPEC[j], SPEC_CI[j][0], SPEC_CI[j][1], AACC[j], ACC_CI[j][0], ACC_CI[j][1],
AAUC[j], AUC_CI[j][0], AUC_CI[j][1], ANPV[j], NPV_CI[j][0], NPV_CI[j][1],
APPV[j], PPV_CI[j][0], PPV_CI[j][1]])
metrics_df = pd.DataFrame(metrics_array, columns=["nf", "mcc", "mcc_min", "mcc_max", "sens", "sens_min", "sens_max", "spec", "spec_min", "spec_max", "acc", "acc_min", "acc_max", "auc", "auc_min", "auc_max", "npv", "npv_min", "npv_max", "ppv", "ppv_min", "ppv_max"])
metrics_df['nf'] = metrics_df['nf'].astype(int)
metrics_df.to_csv(f"{OUTFILE}_metrics.txt", sep='\t', index=False, float_format="%.3f")
stability_w.writerow(["STEP", "STABILITY"])
for j, s in enumerate(FSTEPS):
AMCC[j], MCCCI[j][0], MCCCI[j][1],
ASENS[j], SENSCI[j][0], SENSCI[j][1],
ASPEC[j], SPECCI[j][0], SPECCI[j][1],
APPV[j], PPVCI[j][0], PPVCI[j][1],
ANPV[j], NPVCI[j][0], NPVCI[j][1],
AAUC[j], AUCCI[j][0], AUCCI[j][1],
AACC[j], ACCCI[j][0], ACCCI[j][1],
ADOR[j], DORCI[j][0], DORCI[j][1],
stability_w.writerow( [s, STABILITY[j]] )
borda_array = np.empty((len(BORDA_ID), 3))
borda_df = pd.DataFrame(columns=["FEATURE_ID", "FEATURE_NAME", "MEAN_POS"])
for i, pos in zip(BORDA_ID, BORDA_POS):
classes = np.unique(y)
med_all = np.median(x[:, i])
med_c = np.zeros(np.shape(classes)[0])
for jj,c in enumerate(classes):
med_c[jj] = np.median(x[y==c, i])
with np.errstate(divide='ignore'):
fc = med_c[1] / med_c[0]
log2fc = np.log2(fc)
ranking_w.writerow([ i, var_names[i], pos+1, med_all, med_c[0], med_c[1], fc, log2fc ])
borda_df = borda_df.append({'FEATURE_ID': i, 'FEATURE_NAME': var_names[i], 'MEAN_POS': pos+1}, ignore_index=True)
borda_df.to_csv(f"{OUTFILE}_featurelist.txt", sep='\t', index=False, float_format="%.3f")
logf = open(OUTFILE + ".log", 'w')
config = ConfigParser.RawConfigParser()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment