Commit 14d2515f authored by Marco Chierici
Browse files

Added multiclass support (not tested yet :))

parent 0dda140d
......@@ -97,7 +97,7 @@ y_orig = np.loadtxt(LABELSFILE, dtype=np.int)
# Encode the raw labels as integer codes and note whether there are more than
# two distinct classes (this switches the multiclass code paths on).
le = preprocessing.LabelEncoder()
le.fit(y_orig)
y = le.transform(y_orig)
is_multiclass = le.classes_.shape[0] > 2
# If ranked list is given as input to DAP, read it and extract features index
if RANK_METHOD == "rankList":
rankedList = np.loadtxt(RANKFEATS, delimiter='\t', dtype=str, skiprows=1)
......@@ -138,7 +138,11 @@ else:
for i in range(CV_N):
ys.append(y)
# Per-fold metrics table. Multiclass runs only record MCC and AUC, so they get
# a narrower table plus a label binarizer for the one-vs-rest AUC computation.
if is_multiclass:
    cvmetrics_df = pd.DataFrame(columns=['Iteration', 'Fold', 'nf', 'mcc', 'auc'])
    lb = preprocessing.LabelBinarizer()
else:
    cvmetrics_df = pd.DataFrame(columns=['Iteration', 'Fold', 'nf', 'mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv'])
for n in range(CV_N):
skf = StratifiedKFold(CV_K, shuffle=True, random_state=n)
......@@ -175,20 +179,35 @@ for n in range(CV_N):
RANKING[(n * CV_K) + i] = ranking_tmp
# NOTE(review): this fragment reads `n`, `i`, `x_tr`, `y_tr`, `x_ts`, `y_ts`, so it
# presumably lives inside the CV iteration / fold loops — re-indent to match.
if is_multiclass:
    # One-vs-rest forest trained on binarized labels; used only for the AUC.
    forest_mc = OneVsRestClassifier(RandomForestClassifier(n_estimators=500, criterion='gini', random_state=n, n_jobs=1))
    y_tr_bin = lb.fit_transform(y_tr)
    y_ts_bin = lb.transform(y_ts)

for j, s in enumerate(FSTEPS):
    # Restrict both splits to the first s features of this fold's ranking.
    v = RANKING[(n * CV_K) + i][:s]
    x_tr_fs, x_ts_fs = x_tr[:, v], x_ts[:, v]

    forest.fit(x_tr_fs, y_tr)
    yp = forest.predict(x_ts_fs)

    # Exactly one row is recorded per (iteration, fold, step).
    row = {'Iteration': n, 'Fold': i, 'nf': s,
           'mcc': matthews_corrcoef(y_ts, yp)}
    if is_multiclass:
        forest_mc.fit(x_tr_fs, y_tr_bin)
        # Predict with the one-vs-rest model (not `forest`) so the prediction
        # has the same indicator layout as y_ts_bin.
        yp_bin = forest_mc.predict(x_ts_fs)
        row['auc'] = roc_auc_score(y_ts_bin, yp_bin, average="micro")
    else:
        row.update({'sens': perf.sensitivity(y_ts, yp),
                    'spec': perf.specificity(y_ts, yp),
                    'acc': accuracy_score(y_ts, yp),
                    'auc': roc_auc_score(y_ts, yp),
                    'npv': perf.npv(y_ts, yp),
                    'ppv': perf.ppv(y_ts, yp)})
    # DataFrame.append was removed in pandas 2.0; concat works on old and new versions.
    cvmetrics_df = pd.concat([cvmetrics_df, pd.DataFrame([row])], ignore_index=True)
cvmetrics_df[['Iteration', 'Fold', 'nf']] = cvmetrics_df[['Iteration', 'Fold', 'nf']].astype(int)
......@@ -201,23 +220,28 @@ np.savetxt(OUTFILE + "_ranking.csv.gz", RANKING, fmt='%d', delimiter='\t')
avg_df = cvmetrics_df.groupby(['nf']).mean()
# confidence intervals
# Pull the per-step averages for the metrics recorded in this run and pair each
# metric name with the list that will collect its [lower, upper] bounds.
if is_multiclass:
    AMCC, AAUC = [avg_df[metric].values for metric in ['mcc', 'auc']]
    MCC_CI, AUC_CI = ([] for _ in range(2))
    ci_targets = [('mcc', MCC_CI), ('auc', AUC_CI)]
else:
    AMCC, ASENS, ASPEC, AACC, AAUC, ANPV, APPV = [avg_df[metric].values for metric in ['mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv']]
    MCC_CI, SENS_CI, SPEC_CI, ACC_CI, AUC_CI, NPV_CI, PPV_CI = ([] for _ in range(7))
    # mcc and auc come first so the bootstrap calls run in the same order as
    # the unrolled version of this loop did.
    ci_targets = [('mcc', MCC_CI), ('auc', AUC_CI),
                  ('sens', SENS_CI), ('spec', SPEC_CI), ('acc', ACC_CI),
                  ('npv', NPV_CI), ('ppv', PPV_CI)]

for nf in FSTEPS:
    # Rows of the CV table for this feature step (selected once per step).
    step_rows = cvmetrics_df[cvmetrics_df['nf'] == nf]
    for metric, ci_list in ci_targets:
        res = bs.bootstrap(step_rows[metric].values, stat_func=bs_stats.mean)
        ci_list.append([res.lower_bound, res.upper_bound])
# Borda list
BORDA_ID, _, BORDA_POS = borda_count(RANKING)
......@@ -232,11 +256,19 @@ for ss in FSTEPS:
# Summary table: one row per feature step, holding the mean and the CI bounds
# of every metric recorded for this run.
if is_multiclass:
    metrics_columns = ["nf", "mcc", "mcc_min", "mcc_max", "auc", "auc_min", "auc_max"]
else:
    metrics_columns = ["nf", "mcc", "mcc_min", "mcc_max", "sens", "sens_min", "sens_max", "spec", "spec_min", "spec_max", "acc", "acc_min", "acc_max", "auc", "auc_min", "auc_max", "npv", "npv_min", "npv_max", "ppv", "ppv_min", "ppv_max"]

# Size the array from the column list: a fixed width of 22 cannot hold the
# 7-value multiclass rows.
metrics_array = np.empty((len(FSTEPS), len(metrics_columns)))
for j, s in enumerate(FSTEPS):
    if is_multiclass:
        metrics_array[j] = np.array([s, AMCC[j], MCC_CI[j][0], MCC_CI[j][1],
                                     AAUC[j], AUC_CI[j][0], AUC_CI[j][1]])
    else:
        metrics_array[j] = np.array([s, AMCC[j], MCC_CI[j][0], MCC_CI[j][1], ASENS[j], SENS_CI[j][0], SENS_CI[j][1],
                                     ASPEC[j], SPEC_CI[j][0], SPEC_CI[j][1], AACC[j], ACC_CI[j][0], ACC_CI[j][1],
                                     AAUC[j], AUC_CI[j][0], AUC_CI[j][1], ANPV[j], NPV_CI[j][0], NPV_CI[j][1],
                                     APPV[j], PPV_CI[j][0], PPV_CI[j][1]])

metrics_df = pd.DataFrame(metrics_array, columns=metrics_columns)
metrics_df['nf'] = metrics_df['nf'].astype(int)
metrics_df.to_csv(f"{OUTFILE}_metrics.txt", sep='\t', index=False, float_format="%.3f")
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment