Commit 98add45e authored by Alessia Marcolini

Fix random labels results folder + black formatting

parent 910072b4
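
This commit makes two changes. First, when the script is run with --random (i.e. with permuted sample labels, a permutation test), the output directory now gets a '_randomLabels' suffix, so random-label runs no longer write into, and potentially overwrite, the folder holding the real results (see the OUTDIR hunk below). Second, the rest of the diff is mechanical reformatting with black, apparently run with string normalization skipped, since single-quoted strings survive: long calls are exploded into one argument per line with trailing commas.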
@@ -16,19 +16,28 @@ import pandas as pd
from mlpy import borda_count, canberra_stability
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, make_scorer, matthews_corrcoef,
roc_auc_score)
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
StratifiedShuffleSplit, train_test_split)
from sklearn.metrics import (
accuracy_score,
make_scorer,
matthews_corrcoef,
roc_auc_score,
)
from sklearn.model_selection import (
GridSearchCV,
StratifiedKFold,
StratifiedShuffleSplit,
train_test_split,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
import performance as perf
from input_output import load_data
__author__ = 'Marco Chierici'
__version__ = '2.6'
__date__ = '24 Jan 2020'
class myArgumentParser(argparse.ArgumentParser):
def __init__(self, *args, **kwargs):
@@ -42,20 +51,55 @@ class myArgumentParser(argparse.ArgumentParser):
break
yield arg
parser = myArgumentParser(description='Run a training experiment (10x5-CV fold) using Random Forest as classifier.',
fromfile_prefix_chars='@')
parser = myArgumentParser(
description='Run a training experiment (10x5-CV fold) using Random Forest as classifier.',
fromfile_prefix_chars='@',
)
parser.add_argument('DATAFILE', type=str, help='Training datafile')
parser.add_argument('LABELSFILE', type=str, help='Sample labels')
parser.add_argument('OUTDIR', type=str, help='Output directory')
parser.add_argument('--model', dest='MODEL_TYPE', type=str, choices=['randomForest', 'LSVM'], default='randomForest', help='Classifier (default: %(default)s)')
parser.add_argument('--scaling', dest='SCALING', type=str, choices=['std', 'minmax'], default='std', help='Scaling method (default: %(default)s)')
parser.add_argument('--ranking', dest='RANK_METHOD', type=str, choices=['randomForest', 'LSVM', 'KBest', 'random', 'rankList'], default='KBest', help='Feature ranking method: Random Forest, LSVM weights, Anova F-score, random ranking, external ranked features list (default: %(default)s)')
parser.add_argument('--random', action='store_true', help='Run with random sample labels')
parser.add_argument('--cv_k', type=np.int, default=5, help='Number of CV folds (default: %(default)s)')
parser.add_argument('--cv_n', type=np.int, default=10, help='Number of CV cycles (default: %(default)s)')
parser.add_argument('--rankFeats', type=str, default='', help='Ranked features list to be used by Machine Learning [Feats name on 1st column, feats weights on 2nd column, with HEADER]')
if len(sys.argv)==1:
parser.add_argument(
'--model',
dest='MODEL_TYPE',
type=str,
choices=['randomForest', 'LSVM'],
default='randomForest',
help='Classifier (default: %(default)s)',
)
parser.add_argument(
'--scaling',
dest='SCALING',
type=str,
choices=['std', 'minmax'],
default='std',
help='Scaling method (default: %(default)s)',
)
parser.add_argument(
'--ranking',
dest='RANK_METHOD',
type=str,
choices=['randomForest', 'LSVM', 'KBest', 'random', 'rankList'],
default='KBest',
help='Feature ranking method: Random Forest, LSVM weights, Anova F-score, random ranking, external ranked features list (default: %(default)s)',
)
parser.add_argument(
'--random', action='store_true', help='Run with random sample labels'
)
parser.add_argument(
'--cv_k', type=np.int, default=5, help='Number of CV folds (default: %(default)s)'
)
parser.add_argument(
'--cv_n', type=np.int, default=10, help='Number of CV cycles (default: %(default)s)'
)
parser.add_argument(
'--rankFeats',
type=str,
default='',
help='Ranked features list to be used by Machine Learning [Feats name on 1st column, feats weights on 2nd column, with HEADER]',
)
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
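
Because the parser is constructed with fromfile_prefix_chars='@' (and the myArgumentParser class above apparently overrides convert_arg_line_to_args to yield one token per line), options can be kept in a plain text file and passed with an @ prefix. A hypothetical invocation (script and file names are illustrative):

    python sl_training.py data.txt labels.txt results/ --model LSVM --cv_n 10
    python sl_training.py @training_args.txt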
@@ -72,7 +116,15 @@ CV_N = args.cv_n
RANKFEATS = args.rankFeats
BASEFILE = os.path.splitext(os.path.basename(DATAFILE))[0]
OUTFILE = os.path.join(OUTDIR, '_'.join([BASEFILE, MODEL_TYPE, 'RF' if RANK_METHOD=='randomForest' else RANK_METHOD]))
if random_labels:
OUTDIR = OUTDIR + '_randomLabels'
OUTFILE = os.path.join(
OUTDIR,
'_'.join(
[BASEFILE, MODEL_TYPE, 'RF' if RANK_METHOD == 'randomForest' else RANK_METHOD]
),
)
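
The actual fix is the new two-line block: the '_randomLabels' suffix is appended to OUTDIR before OUTFILE is joined, so both the directory and every file prefix inside it pick it up. A minimal sketch of the resulting paths (names are illustrative):

    OUTDIR = 'results'
    if random_labels:
        OUTDIR = OUTDIR + '_randomLabels'
    # OUTFILE then becomes e.g. 'results_randomLabels/mydata_randomForest_RF'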
# create OUTDIR if not present
try:
@@ -90,7 +142,7 @@ TUN_CV_K = 10
# fraction of the dataset to keep apart as test split (for SVM tuning)
TUN_CV_P = 0.5
# list of parameters for LSVM tuning
TUN_PARAMS_LSVM = [{'svm__C': [10**int(k) for k in np.arange(-2, 3)]}]
TUN_PARAMS_LSVM = [{'svm__C': [10 ** int(k) for k in np.arange(-2, 3)]}]
sample_names, var_names, x = load_data(DATAFILE)
y_orig = pd.read_csv(LABELSFILE, header=None).values
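
The C grid above is five decades, [0.01, 0.1, 1, 10, 100]. Assuming only that NumPy is available, the same grid can be written more directly:

    TUN_PARAMS_LSVM = [{'svm__C': list(np.logspace(-2, 2, 5))}]  # 0.01 ... 100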
@@ -101,18 +153,18 @@ is_multiclass = len(le.classes_) > 2
# If ranked list is given as input to DAP, read it and extract features index
if RANK_METHOD == "rankList":
rankedList = np.loadtxt(RANKFEATS, delimiter='\t', dtype=str, skiprows=1)
ranked_feats = rankedList[:,0]
ranked_feats = rankedList[:, 0]
# Find index of features inside dataset
ranked_feats_idx = []
for el in ranked_feats.tolist():
ranked_feats_idx.append(np.where([el==vn for vn in var_names])[0][0])
ranked_feats_idx.append(np.where([el == vn for vn in var_names])[0][0])
scorer = make_scorer(matthews_corrcoef)
if SCALING == 'std':
scaler = preprocessing.StandardScaler()
elif SCALING == 'minmax':
scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
# tuning Pipeline
tun_classif = svm.SVC(class_weight="balanced", kernel="linear")
@@ -132,9 +184,9 @@ stabilityf = open(OUTFILE + "_stability.txt", 'w')
stability_w = csv.writer(stabilityf, delimiter='\t', lineterminator='\n')
# prepare output arrays
RANKING = np.empty((CV_K*CV_N, x.shape[1]), dtype=np.int)
RANKING = np.empty((CV_K * CV_N, x.shape[1]), dtype=np.int)
ys=[]
ys = []
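
A portability note: np.int, used here for the RANKING dtype and above for the --cv_k/--cv_n argparse types, was deprecated in NumPy 1.20 and removed in 1.24; on current NumPy the plain builtin works:

    RANKING = np.empty((CV_K * CV_N, x.shape[1]), dtype=int)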
if random_labels:
np.random.seed(0)
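
The body that fills ys is elided by the next hunk; judging from the later ys[n][idx_tr] indexing, it presumably stores one label vector per CV iteration, shuffled when --random is set. A sketch of that permutation-test idea (an assumption, not the elided code):

    for n in range(CV_N):
        ys.append(np.random.permutation(y) if random_labels else y)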
@@ -150,7 +202,20 @@ if is_multiclass:
cvmetrics_df = pd.DataFrame(columns=['Iteration', 'Fold', 'nf', 'mcc', 'auc'])
lb = preprocessing.LabelBinarizer()
else:
cvmetrics_df = pd.DataFrame(columns=['Iteration', 'Fold', 'nf', 'mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv'])
cvmetrics_df = pd.DataFrame(
columns=[
'Iteration',
'Fold',
'nf',
'mcc',
'sens',
'spec',
'acc',
'auc',
'npv',
'ppv',
]
)
for n in range(CV_N):
skf = StratifiedKFold(CV_K, shuffle=True, random_state=n)
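
This is the 10x5-CV scheme from the script's description: CV_N independent iterations, each with a fresh stratified CV_K-fold partition seeded by the iteration index. The skeleton, assuming x and y are the loaded data and labels:

    for n in range(CV_N):
        skf = StratifiedKFold(CV_K, shuffle=True, random_state=n)
        for i, (idx_tr, idx_ts) in enumerate(skf.split(x, y)):
            pass  # fit model, rank features, score each cutoff in FSTEPS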
@@ -161,30 +226,36 @@ for n in range(CV_N):
y_tr, y_ts = ys[n][idx_tr], ys[n][idx_ts]
if MODEL_TYPE == "randomForest":
model = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=n, n_jobs=1)
model = RandomForestClassifier(
n_estimators=500, criterion='gini', random_state=n, n_jobs=1
)
model.fit(x_tr, y_tr)
elif MODEL_TYPE == "LSVM":
# LSVM tuning
tuncv = StratifiedShuffleSplit(n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=i)
model_pipe = GridSearchCV(tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer)
tuncv = StratifiedShuffleSplit(
n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=i
)
model_pipe = GridSearchCV(
tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer
)
model_pipe.fit(x_tr, y_tr)
model = model_pipe.best_estimator_.get_params()['svm']
if RANK_METHOD == 'random':
ranking_tmp = np.arange(len(var_names))
np.random.seed((n*CV_K)+i)
np.random.seed((n * CV_K) + i)
np.random.shuffle(ranking_tmp)
elif RANK_METHOD == 'randomForest' and MODEL_TYPE == "randomForest":
ranking_tmp = np.argsort(model.feature_importances_)[::-1]
elif RANK_METHOD == 'KBest':
selector = SelectKBest(f_classif, k='all')
selector.fit(x_tr, y_tr)
ranking_tmp = np.argsort( -np.log10(selector.pvalues_) )[::-1]
ranking_tmp = np.argsort(-np.log10(selector.pvalues_))[::-1]
elif RANK_METHOD == "LSVM":
ranking_tmp = np.argsort( model.coef_[0]**2 )[::-1]
ranking_tmp = np.argsort(model.coef_[0] ** 2)[::-1]
elif RANK_METHOD == 'rankList':
ranking_tmp = ranked_feats_idx
RANKING[(n * CV_K) + i] = ranking_tmp
if MODEL_TYPE == "LSVM":
@@ -196,7 +267,7 @@ for n in range(CV_N):
model_mc = OneVsRestClassifier(model)
y_tr_bin = lb.fit_transform(y_tr)
y_ts_bin = lb.transform(y_ts)
for j, s in enumerate(FSTEPS):
v = RANKING[(n * CV_K) + i][:s]
x_tr_fs, x_ts_fs = x_tr[:, v], x_ts[:, v]
@@ -211,21 +282,39 @@ for n in range(CV_N):
elif MODEL_TYPE == "LSVM":
auc = np.nan
cvmetrics_df = cvmetrics_df.append({'Iteration': n, 'Fold': i, 'nf': s,
'mcc': matthews_corrcoef(y_ts, yp),
'auc': auc}, ignore_index=True)
cvmetrics_df = cvmetrics_df.append(
{
'Iteration': n,
'Fold': i,
'nf': s,
'mcc': matthews_corrcoef(y_ts, yp),
'auc': auc,
},
ignore_index=True,
)
else:
cvmetrics_df = cvmetrics_df.append({'Iteration': n, 'Fold': i, 'nf': s,
'mcc': matthews_corrcoef(y_ts, yp),
'sens': perf.sensitivity(y_ts, yp),
'spec': perf.specificity(y_ts, yp),
'acc': accuracy_score(y_ts, yp),
'auc': roc_auc_score(y_ts, yp),
'npv': perf.npv(y_ts, yp),
'ppv': perf.ppv(y_ts, yp)}, ignore_index=True)
cvmetrics_df[['Iteration', 'Fold', 'nf']] = cvmetrics_df[['Iteration', 'Fold', 'nf']].astype(int)
cvmetrics_df.to_csv(f"{OUTFILE}_allmetrics.txt", sep='\t', index=False, float_format="%.3f")
cvmetrics_df = cvmetrics_df.append(
{
'Iteration': n,
'Fold': i,
'nf': s,
'mcc': matthews_corrcoef(y_ts, yp),
'sens': perf.sensitivity(y_ts, yp),
'spec': perf.specificity(y_ts, yp),
'acc': accuracy_score(y_ts, yp),
'auc': roc_auc_score(y_ts, yp),
'npv': perf.npv(y_ts, yp),
'ppv': perf.ppv(y_ts, yp),
},
ignore_index=True,
)
cvmetrics_df[['Iteration', 'Fold', 'nf']] = cvmetrics_df[
['Iteration', 'Fold', 'nf']
].astype(int)
cvmetrics_df.to_csv(
f"{OUTFILE}_allmetrics.txt", sep='\t', index=False, float_format="%.3f"
)
# write all rankings
np.savetxt(OUTFILE + "_ranking.csv.gz", RANKING, fmt='%d', delimiter='\t')
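
A compatibility note on the metrics bookkeeping: DataFrame.append, used throughout the loop above, was deprecated in pandas 1.4 and removed in 2.0. On newer pandas the idiomatic pattern is to collect plain dicts and build the frame once, e.g.:

    rows = []
    # inside the CV loop:
    rows.append({'Iteration': n, 'Fold': i, 'nf': s, 'mcc': mcc, 'auc': auc})
    # after the loop:
    cvmetrics_df = pd.DataFrame(rows)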
@@ -238,23 +327,45 @@ if is_multiclass:
AMCC, AAUC = [avg_df[metric].values for metric in ['mcc', 'auc']]
MCC_CI, AUC_CI = ([] for i in range(2))
else:
AMCC, ASENS, ASPEC, AACC, AAUC, ANPV, APPV = [avg_df[metric].values for metric in ['mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv']]
AMCC, ASENS, ASPEC, AACC, AAUC, ANPV, APPV = [
avg_df[metric].values
for metric in ['mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv']
]
MCC_CI, SENS_CI, SPEC_CI, ACC_CI, AUC_CI, NPV_CI, PPV_CI = ([] for i in range(7))
for nf in FSTEPS:
res = bs.bootstrap(cvmetrics_df['mcc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
res = bs.bootstrap(
cvmetrics_df['mcc'][cvmetrics_df['nf'] == nf].values, stat_func=bs_stats.mean
)
MCC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['auc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
res = bs.bootstrap(
cvmetrics_df['auc'][cvmetrics_df['nf'] == nf].values, stat_func=bs_stats.mean
)
AUC_CI.append([res.lower_bound, res.upper_bound])
if not is_multiclass:
res = bs.bootstrap(cvmetrics_df['sens'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
res = bs.bootstrap(
cvmetrics_df['sens'][cvmetrics_df['nf'] == nf].values,
stat_func=bs_stats.mean,
)
SENS_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['spec'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
res = bs.bootstrap(
cvmetrics_df['spec'][cvmetrics_df['nf'] == nf].values,
stat_func=bs_stats.mean,
)
SPEC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['acc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
res = bs.bootstrap(
cvmetrics_df['acc'][cvmetrics_df['nf'] == nf].values,
stat_func=bs_stats.mean,
)
ACC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['npv'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
res = bs.bootstrap(
cvmetrics_df['npv'][cvmetrics_df['nf'] == nf].values,
stat_func=bs_stats.mean,
)
NPV_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['ppv'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
res = bs.bootstrap(
cvmetrics_df['ppv'][cvmetrics_df['nf'] == nf].values,
stat_func=bs_stats.mean,
)
PPV_CI.append([res.lower_bound, res.upper_bound])
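
The bs/bs_stats names suggest the bootstrapped package, whose bootstrap() returns an object exposing lower_bound and upper_bound for a confidence interval around the statistic. The underlying idea, as an illustrative percentile-bootstrap sketch (not that package's implementation):

    def bootstrap_mean_ci(values, n_boot=10000, alpha=0.05, seed=0):
        rng = np.random.default_rng(seed)
        means = np.array([rng.choice(values, size=len(values), replace=True).mean()
                          for _ in range(n_boot)])
        return np.quantile(means, [alpha / 2, 1 - alpha / 2])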
# Borda list
@@ -264,32 +375,92 @@ opt_feats = FSTEPS[np.argmax(AMCC)]
# Canberra stability indicator
STABILITY = []
PR = np.argsort( RANKING )
PR = np.argsort(RANKING)
for ss in FSTEPS:
STABILITY.append( canberra_stability(PR, ss) )
STABILITY.append(canberra_stability(PR, ss))
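
borda_count and canberra_stability come from mlpy (imported at the top): the former aggregates the CV_N * CV_K per-fold rankings into a consensus list, the latter scores how stable the top-ss sublists are across folds. A generic Borda-style aggregation, as an illustrative sketch rather than mlpy's implementation:

    def borda_mean_positions(rankings):
        # rankings[r, p] = feature index placed at position p in ranking r
        positions = np.argsort(rankings, axis=1)  # positions[r, f] = rank of feature f
        mean_pos = positions.mean(axis=0)
        order = np.argsort(mean_pos)              # best (lowest mean position) first
        return order, mean_pos[order]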
n_cols = 7 if is_multiclass else 22
metrics_array = np.empty((len(FSTEPS), n_cols))
for j,s in enumerate(FSTEPS):
for j, s in enumerate(FSTEPS):
if is_multiclass:
metrics_array[j] = np.array([s, AMCC[j], MCC_CI[j][0], MCC_CI[j][1],
AAUC[j], AUC_CI[j][0], AUC_CI[j][1]])
metrics_array[j] = np.array(
[
s,
AMCC[j],
MCC_CI[j][0],
MCC_CI[j][1],
AAUC[j],
AUC_CI[j][0],
AUC_CI[j][1],
]
)
else:
metrics_array[j] = np.array([s, AMCC[j], MCC_CI[j][0], MCC_CI[j][1], ASENS[j], SENS_CI[j][0], SENS_CI[j][1],
ASPEC[j], SPEC_CI[j][0], SPEC_CI[j][1], AACC[j], ACC_CI[j][0], ACC_CI[j][1],
AAUC[j], AUC_CI[j][0], AUC_CI[j][1], ANPV[j], NPV_CI[j][0], NPV_CI[j][1],
APPV[j], PPV_CI[j][0], PPV_CI[j][1]])
metrics_array[j] = np.array(
[
s,
AMCC[j],
MCC_CI[j][0],
MCC_CI[j][1],
ASENS[j],
SENS_CI[j][0],
SENS_CI[j][1],
ASPEC[j],
SPEC_CI[j][0],
SPEC_CI[j][1],
AACC[j],
ACC_CI[j][0],
ACC_CI[j][1],
AAUC[j],
AUC_CI[j][0],
AUC_CI[j][1],
ANPV[j],
NPV_CI[j][0],
NPV_CI[j][1],
APPV[j],
PPV_CI[j][0],
PPV_CI[j][1],
]
)
if is_multiclass:
metrics_df = pd.DataFrame(metrics_array, columns=["nf", "mcc", "mcc_min", "mcc_max", "auc", "auc_min", "auc_max"])
metrics_df = pd.DataFrame(
metrics_array,
columns=["nf", "mcc", "mcc_min", "mcc_max", "auc", "auc_min", "auc_max"],
)
else:
metrics_df = pd.DataFrame(metrics_array, columns=["nf", "mcc", "mcc_min", "mcc_max", "sens", "sens_min", "sens_max", "spec", "spec_min", "spec_max", "acc", "acc_min", "acc_max", "auc", "auc_min", "auc_max", "npv", "npv_min", "npv_max", "ppv", "ppv_min", "ppv_max"])
metrics_df = pd.DataFrame(
metrics_array,
columns=[
"nf",
"mcc",
"mcc_min",
"mcc_max",
"sens",
"sens_min",
"sens_max",
"spec",
"spec_min",
"spec_max",
"acc",
"acc_min",
"acc_max",
"auc",
"auc_min",
"auc_max",
"npv",
"npv_min",
"npv_max",
"ppv",
"ppv_min",
"ppv_max",
],
)
metrics_df['nf'] = metrics_df['nf'].astype(int)
metrics_df.to_csv(f"{OUTFILE}_metrics.txt", sep='\t', index=False, float_format="%.3f")
stability_w.writerow(["STEP", "STABILITY"])
for j, s in enumerate(FSTEPS):
stability_w.writerow( [s, STABILITY[j]] )
stability_w.writerow([s, STABILITY[j]])
stabilityf.close()
@@ -297,9 +468,14 @@ borda_array = np.empty((len(BORDA_ID), 3))
borda_df = pd.DataFrame(columns=["FEATURE_ID", "FEATURE_NAME", "MEAN_POS"])
for i, pos in zip(BORDA_ID, BORDA_POS):
borda_df = borda_df.append({'FEATURE_ID': i, 'FEATURE_NAME': var_names[i], 'MEAN_POS': pos+1}, ignore_index=True)
borda_df = borda_df.append(
{'FEATURE_ID': i, 'FEATURE_NAME': var_names[i], 'MEAN_POS': pos + 1},
ignore_index=True,
)
borda_df.to_csv(f"{OUTFILE}_featurelist.txt", sep='\t', index=False, float_format="%.3f")
borda_df.to_csv(
f"{OUTFILE}_featurelist.txt", sep='\t', index=False, float_format="%.3f"
)
logf = open(OUTFILE + ".log", 'w')
config = ConfigParser.RawConfigParser()
@@ -311,18 +487,18 @@ config.add_section("CV PARAMETERS")
config.set("CV PARAMETERS", "Folds", CV_K)
config.set("CV PARAMETERS", "Iterations", CV_N)
config.add_section("INPUT")
config.set("INPUT", "Data", os.path.realpath( DATAFILE ))
config.set("INPUT", "Labels", os.path.realpath( LABELSFILE ))
config.set("INPUT", "Data", os.path.realpath(DATAFILE))
config.set("INPUT", "Labels", os.path.realpath(LABELSFILE))
config.set("INPUT", "Classifier", MODEL_TYPE)
if MODEL_TYPE == "LSVM":
config.set("INPUT", "Scaling", SCALING)
config.set("INPUT", "Rank_method", RANK_METHOD)
config.set("INPUT", "Random_labels", random_labels)
config.add_section("OUTPUT")
config.set("OUTPUT", "Metrics", os.path.realpath( OUTFILE + "_metrics.txt" ))
config.set("OUTPUT", "Borda", os.path.realpath( OUTFILE + "_featurelist.txt" ))
config.set("OUTPUT", "Internal", os.path.realpath( OUTFILE + "_internal.txt" ))
config.set("OUTPUT", "Stability", os.path.realpath( OUTFILE + "_stability.txt" ))
config.set("OUTPUT", "Metrics", os.path.realpath(OUTFILE + "_metrics.txt"))
config.set("OUTPUT", "Borda", os.path.realpath(OUTFILE + "_featurelist.txt"))
config.set("OUTPUT", "Internal", os.path.realpath(OUTFILE + "_internal.txt"))
config.set("OUTPUT", "Stability", os.path.realpath(OUTFILE + "_stability.txt"))
config.set("OUTPUT", "MCC", np.max(AMCC))
config.set("OUTPUT", "N_feats", opt_feats)
config.write(logf)
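
ConfigParser.RawConfigParser is the Python 2 module name (configparser in Python 3, so the elided import presumably aliases it). Reading the run log back is then straightforward; a sketch assuming Python 3:

    import configparser
    cfg = configparser.RawConfigParser()
    cfg.read(OUTFILE + '.log')
    print(cfg.get('OUTPUT', 'N_feats'))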