Commit d4dc5aa8 authored by Marco Chierici

Add precision, recall; Black formatting

parent add8a9fa
@@ -13,53 +13,59 @@ import numpy as np
 import pandas as pd
 from sklearn import preprocessing, svm
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import (accuracy_score, make_scorer, matthews_corrcoef,
-                             roc_auc_score)
+from sklearn.metrics import (
+    accuracy_score,
+    make_scorer,
+    matthews_corrcoef,
+    roc_auc_score,
+)
 from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
 from sklearn.pipeline import Pipeline
+import performance as perf
 from extract_topfeats import extract_feats
 from input_output import load_data
 parser = argparse.ArgumentParser(
-    description='Run a validation experiment using LibLinear.'
+    description="Run a validation experiment using LibLinear."
 )
 parser.add_argument(
-    'CONFIGFILE', type=str, help='Training experiment configuration file'
+    "CONFIGFILE", type=str, help="Training experiment configuration file"
 )
-parser.add_argument('TSFILE', type=str, help='Validation datafile')
-parser.add_argument('OUTDIR', type=str, help='Output directory')
+parser.add_argument("TSFILE", type=str, help="Validation datafile")
+parser.add_argument("OUTDIR", type=str, help="Output directory")
 parser.add_argument(
-    '--tslab', type=str, default=None, help='Validation labels, if available'
+    "--tslab", type=str, default=None, help="Validation labels, if available"
 )
 parser.add_argument(
-    '--nf', type=int, default=None, help='Custom number of top features'
+    "--nf", type=int, default=None, help="Custom number of top features"
 )
-__author__ = 'Marco Chierici, Alessandro Zandona'
-__date__ = '15 December 2016'
+__author__ = "Marco Chierici, Alessandro Zandona"
+__date__ = "15 December 2016"
 if len(sys.argv) == 1:
     parser.print_help()
     sys.exit(1)
 args = parser.parse_args()
-CONFIGFILE = vars(args)['CONFIGFILE']
-TSFILE = vars(args)['TSFILE']
-OUTDIR = vars(args)['OUTDIR']
-TSLABELSFILE = vars(args)['tslab']
-NFEATS = vars(args)['nf']
+CONFIGFILE = vars(args)["CONFIGFILE"]
+TSFILE = vars(args)["TSFILE"]
+OUTDIR = vars(args)["OUTDIR"]
+TSLABELSFILE = vars(args)["tslab"]
+NFEATS = vars(args)["nf"]
 # number of Montecarlo CV cycles (for SVM tuning)
 TUN_CV_K = 10
 # fraction of the dataset to keep apart as test split (for SVM tuning)
 TUN_CV_P = 0.5
 # list of parameters for LSVM tuning
-TUN_PARAMS_LSVM = [{'svm__C': [10 ** int(k) for k in np.arange(-2, 3)]}]
+TUN_PARAMS_LSVM = [{"svm__C": [10 ** int(k) for k in np.arange(-2, 3)]}]
 config = ConfigParser.RawConfigParser()
 config.read(CONFIGFILE)
-if not config.has_section('INPUT'):
+if not config.has_section("INPUT"):
     print("%s is not a valid configuration file." % CONFIGFILE)
     sys.exit(3)
@@ -74,10 +80,10 @@ BASEFILE = os.path.splitext(TRFILE)[0]
 OUTFILE = os.path.join(OUTDIR, os.path.basename(BASEFILE))
 # extract the top-ranked NFEATS features from TRAINING set
-TR_TOPFEATS = OUTFILE + '_top%s_tr.txt' % NFEATS
+TR_TOPFEATS = OUTFILE + "_top%s_tr.txt" % NFEATS
 extract_feats(TRFILE, RANK, NFEATS, TR_TOPFEATS)
 # extract the top-ranked NFEATS features from VALIDATION set
-TS_TOPFEATS = OUTFILE + '_top%s_ts.txt' % NFEATS
+TS_TOPFEATS = OUTFILE + "_top%s_ts.txt" % NFEATS
 extract_feats(TSFILE, RANK, NFEATS, TS_TOPFEATS)
 # initialize LabelEncoder
@@ -85,7 +91,7 @@ le = preprocessing.LabelEncoder()
 # load data
 sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
-y_tr = pd.read_csv(LABELSFILE, sep='\t', header=None, dtype=str).values
+y_tr = pd.read_csv(LABELSFILE, sep="\t", header=None, dtype=str).values
 y_tr = le.fit_transform(y_tr)
 sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)
 # load the TS labels if available
@@ -93,20 +99,21 @@ if TSLABELSFILE is not None:
     y_ts = pd.read_csv(TSLABELSFILE, header=None, dtype=str).values
     y_ts = le.transform(y_ts)
+is_multiclass = len(le.classes_) > 2
 # define classifier
 if MODEL_TYPE == "randomForest":
-    model = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=0)
+    model = RandomForestClassifier(n_estimators=500, criterion="gini", random_state=0)
 elif MODEL_TYPE == "LSVM":
     SCALING = config.get("INPUT", "Scaling")
-    if SCALING == 'std':
+    if SCALING == "std":
         scaler = preprocessing.StandardScaler()
-    elif SCALING == 'minmax':
+    elif SCALING == "minmax":
         scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
     scorer = make_scorer(matthews_corrcoef)
     # tuning Pipeline
     tun_classif = svm.SVC(class_weight="balanced", kernel="linear")
-    pipeline_steps = [('scaler', scaler), ('svm', tun_classif)]
+    pipeline_steps = [("scaler", scaler), ("svm", tun_classif)]
     tuning_pipeline = Pipeline(pipeline_steps)
     # LSVM tuning
     tuncv = StratifiedShuffleSplit(
@@ -141,6 +148,22 @@ if TSLABELSFILE is not None:
 with open(OUTFILE + "_MCC_scores.txt", "w") as fout:
     fout.write("MCC_train\t%.5f\n" % (matthews_corrcoef(y_tr, p_tr)))
     fout.write("MCC_validation\t%.5f\n" % (matthews_corrcoef(y_ts, p_ts)))
+    # additional metrics
+    if not is_multiclass:
+        # sensitivity, or recall
+        fout.write("SENS_train\t%.5f\n" % (perf.sensitivity(y_tr, p_tr)))
+        fout.write("SENS_validation\t%.5f\n" % (perf.sensitivity(y_ts, p_ts)))
+        # positive predictive value, or precision
+        fout.write("PPV_train\t%.5f\n" % (perf.ppv(y_tr, p_tr)))
+        fout.write("PPV_validation\t%.5f\n" % (perf.ppv(y_ts, p_ts)))
+    else:
+        # sensitivity, or recall
+        fout.write("SENS_train\t%.5f\n" % (np.nan))
+        fout.write("SENS_validation\t%.5f\n" % (np.nan))
+        # positive predictive value, or precision
+        fout.write("PPV_train\t%.5f\n" % (np.nan))
+        fout.write("PPV_validation\t%.5f\n" % (np.nan))
 with open(OUTFILE + "_TEST_pred_tr.txt", "w") as fout:
     for i in range(len(sample_names_tr)):
@@ -153,8 +176,8 @@ with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
 np.savetxt(
     OUTFILE + "_TEST_signature.txt",
     np.array(var_names_tr).reshape(-1, 1),
-    fmt='%s',
-    delimiter='\t',
+    fmt="%s",
+    delimiter="\t",
 )
 if MODEL_TYPE == "randomForest":
...
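
Note: the new SENS/PPV lines call perf.sensitivity and perf.ppv from the project-local performance module, whose implementation is not part of this diff. A minimal sketch of what those helpers might look like, assuming they simply wrap scikit-learn's binary recall and precision (only the names and the (y_true, y_pred) signature are taken from the calls above; the actual module may differ):

# performance.py -- hypothetical sketch, not part of this commit
from sklearn.metrics import precision_score, recall_score


def sensitivity(y_true, y_pred):
    # sensitivity (true positive rate) is the recall of the positive class
    return recall_score(y_true, y_pred)


def ppv(y_true, y_pred):
    # positive predictive value is the precision of the positive class
    return precision_score(y_true, y_pred)

Under that assumption, perf.sensitivity(y_ts, p_ts) and perf.ppv(y_ts, p_ts) would match recall_score and precision_score computed on the label-encoded binary classes.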
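For context on the new is_multiclass guard: scikit-learn's precision and recall default to binary averaging and reject multiclass targets unless an explicit average= strategy is chosen, which is presumably why the script writes NaN for SENS/PPV when there are more than two classes. A small illustration, assuming standard scikit-learn behaviour:

from sklearn.metrics import recall_score

y_true = [0, 1, 2, 2]
y_pred = [0, 2, 2, 1]
try:
    recall_score(y_true, y_pred)  # default average="binary" raises on 3 classes
except ValueError:
    # multiclass targets require an explicit averaging strategy
    print(recall_score(y_true, y_pred, average="macro"))  # prints 0.5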