Commit d4dc5aa8 authored by Marco Chierici
Browse files

Add precision, recall; Black formatting

parent add8a9fa
......@@ -13,53 +13,59 @@ import numpy as np
# NOTE(review): this chunk is a commit-diff capture; pre-commit duplicate lines
# were dropped, the post-commit (Black-formatted) lines kept, and the stripped
# indentation restored.
import pandas as pd
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    make_scorer,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

import performance as perf
from extract_topfeats import extract_feats
from input_output import load_data

# Command-line interface: training configuration, validation data, output dir.
parser = argparse.ArgumentParser(
    description="Run a validation experiment using LibLinear."
)
parser.add_argument(
    "CONFIGFILE", type=str, help="Training experiment configuration file"
)
parser.add_argument("TSFILE", type=str, help="Validation datafile")
parser.add_argument("OUTDIR", type=str, help="Output directory")
parser.add_argument(
    "--tslab", type=str, default=None, help="Validation labels, if available"
)
parser.add_argument(
    "--nf", type=int, default=None, help="Custom number of top features"
)

__author__ = "Marco Chierici, Alessandro Zandona"
__date__ = "15 December 2016"

# With no arguments at all, print usage and bail out instead of erroring.
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()
CONFIGFILE = vars(args)["CONFIGFILE"]
TSFILE = vars(args)["TSFILE"]
OUTDIR = vars(args)["OUTDIR"]
TSLABELSFILE = vars(args)["tslab"]
NFEATS = vars(args)["nf"]

# number of Montecarlo CV cycles (for SVM tuning)
TUN_CV_K = 10
# fraction of the dataset to keep apart as test split (for SVM tuning)
TUN_CV_P = 0.5
# list of parameters for LSVM tuning (C on a log10 grid from 1e-2 to 1e2)
TUN_PARAMS_LSVM = [{"svm__C": [10 ** int(k) for k in np.arange(-2, 3)]}]

# ConfigParser (capitalized) indicates this script targets Python 2.
config = ConfigParser.RawConfigParser()
config.read(CONFIGFILE)
if not config.has_section("INPUT"):
    print("%s is not a valid configuration file." % CONFIGFILE)
    sys.exit(3)
......@@ -74,10 +80,10 @@ BASEFILE = os.path.splitext(TRFILE)[0]
# NOTE(review): diff capture — pre-commit duplicates dropped, post-commit lines kept.
# Output files share the training file's basename inside OUTDIR.
OUTFILE = os.path.join(OUTDIR, os.path.basename(BASEFILE))

# extract the top-ranked NFEATS features from TRAINING set
TR_TOPFEATS = OUTFILE + "_top%s_tr.txt" % NFEATS
extract_feats(TRFILE, RANK, NFEATS, TR_TOPFEATS)
# extract the top-ranked NFEATS features from VALIDATION set
TS_TOPFEATS = OUTFILE + "_top%s_ts.txt" % NFEATS
extract_feats(TSFILE, RANK, NFEATS, TS_TOPFEATS)
# initialize LabelEncoder
......@@ -85,7 +91,7 @@ le = preprocessing.LabelEncoder()
# load data (diff capture — pre-commit duplicate of the read_csv line dropped)
sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
# Labels are read as strings, then integer-encoded by the LabelEncoder.
y_tr = pd.read_csv(LABELSFILE, sep="\t", header=None, dtype=str).values
y_tr = le.fit_transform(y_tr)
sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)
# load the TS labels if available
......@@ -93,20 +99,21 @@ if TSLABELSFILE is not None:
# NOTE(review): diff capture — pre-commit duplicates dropped, post-commit lines
# kept. Per the hunk header, the two y_ts lines belong inside
# "if TSLABELSFILE is not None:" — confirm against the full file.
y_ts = pd.read_csv(TSLABELSFILE, header=None, dtype=str).values
# Encode with the encoder fitted on the training labels (consistent mapping).
y_ts = le.transform(y_ts)

# True when the training labels contain more than two classes.
is_multiclass = len(le.classes_) > 2

# define classifier
if MODEL_TYPE == "randomForest":
    model = RandomForestClassifier(n_estimators=500, criterion="gini", random_state=0)
elif MODEL_TYPE == "LSVM":
    SCALING = config.get("INPUT", "Scaling")
    if SCALING == "std":
        scaler = preprocessing.StandardScaler()
    elif SCALING == "minmax":
        # scale into [-1, 1], the customary range for linear SVMs
        scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    # NOTE(review): indentation was stripped by the capture; the tuning setup
    # below is SVM-specific so it presumably lives in this elif — TODO confirm.
    scorer = make_scorer(matthews_corrcoef)
    # tuning Pipeline: scaling fitted inside CV to avoid leakage
    tun_classif = svm.SVC(class_weight="balanced", kernel="linear")
    pipeline_steps = [("scaler", scaler), ("svm", tun_classif)]
    tuning_pipeline = Pipeline(pipeline_steps)
tuncv = StratifiedShuffleSplit(
......@@ -141,6 +148,22 @@ if TSLABELSFILE is not None:
# NOTE(review): diff capture; per the hunk header this whole block sits inside
# "if TSLABELSFILE is not None:" in the real file — indentation restored
# relative to the "with" only.
with open(OUTFILE + "_MCC_scores.txt", "w") as fout:
    fout.write("MCC_train\t%.5f\n" % (matthews_corrcoef(y_tr, p_tr)))
    fout.write("MCC_validation\t%.5f\n" % (matthews_corrcoef(y_ts, p_ts)))
    # additional metrics (added in this commit)
    if not is_multiclass:
        # sensitivity, or recall
        fout.write("SENS_train\t%.5f\n" % (perf.sensitivity(y_tr, p_tr)))
        fout.write("SENS_validation\t%.5f\n" % (perf.sensitivity(y_ts, p_ts)))
        # positive predictive value, or precision
        fout.write("PPV_train\t%.5f\n" % (perf.ppv(y_tr, p_tr)))
        fout.write("PPV_validation\t%.5f\n" % (perf.ppv(y_ts, p_ts)))
    else:
        # binary-only metrics: emit NaN placeholders in the multiclass case
        # sensitivity, or recall
        fout.write("SENS_train\t%.5f\n" % (np.nan))
        fout.write("SENS_validation\t%.5f\n" % (np.nan))
        # positive predictive value, or precision
        fout.write("PPV_train\t%.5f\n" % (np.nan))
        fout.write("PPV_validation\t%.5f\n" % (np.nan))
with open(OUTFILE + "_TEST_pred_tr.txt", "w") as fout:
for i in range(len(sample_names_tr)):
......@@ -153,8 +176,8 @@ with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
# Write the selected feature names (the "signature"), one per line.
# (diff capture — pre-commit fmt/delimiter duplicates dropped)
np.savetxt(
    OUTFILE + "_TEST_signature.txt",
    np.array(var_names_tr).reshape(-1, 1),
    fmt="%s",
    delimiter="\t",
)
if MODEL_TYPE == "randomForest":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment