Commit 41142126 authored by Marco Chierici
Browse files

New unified RF/LSVM validation script

parent 6e36c818
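## This code is written by Marco Chierici and Alessandro Zandona'.
## Based on code previously written by Davide Albanese.
## Requires Python 3 (it imports `configparser`), numpy, pandas and scikit-learn;
## mlpy >= 3.5 may still be needed by the local helper modules (TODO confirm).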
import argparse
import configparser as ConfigParser
import os.path
import sys
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
import performance as perf
from extract_topfeats import extract_feats
from input_output import load_data
# Command-line interface: three positional arguments (training configuration
# file, validation data file, output directory) plus two optional settings.
parser = argparse.ArgumentParser(description='Run a validation experiment using LibLinear.')
parser.add_argument('CONFIGFILE', type=str, help='Training experiment configuration file')
parser.add_argument('TSFILE', type=str, help='Validation datafile')
parser.add_argument('OUTDIR', type=str, help='Output directory')
parser.add_argument('--tslab', type=str, default=None, help='Validation labels, if available')
parser.add_argument('--nf', type=int, default=None, help='Custom number of top features')

__author__ = 'Marco Chierici, Alessandro Zandona'
__date__ = '15 December 2016'

# When invoked without any argument, show the full help text and stop
# instead of letting argparse emit its terse "missing arguments" error.
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()
# CONFIGFILE is used further down (configuration loading and its error
# message), so it has to be bound together with the other arguments.
CONFIGFILE = args.CONFIGFILE
TSFILE = args.TSFILE
OUTDIR = args.OUTDIR
TSLABELSFILE = args.tslab  # None when no validation labels are supplied
NFEATS = args.nf  # None means "take the value from the configuration file"
# --- settings for the LSVM hyper-parameter search ---
# how many Monte Carlo cross-validation resamplings to run
TUN_CV_K = 10
# share of the samples held out as the test split in each resampling
TUN_CV_P = 0.5
# grid of candidate values for the SVM regularisation parameter C
# (powers of ten from 10^-2 to 10^2; 'svm__C' addresses the 'svm' pipeline step)
TUN_PARAMS_LSVM = [{'svm__C': [10 ** exponent for exponent in range(-2, 3)]}]
# Load the configuration written by the training experiment.
config = ConfigParser.RawConfigParser()
# read() silently ignores files it cannot open, so a missing or unrelated
# file is detected by the section check right below.
config.read(CONFIGFILE)
if not config.has_section('INPUT'):
    print("%s is not a valid configuration file." % CONFIGFILE)
    # stop here: every lookup below would fail on the missing section
    sys.exit(1)

TRFILE = config.get("INPUT", "Data")
LABELSFILE = config.get("INPUT", "Labels")
MODEL_TYPE = config.get("INPUT", "Classifier")
RANK = config.get("OUTPUT", "Borda")
# a value given on the command line (--nf) takes precedence over the config
if NFEATS is None:
    NFEATS = config.getint("OUTPUT", "N_feats")

# output files share the training file's base name, placed inside OUTDIR
BASEFILE = os.path.splitext(TRFILE)[0]
OUTFILE = os.path.join(OUTDIR, os.path.basename(BASEFILE))

# NOTE(review): extract_feats is imported but never called in this script, and
# TSFILE / RANK are otherwise unused. Presumably the two files named below are
# meant to be produced here by extract_feats from TRFILE and TSFILE using the
# RANK list - confirm against extract_topfeats before relying on them existing.
# file holding the top-ranked NFEATS features of the TRAINING set
TR_TOPFEATS = OUTFILE + '_top%s_tr.txt' % NFEATS
# file holding the top-ranked NFEATS features of the VALIDATION set
TS_TOPFEATS = OUTFILE + '_top%s_ts.txt' % NFEATS
# LabelEncoder maps the class labels to integer codes; it is fitted on the
# training labels and reused for the validation labels and for decoding.
le = preprocessing.LabelEncoder()

# load the training data (top-feature subset) and its labels
sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
# read_csv(...).values is a single-column 2-D array; flatten it because
# LabelEncoder expects a 1-D label vector
y_tr = pd.read_csv(LABELSFILE, sep='\t', header=None).values.ravel()
y_tr = le.fit_transform(y_tr)

# load the validation data (top-feature subset)
sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)

# load the validation labels if available; y_ts stays None otherwise so the
# name is always bound
y_ts = None
if TSLABELSFILE is not None:
    y_ts = pd.read_csv(TSLABELSFILE, sep='\t', header=None).values.ravel()
    # transform (not fit_transform): reuse the encoding learnt on training labels
    y_ts = le.transform(y_ts)
# Define the classifier selected in the configuration file.
if MODEL_TYPE == "randomForest":
    model = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=0)
elif MODEL_TYPE == "LSVM":
    # feature scaling is part of the pipeline so it is re-fitted on each
    # tuning split
    SCALING = config.get("INPUT", "Scaling")
    if SCALING == 'std':
        scaler = preprocessing.StandardScaler()
    elif SCALING == 'minmax':
        scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    else:
        # fail with a clear message instead of a NameError on `scaler` below
        sys.exit("Unsupported Scaling value in configuration: %s" % SCALING)

    # model selection is driven by the Matthews correlation coefficient
    scorer = make_scorer(matthews_corrcoef)

    # tuning Pipeline: scaler followed by a linear-kernel SVM
    tun_classif = svm.SVC(class_weight="balanced", kernel="linear")
    pipeline_steps = [('scaler', scaler), ('svm', tun_classif)]
    tuning_pipeline = Pipeline(pipeline_steps)

    # LSVM tuning: grid search over TUN_PARAMS_LSVM on stratified random splits
    tuncv = StratifiedShuffleSplit(n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=0)
    model = GridSearchCV(tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer)
else:
    # fail with a clear message instead of a NameError on `model` later on
    sys.exit("Unsupported Classifier value in configuration: %s" % MODEL_TYPE)
# train: the model must be fitted on the training set before any prediction
model.fit(x_tr, y_tr)

# predict on both the training and the validation set
p_tr = model.predict(x_tr)
p_ts = model.predict(x_ts)

# decode the integer predictions back to the original class labels
p_tr_dec = le.inverse_transform(p_tr)
p_ts_dec = le.inverse_transform(p_ts)

# class probabilities are computed only for RandomForest; no probabilities
# are produced for the LSVM model
if MODEL_TYPE == "randomForest":
    prob_tr = model.predict_proba(x_tr)
    prob_ts = model.predict_proba(x_ts)

print("MCC on train: %.3f" % (matthews_corrcoef(y_tr, p_tr)))
# the validation MCC can only be computed when validation labels were given
if TSLABELSFILE is not None:
    print("MCC on validation: %.3f" % (matthews_corrcoef(y_ts, p_ts)))
# write output files

# save MCC_train and, when validation labels are available, MCC_validation
with open(OUTFILE + "_MCC_scores.txt", "w") as fout:
    fout.write("MCC_train\t%.5f\n" % (matthews_corrcoef(y_tr, p_tr)))
    # without validation labels there is no y_ts to score against
    if TSLABELSFILE is not None:
        fout.write("MCC_validation\t%.5f\n" % (matthews_corrcoef(y_ts, p_ts)))

# predicted (decoded) class for every training sample
with open(OUTFILE + "_TEST_pred_tr.txt", "w") as fout:
    for sample_name, predicted in zip(sample_names_tr, p_tr_dec):
        fout.write("%s\t%s\n" % (sample_name, predicted))

# predicted (decoded) class for every validation sample
with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
    for sample_name, predicted in zip(sample_names_ts, p_ts_dec):
        fout.write("%s\t%s\n" % (sample_name, predicted))

# NOTE(review): the array argument of this call was missing; presumably the
# signature is the list of feature names loaded with the training data, one
# per line - confirm.
np.savetxt(OUTFILE + "_TEST_signature.txt",
           np.array(var_names_tr).reshape(-1, 1),
           fmt='%s', delimiter='\t')

# class probabilities exist only for the RandomForest model
if MODEL_TYPE == "randomForest":
    # NOTE(review): the header and the two columns written assume a
    # two-class problem - confirm.
    with open(OUTFILE + "_TEST_prob_tr.txt", "w") as fout:
        fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
        for sample_name, probs in zip(sample_names_tr, prob_tr):
            fout.write("%s\t%f\t%f\n" % (sample_name, probs[0], probs[1]))
    with open(OUTFILE + "_TEST_prob_ts.txt", "w") as fout:
        fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
        for sample_name, probs in zip(sample_names_ts, prob_ts):
            fout.write("%s\t%f\t%f\n" % (sample_name, probs[0], probs[1]))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment