Commit 7a219819 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Fix missing import and black formatting

parent 7cb397cb
......@@ -14,12 +14,12 @@ import bootstrapped.stats_functions as bs_stats
import numpy as np
import pandas as pd
from mlpy import borda_count, canberra_stability
from sklearn import preprocessing
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import (StratifiedKFold, StratifiedShuffleSplit,
train_test_split, GridSearchCV)
from sklearn.metrics import (accuracy_score, make_scorer, matthews_corrcoef,
roc_auc_score)
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
StratifiedShuffleSplit, train_test_split)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
......@@ -326,4 +326,4 @@ config.set("OUTPUT", "Stability", os.path.realpath( OUTFILE + "_stability.txt" )
config.set("OUTPUT", "MCC", np.max(AMCC))
config.set("OUTPUT", "N_feats", opt_feats)
config.write(logf)
logf.close()
\ No newline at end of file
logf.close()
......@@ -11,28 +11,34 @@ import sys
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn import preprocessing
from sklearn import preprocessing, svm
from sklearn.metrics import (accuracy_score, make_scorer, matthews_corrcoef,
roc_auc_score)
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
import performance as perf
from extract_topfeats import extract_feats
from input_output import load_data
parser = argparse.ArgumentParser(description='Run a validation experiment using LibLinear.')
parser.add_argument('CONFIGFILE', type=str, help='Training experiment configuration file')
parser = argparse.ArgumentParser(
description='Run a validation experiment using LibLinear.'
)
parser.add_argument(
'CONFIGFILE', type=str, help='Training experiment configuration file'
)
parser.add_argument('TSFILE', type=str, help='Validation datafile')
parser.add_argument('OUTDIR', type=str, help='Output directory')
parser.add_argument('--tslab', type=str, default=None, help='Validation labels, if available')
parser.add_argument('--nf', type=int, default=None, help='Custom number of top features')
parser.add_argument(
'--tslab', type=str, default=None, help='Validation labels, if available'
)
parser.add_argument(
'--nf', type=int, default=None, help='Custom number of top features'
)
__author__ = 'Marco Chierici, Alessandro Zandona'
__date__ = '15 December 2016'
__author__ = 'Marco Chierici, Alessandro Zandona'
__date__ = '15 December 2016'
if len(sys.argv)==1:
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
......@@ -48,7 +54,7 @@ TUN_CV_K = 10
# fraction of the dataset to keep apart as test split (for SVM tuning)
TUN_CV_P = 0.5
# list of parameters for LSVM tuning
TUN_PARAMS_LSVM = [{'svm__C': [10**int(k) for k in np.arange(-2, 3)]}]
TUN_PARAMS_LSVM = [{'svm__C': [10 ** int(k) for k in np.arange(-2, 3)]}]
config = ConfigParser.RawConfigParser()
config.read(CONFIGFILE)
......@@ -80,12 +86,14 @@ le = preprocessing.LabelEncoder()
sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
y_tr = pd.read_csv(LABELSFILE, sep='\t', header=None).values
y_tr = le.fit_transform(y_tr)
print(y_tr.shape)
sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)
# load the TS labels if available
if TSLABELSFILE is not None:
y_ts = pd.read_csv(TSLABELSFILE, sep='\t', header=None).values
y_ts = pd.read_csv(TSLABELSFILE, sep='\t', header=None, dtype=str).values
y_ts = le.transform(y_ts)
# define classifier
if MODEL_TYPE == "randomForest":
model = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=0)
......@@ -94,15 +102,19 @@ elif MODEL_TYPE == "LSVM":
if SCALING == 'std':
scaler = preprocessing.StandardScaler()
elif SCALING == 'minmax':
scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scorer = make_scorer(matthews_corrcoef)
# tuning Pipeline
tun_classif = svm.SVC(class_weight="balanced", kernel="linear")
pipeline_steps = [('scaler', scaler), ('svm', tun_classif)]
tuning_pipeline = Pipeline(pipeline_steps)
# LSVM tuning
tuncv = StratifiedShuffleSplit(n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=0)
model = GridSearchCV(tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer)
tuncv = StratifiedShuffleSplit(
n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=0
)
model = GridSearchCV(
tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer
)
# train
model.fit(x_tr, y_tr)
......@@ -138,17 +150,24 @@ with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
for i in range(len(sample_names_ts)):
fout.write("%s\t%s\n" % (sample_names_ts[i], p_ts_dec[i]))
np.savetxt(OUTFILE + "_TEST_signature.txt",
np.array(var_names_tr).reshape(-1,1),
fmt='%s', delimiter='\t')
np.savetxt(
OUTFILE + "_TEST_signature.txt",
np.array(var_names_tr).reshape(-1, 1),
fmt='%s',
delimiter='\t',
)
if MODEL_TYPE == "randomForest":
with open(OUTFILE + "_TEST_prob_tr.txt", "w") as fout:
fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
for i in range(len(sample_names_tr)):
fout.write("%s\t%f\t%f\n" % (sample_names_tr[i], prob_tr[i,0], prob_tr[i,1]))
fout.write(
"%s\t%f\t%f\n" % (sample_names_tr[i], prob_tr[i, 0], prob_tr[i, 1])
)
with open(OUTFILE + "_TEST_prob_ts.txt", "w") as fout:
fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
for i in range(len(sample_names_ts)):
fout.write("%s\t%f\t%f\n" % (sample_names_ts[i], prob_ts[i,0], prob_ts[i,1]))
fout.write(
"%s\t%f\t%f\n" % (sample_names_ts[i], prob_ts[i, 0], prob_ts[i, 1])
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment