Commit 7a219819 authored by Alessia Marcolini

Fix missing import and black formatting

parent 7cb397cb
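The commit message attributes the layout changes to black. A minimal sketch, assuming black's Python API with a 79-character line length and string normalization disabled (both inferred from the diff, not recorded in the commit), of how the call-wrapping style in the hunks below is produced:

import black

# the long single-line call from the validation script, as it looked before the commit
src = (
    "model = GridSearchCV(tuning_pipeline, param_grid=TUN_PARAMS_LSVM, "
    "cv=tuncv, scoring=scorer)\n"
)

# black splits the call across lines once it exceeds the configured line length
print(black.format_str(src, mode=black.FileMode(line_length=79, string_normalization=False)))
# model = GridSearchCV(
#     tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer
# )

The regrouped imports follow isort-style ordering (merged sklearn imports, parenthesised wrapping), which black itself does not do.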
@@ -14,12 +14,12 @@ import bootstrapped.stats_functions as bs_stats
 import numpy as np
 import pandas as pd
 from mlpy import borda_count, canberra_stability
-from sklearn import preprocessing
+from sklearn import preprocessing, svm
 from sklearn.ensemble import RandomForestClassifier
-from sklearn import svm
-from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef, roc_auc_score
-from sklearn.model_selection import (StratifiedKFold, StratifiedShuffleSplit,
-                                     train_test_split, GridSearchCV)
+from sklearn.metrics import (accuracy_score, make_scorer, matthews_corrcoef,
+                             roc_auc_score)
+from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
+                                     StratifiedShuffleSplit, train_test_split)
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.pipeline import Pipeline
@@ -326,4 +326,4 @@ config.set("OUTPUT", "Stability", os.path.realpath( OUTFILE + "_stability.txt" )
 config.set("OUTPUT", "MCC", np.max(AMCC))
 config.set("OUTPUT", "N_feats", opt_feats)
 config.write(logf)
-logf.close()
\ No newline at end of file
+logf.close()
(second file changed by this commit)

@@ -11,28 +11,34 @@ import sys
 import numpy as np
 import pandas as pd
-from sklearn.ensemble import RandomForestClassifier
-from sklearn import svm
-from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef, roc_auc_score
-from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
-from sklearn import preprocessing
+from sklearn import preprocessing, svm
+from sklearn.metrics import (accuracy_score, make_scorer, matthews_corrcoef,
+                             roc_auc_score)
+from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
 from sklearn.pipeline import Pipeline
+import performance as perf
 from extract_topfeats import extract_feats
 from input_output import load_data
 
-parser = argparse.ArgumentParser(description='Run a validation experiment using LibLinear.')
-parser.add_argument('CONFIGFILE', type=str, help='Training experiment configuration file')
+parser = argparse.ArgumentParser(
+    description='Run a validation experiment using LibLinear.'
+)
+parser.add_argument(
+    'CONFIGFILE', type=str, help='Training experiment configuration file'
+)
 parser.add_argument('TSFILE', type=str, help='Validation datafile')
 parser.add_argument('OUTDIR', type=str, help='Output directory')
-parser.add_argument('--tslab', type=str, default=None, help='Validation labels, if available')
-parser.add_argument('--nf', type=int, default=None, help='Custom number of top features')
+parser.add_argument(
+    '--tslab', type=str, default=None, help='Validation labels, if available'
+)
+parser.add_argument(
+    '--nf', type=int, default=None, help='Custom number of top features'
+)
 
 __author__ = 'Marco Chierici, Alessandro Zandona'
 __date__ = '15 December 2016'
 
-if len(sys.argv)==1:
+if len(sys.argv) == 1:
     parser.print_help()
     sys.exit(1)
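For reference, a self-contained sketch of how the validation script's command line (as declared in the hunk above) is exercised; the argument definitions mirror the diff, while the file names passed to parse_args are hypothetical:

import argparse

parser = argparse.ArgumentParser(
    description='Run a validation experiment using LibLinear.'
)
parser.add_argument(
    'CONFIGFILE', type=str, help='Training experiment configuration file'
)
parser.add_argument('TSFILE', type=str, help='Validation datafile')
parser.add_argument('OUTDIR', type=str, help='Output directory')
parser.add_argument(
    '--tslab', type=str, default=None, help='Validation labels, if available'
)
parser.add_argument(
    '--nf', type=int, default=None, help='Custom number of top features'
)

# three required positionals; --tslab and --nf are optional (hypothetical paths)
args = parser.parse_args(
    ['training.cfg', 'ts_data.txt', 'results', '--tslab', 'ts_labels.txt', '--nf', '10']
)
print(args.CONFIGFILE, args.tslab, args.nf)  # training.cfg ts_labels.txt 10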
@@ -48,7 +54,7 @@ TUN_CV_K = 10
 # fraction of the dataset to keep apart as test split (for SVM tuning)
 TUN_CV_P = 0.5
 # list of parameters for LSVM tuning
-TUN_PARAMS_LSVM = [{'svm__C': [10**int(k) for k in np.arange(-2, 3)]}]
+TUN_PARAMS_LSVM = [{'svm__C': [10 ** int(k) for k in np.arange(-2, 3)]}]
 
 config = ConfigParser.RawConfigParser()
 config.read(CONFIGFILE)
@@ -80,12 +86,14 @@ le = preprocessing.LabelEncoder()
 sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
 y_tr = pd.read_csv(LABELSFILE, sep='\t', header=None).values
 y_tr = le.fit_transform(y_tr)
+print(y_tr.shape)
 
 sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)
 # load the TS labels if available
 if TSLABELSFILE is not None:
-    y_ts = pd.read_csv(TSLABELSFILE, sep='\t', header=None).values
+    y_ts = pd.read_csv(TSLABELSFILE, sep='\t', header=None, dtype=str).values
     y_ts = le.transform(y_ts)
 
 # define classifier
 if MODEL_TYPE == "randomForest":
     model = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=0)
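Besides the added shape print, the functional change in this hunk is the dtype=str passed when reading the validation labels, presumably to keep their dtype consistent with whatever the LabelEncoder was fitted on. A minimal sketch of the difference, using a made-up three-line labels file:

import io

import pandas as pd

labels_txt = "1\n0\n1\n"  # hypothetical single-column labels file

# without dtype=str, pandas infers an integer column...
inferred = pd.read_csv(io.StringIO(labels_txt), sep='\t', header=None).values
print(inferred.dtype)  # int64

# ...with dtype=str, the raw string labels are preserved
as_str = pd.read_csv(io.StringIO(labels_txt), sep='\t', header=None, dtype=str).values
print(as_str.dtype)  # object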
@@ -94,15 +102,19 @@ elif MODEL_TYPE == "LSVM":
     if SCALING == 'std':
         scaler = preprocessing.StandardScaler()
     elif SCALING == 'minmax':
-        scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
+        scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
     scorer = make_scorer(matthews_corrcoef)
     # tuning Pipeline
     tun_classif = svm.SVC(class_weight="balanced", kernel="linear")
     pipeline_steps = [('scaler', scaler), ('svm', tun_classif)]
     tuning_pipeline = Pipeline(pipeline_steps)
     # LSVM tuning
-    tuncv = StratifiedShuffleSplit(n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=0)
-    model = GridSearchCV(tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer)
+    tuncv = StratifiedShuffleSplit(
+        n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=0
+    )
+    model = GridSearchCV(
+        tuning_pipeline, param_grid=TUN_PARAMS_LSVM, cv=tuncv, scoring=scorer
+    )
 
 # train
 model.fit(x_tr, y_tr)
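Putting the reformatted LSVM branch back together, a self-contained sketch run on synthetic data; the estimator, scorer, grid and split parameters mirror the diff, while the data, shapes and random labels are made up:

import numpy as np
from sklearn import preprocessing, svm
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
x_tr = rng.randn(60, 10)             # hypothetical training matrix
y_tr = rng.randint(0, 2, size=60)    # hypothetical binary labels

TUN_CV_K = 10   # number of shuffle-split repetitions
TUN_CV_P = 0.5  # fraction held out in each split
# the comprehension expands to C in [0.01, 0.1, 1, 10, 100]
TUN_PARAMS_LSVM = [{'svm__C': [10 ** int(k) for k in np.arange(-2, 3)]}]

scaler = preprocessing.StandardScaler()
tun_classif = svm.SVC(class_weight="balanced", kernel="linear")
tuning_pipeline = Pipeline([('scaler', scaler), ('svm', tun_classif)])

tuncv = StratifiedShuffleSplit(
    n_splits=TUN_CV_K, test_size=TUN_CV_P, random_state=0
)
model = GridSearchCV(
    tuning_pipeline,
    param_grid=TUN_PARAMS_LSVM,
    cv=tuncv,
    scoring=make_scorer(matthews_corrcoef),
)
model.fit(x_tr, y_tr)
print(model.best_params_)  # best C found on the synthetic data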
@@ -138,17 +150,24 @@ with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
     for i in range(len(sample_names_ts)):
         fout.write("%s\t%s\n" % (sample_names_ts[i], p_ts_dec[i]))
 
-np.savetxt(OUTFILE + "_TEST_signature.txt",
-           np.array(var_names_tr).reshape(-1,1),
-           fmt='%s', delimiter='\t')
+np.savetxt(
+    OUTFILE + "_TEST_signature.txt",
+    np.array(var_names_tr).reshape(-1, 1),
+    fmt='%s',
+    delimiter='\t',
+)
 
 if MODEL_TYPE == "randomForest":
     with open(OUTFILE + "_TEST_prob_tr.txt", "w") as fout:
         fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
         for i in range(len(sample_names_tr)):
-            fout.write("%s\t%f\t%f\n" % (sample_names_tr[i], prob_tr[i,0], prob_tr[i,1]))
+            fout.write(
+                "%s\t%f\t%f\n" % (sample_names_tr[i], prob_tr[i, 0], prob_tr[i, 1])
+            )
 
     with open(OUTFILE + "_TEST_prob_ts.txt", "w") as fout:
         fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
         for i in range(len(sample_names_ts)):
-            fout.write("%s\t%f\t%f\n" % (sample_names_ts[i], prob_ts[i,0], prob_ts[i,1]))
+            fout.write(
+                "%s\t%f\t%f\n" % (sample_names_ts[i], prob_ts[i, 0], prob_ts[i, 1])
+            )
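For clarity, a tiny sketch of the tab-separated layout those probability files end up with; the sample names and probabilities are invented, and the table is written to stdout here instead of OUTFILE + "_TEST_prob_ts.txt":

import sys

sample_names_ts = ["Sample_A", "Sample_B"]   # hypothetical sample names
prob_ts = [[0.91, 0.09], [0.23, 0.77]]       # hypothetical class probabilities

fout = sys.stdout
fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
for i in range(len(sample_names_ts)):
    fout.write("%s\t%f\t%f\n" % (sample_names_ts[i], prob_ts[i][0], prob_ts[i][1]))
# SAMPLE	CLASS 0	CLASS 1
# Sample_A	0.910000	0.090000
# Sample_B	0.230000	0.770000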