Commit 0dda140d authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Merge branch 'master' of gitlab.fbk.eu:MPBA/inf_revamped

parents 68546ea4 8a5e66c1
......@@ -7,6 +7,7 @@ __pycache__/
# Jupyter Notebook
.ipynb_checkpoints
*ipynb
# VS code settings
.vscode
......
......@@ -8,25 +8,24 @@ import numpy as np
import pandas as pd
import csv
import os.path
from scaling import norm_l2
from mlpy import borda_count, canberra_stability
from input_output import load_data
import performance as perf
import sys
import tarfile
import glob
import argparse
import configparser as ConfigParser
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
__author__ = 'Marco Chierici, Alessandro Zandona'
__version__ = '2.0'
__date__ = '15 December 2016'
__author__ = 'Marco Chierici'
__version__ = '2.5'
__date__ = '28 Nov 2019'
class myArgumentParser(argparse.ArgumentParser):
def __init__(self, *args, **kwargs):
......@@ -51,7 +50,6 @@ parser.add_argument('--cv_k', type=np.int, default=5, help='Number of CV folds (
parser.add_argument('--cv_n', type=np.int, default=10, help='Number of CV cycles (default: %(default)s)')
parser.add_argument('--rankFeats', type=str, default='', help='Ranked features list to be used by Machine Learning [Feats name on 1st column, feats weights on 2nd column, with HEADER]')
parser.add_argument('--reliefk', type=np.int, default=3, help='Number of nearest neighbors for ReliefF (default: %(default)s)')
#parser.add_argument('--plot', action='store_true', help='Plot metric values over all training cycles' )
if len(sys.argv)==1:
parser.print_help()
......@@ -62,7 +60,6 @@ DATAFILE = args.DATAFILE
LABELSFILE = args.LABELSFILE
RANK_METHOD = args.RANK_METHOD
OUTDIR = args.OUTDIR
#plot_out = args.plot
random_labels = args.random
CV_K = args.cv_k
CV_N = args.cv_n
......@@ -70,8 +67,8 @@ RANKFEATS = args.rankFeats
relief_k = args.reliefk
BASEFILE = os.path.splitext(os.path.basename(DATAFILE))[0]
SVM_TYPE = 'RandomForest'
OUTFILE = os.path.join(OUTDIR, '_'.join([BASEFILE, SVM_TYPE, 'RF' if RANK_METHOD=='randomForest' else RANK_METHOD]))
MODEL_TYPE = 'RandomForest'
OUTFILE = os.path.join(OUTDIR, '_'.join([BASEFILE, MODEL_TYPE, 'RF' if RANK_METHOD=='randomForest' else RANK_METHOD]))
# create OUTDIR if not present
try:
......@@ -84,7 +81,7 @@ except OSError:
if RANK_METHOD == 'ReliefF':
from relief import ReliefF
# add ReliefF K to OUTFILE
OUTFILE = os.path.join(OUTDIR, '_'.join([BASEFILE, SVM_TYPE, RANK_METHOD + str(relief_k)]))
OUTFILE = os.path.join(OUTDIR, '_'.join([BASEFILE, MODEL_TYPE, RANK_METHOD + str(relief_k)]))
elif RANK_METHOD == 'tree' :
from sklearn.ensemble import ExtraTreesClassifier
elif RANK_METHOD == 'KBest':
......@@ -96,8 +93,11 @@ TUN_CV_K = 10
TUN_CV_P = 50
sample_names, var_names, x = load_data(DATAFILE)
y = np.loadtxt(LABELSFILE, dtype=np.int)
#print(var_names[:10])
y_orig = np.loadtxt(LABELSFILE, dtype=np.int)
# encode labels
le = preprocessing.LabelEncoder()
y = le.fit_transform(y_orig)
# If ranked list is given as input to DAP, read it and extract features index
if RANK_METHOD == "rankList":
rankedList = np.loadtxt(RANKFEATS, delimiter='\t', dtype=str, skiprows=1)
......@@ -167,7 +167,7 @@ for n in range(CV_N):
elif RANK_METHOD == 'randomForest' :
ranking_tmp = np.argsort(forest.feature_importances_)[::-1]
elif RANK_METHOD == 'KBest':
selector = SelectKBest(f_classif)
selector = SelectKBest(f_classif, k='all')
selector.fit(x_tr, y_tr)
ranking_tmp = np.argsort( -np.log10(selector.pvalues_) )[::-1]
elif RANK_METHOD == 'rankList':
......@@ -279,14 +279,3 @@ config.set("OUTPUT", "MCC", np.max(AMCC))
config.set("OUTPUT", "N_feats", opt_feats)
config.write(logf)
logf.close()
#if plot_out:
# from metrics_plot import *
# plt_title = (' ').join( [os.path.basename(DATAFILE).replace('.txt', ''), SVM_TYPE] )
# if random_labels:
# metplot(RLFile = (OUTFILE + "_metrics.txt"), title = plt_title)
# elif RANK_METHOD=='random':
# metplot(RRFile = (OUTFILE + "_metrics.txt"), title = plt_title)
# else:
# metplot(normFile = (OUTFILE + "_metrics.txt"), title = plt_title)
......@@ -11,6 +11,8 @@ import sys
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.preprocessing import LabelEncoder
import performance as perf
from extract_topfeats import extract_feats
......@@ -59,13 +61,18 @@ extract_feats(TRFILE, RANK, NFEATS, TR_TOPFEATS)
TS_TOPFEATS = OUTFILE + '_top%s_ts.txt' % NFEATS
extract_feats(TSFILE, RANK, NFEATS, TS_TOPFEATS)
# initialize LabelEncoder
le = LabelEncoder()
# load data
sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
y_tr = np.loadtxt(LABELSFILE, dtype=np.int, delimiter='\t')
y_tr = le.fit_transform(y_tr)
sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)
# load the TS labels if available
if TSLABELSFILE is not None:
y_ts = np.loadtxt(TSLABELSFILE, dtype=np.int, delimiter='\t')
y_ts = le.transform(y_ts)
# prediction
forest = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=0)
......@@ -74,26 +81,30 @@ forest.fit(x_tr, y_tr)
p_tr = forest.predict(x_tr)
p_ts = forest.predict(x_ts)
# decode labels back
p_tr_dec = le.inverse_transform(p_tr)
p_ts_dec = le.inverse_transform(p_ts)
prob_tr = forest.predict_proba(x_tr)
prob_ts = forest.predict_proba(x_ts)
print("MCC on train: %.3f" % (perf.KCCC_discrete(y_tr, p_tr)))
print("MCC on train: %.3f" % (matthews_corrcoef(y_tr, p_tr)))
if TSLABELSFILE is not None:
print("MCC on validation: %.3f" % (perf.KCCC_discrete(y_ts, p_ts)))
print("MCC on validation: %.3f" % (matthews_corrcoef(y_ts, p_ts)))
# write output files
# save MCC_train and MCC_validation
with open(OUTFILE + "_MCC_scores.txt", "w") as fout:
fout.write("MCC_train\t%.5f\n" % (perf.KCCC_discrete(y_tr, p_tr)))
fout.write("MCC_validation\t%.5f\n" % (perf.KCCC_discrete(y_ts, p_ts)))
fout.write("MCC_train\t%.5f\n" % (matthews_corrcoef(y_tr, p_tr)))
fout.write("MCC_validation\t%.5f\n" % (matthews_corrcoef(y_ts, p_ts)))
with open(OUTFILE + "_TEST_pred_tr.txt", "w") as fout:
for i in range(len(sample_names_tr)):
fout.write("%s\t%i\n" % (sample_names_tr[i], p_tr[i]))
fout.write("%s\t%i\n" % (sample_names_tr[i], p_tr_dec[i]))
with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
for i in range(len(sample_names_ts)):
fout.write("%s\t%i\n" % (sample_names_ts[i], p_ts[i]))
fout.write("%s\t%i\n" % (sample_names_ts[i], p_ts_dec[i]))
np.savetxt(OUTFILE + "_TEST_signature.txt",
np.array(var_names_tr).reshape(-1,1),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment