Commit fb350dc2 authored by Alessia Marcolini

Remove unused files

parent 87454b37
## This code is written by Marco Chierici <chierici@fbk.eu>, Alessandro Zandona' <zandona@fbk.eu>.
## Based on code previously written by Davide Albanese.
## Requires Python >= 3.6 (f-strings, configparser), mlpy >= 3.5
import argparse
import configparser as ConfigParser
import csv
import os.path
import sys
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import numpy as np
import pandas as pd
from mlpy import borda_count, canberra_stability
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
import performance as perf
from input_output import load_data
__author__ = 'Marco Chierici'
__version__ = '2.5'
__date__ = '28 Nov 2019'
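# argparse subclass: lets @file argument lists hold several arguments per line
# and treats everything after '#' on a line as a comment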
class myArgumentParser(argparse.ArgumentParser):
def __init__(self, *args, **kwargs):
super(myArgumentParser, self).__init__(*args, **kwargs)
def convert_arg_line_to_args(self, line):
for arg in line.split():
if not arg.strip():
continue
if arg[0] == '#':
break
yield arg
parser = myArgumentParser(description='Run a training experiment (10x 5-fold CV by default) using Random Forest as classifier.',
                          fromfile_prefix_chars='@')
parser.add_argument('DATAFILE', type=str, help='Training datafile')
parser.add_argument('LABELSFILE', type=str, help='Sample labels')
parser.add_argument('OUTDIR', type=str, help='Output directory')
parser.add_argument('--ranking', dest='RANK_METHOD', type=str, choices=['ReliefF', 'tree', 'randomForest', 'KBest', 'random', 'rankList'], default='randomForest', help='Feature ranking method: ReliefF, extra-trees (tree), random forest, ANOVA F-score (KBest), random ranking, or an external ranked feature list (rankList) (default: %(default)s)')
parser.add_argument('--random', action='store_true', help='Run with random sample labels')
parser.add_argument('--cv_k', type=int, default=5, help='Number of CV folds (default: %(default)s)')
parser.add_argument('--cv_n', type=int, default=10, help='Number of CV cycles (default: %(default)s)')
parser.add_argument('--rankFeats', type=str, default='', help='Ranked feature list used when --ranking rankList (feature names in 1st column, weights in 2nd column, with header)')
parser.add_argument('--reliefk', type=int, default=3, help='Number of nearest neighbors for ReliefF (default: %(default)s)')
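# example invocation (script name illustrative):
#   python sl_training.py data.txt labels.txt results/ --ranking ReliefF --reliefk 5
# arguments can also be supplied from a file: python sl_training.py @args.txt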
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
DATAFILE = args.DATAFILE
LABELSFILE = args.LABELSFILE
RANK_METHOD = args.RANK_METHOD
OUTDIR = args.OUTDIR
random_labels = args.random
CV_K = args.cv_k
CV_N = args.cv_n
RANKFEATS = args.rankFeats
relief_k = args.reliefk
BASEFILE = os.path.splitext(os.path.basename(DATAFILE))[0]
MODEL_TYPE = 'RandomForest'
OUTFILE = os.path.join(OUTDIR, '_'.join([BASEFILE, MODEL_TYPE, 'RF' if RANK_METHOD=='randomForest' else RANK_METHOD]))
# create OUTDIR if not present
try:
os.makedirs(OUTDIR)
except OSError:
if not os.path.isdir(OUTDIR):
raise
# load modules
if RANK_METHOD == 'ReliefF':
from relief import ReliefF
# add ReliefF K to OUTFILE
OUTFILE = os.path.join(OUTDIR, '_'.join([BASEFILE, MODEL_TYPE, RANK_METHOD + str(relief_k)]))
elif RANK_METHOD == 'tree':
from sklearn.ensemble import ExtraTreesClassifier
elif RANK_METHOD == 'KBest':
from sklearn.feature_selection import SelectKBest, f_classif
# Monte Carlo CV cycles and test-split percentage for classifier tuning
# (legacy parameters from an earlier SVM-based version; unused in this script)
TUN_CV_K = 10
TUN_CV_P = 50
sample_names, var_names, x = load_data(DATAFILE)
y_orig = pd.read_csv(LABELSFILE, header=None).values.ravel()  # 1D vector for LabelEncoder
# encode labels
le = preprocessing.LabelEncoder()
y = le.fit_transform(y_orig)
is_multiclass = len(le.classes_) > 2
# If ranked list is given as input to DAP, read it and extract features index
if RANK_METHOD == "rankList":
rankedList = np.loadtxt(RANKFEATS, delimiter='\t', dtype=str, skiprows=1)
ranked_feats = rankedList[:,0]
# Find index of features inside dataset
ranked_feats_idx = []
for el in ranked_feats.tolist():
ranked_feats_idx.append(np.where([el==vn for vn in var_names])[0][0])
# build FSTEPS according to dataset size
nfeat = x.shape[1]
feature_ranges = [5, 10, 25, 50, 75, 100]
FSTEPS = list()
for percentage in feature_ranges:
    k = int(np.ceil((nfeat * percentage) / 100))
FSTEPS.append(k)
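# e.g. with nfeat = 200, FSTEPS = [10, 20, 50, 100, 150, 200]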
# prepare output files
#rankingf = open(OUTFILE + "_featurelist.txt", 'w')
#ranking_w = csv.writer(rankingf, delimiter='\t', lineterminator='\n')
#ranking_w.writerow(["FEATURE_ID", "FEATURE_NAME", "MEAN_POS", "MEDIAN_ALL", "MEDIAN_0", "MEDIAN_1", "FOLD_CHANGE", "LOG2_FOLD_CHANGE"])
stabilityf = open(OUTFILE + "_stability.txt", 'w')
stability_w = csv.writer(stabilityf, delimiter='\t', lineterminator='\n')
# prepare output arrays
RANKING = np.empty((CV_K*CV_N, x.shape[1]), dtype=int)
ys=[]
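# one label vector per CV iteration; with --random the labels are shuffled once
# (seed 0) and the same permutation is reused in every iteration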
if random_labels:
np.random.seed(0)
tmp = y.copy()
np.random.shuffle(tmp)
for i in range(CV_N):
ys.append(tmp)
else:
for i in range(CV_N):
ys.append(y)
# per-fold metric records, collected in a list and converted to a DataFrame after
# the CV loops (DataFrame.append() was removed in pandas 2.0)
cv_rows = []
if is_multiclass:
    lb = preprocessing.LabelBinarizer()
for n in range(CV_N):
skf = StratifiedKFold(CV_K, shuffle=True, random_state=n)
for i, (idx_tr, idx_ts) in enumerate(skf.split(x, ys[n])):
x_tr, x_ts = x[idx_tr], x[idx_ts]
y_tr, y_ts = ys[n][idx_tr], ys[n][idx_ts]
forest = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=n, n_jobs=1)
forest.fit(x_tr, y_tr)
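        # this fit on all features provides feature_importances_ for the default
        # 'randomForest' ranking below; the model is refit on each feature subset later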
if RANK_METHOD == 'random':
ranking_tmp = np.arange(len(var_names))
np.random.seed((n*CV_K)+i)
np.random.shuffle(ranking_tmp)
elif RANK_METHOD == 'ReliefF':
relief = ReliefF(relief_k, seed=n)
relief.learn(x_tr, y_tr)
w = relief.w()
ranking_tmp = np.argsort(w)[::-1]
        elif RANK_METHOD == 'tree':
            forest = ExtraTreesClassifier(n_estimators=250, criterion='gini', random_state=n)
            forest.fit(x_tr, y_tr)
            ranking_tmp = np.argsort(forest.feature_importances_)[::-1]
        elif RANK_METHOD == 'randomForest':
            ranking_tmp = np.argsort(forest.feature_importances_)[::-1]
        elif RANK_METHOD == 'KBest':
            selector = SelectKBest(f_classif, k='all')
            selector.fit(x_tr, y_tr)
            ranking_tmp = np.argsort(-np.log10(selector.pvalues_))[::-1]
elif RANK_METHOD == 'rankList':
ranking_tmp = ranked_feats_idx
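        # row (n * CV_K) + i stores this fold's ranking: feature indices, best first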
RANKING[(n * CV_K) + i] = ranking_tmp
if is_multiclass:
forest_mc = OneVsRestClassifier(RandomForestClassifier(n_estimators=500, criterion='gini', random_state=n, n_jobs=1))
y_tr_bin = lb.fit_transform(y_tr)
y_ts_bin = lb.transform(y_ts)
for j, s in enumerate(FSTEPS):
v = RANKING[(n * CV_K) + i][:s]
x_tr_fs, x_ts_fs = x_tr[:, v], x_ts[:, v]
forest.fit(x_tr_fs, y_tr)
yp = forest.predict(x_ts_fs)
            if is_multiclass:
                forest_mc.fit(x_tr_fs, y_tr_bin)
                yp_bin = forest_mc.predict(x_ts_fs)
                row = {'Iteration': n, 'Fold': i, 'nf': s,
                       'mcc': matthews_corrcoef(y_ts, yp),
                       'auc': roc_auc_score(y_ts_bin, yp_bin, average="micro")}
            else:
                row = {'Iteration': n, 'Fold': i, 'nf': s,
                       'mcc': matthews_corrcoef(y_ts, yp),
                       'sens': perf.sensitivity(y_ts, yp),
                       'spec': perf.specificity(y_ts, yp),
                       'acc': accuracy_score(y_ts, yp),
                       'auc': roc_auc_score(y_ts, yp),
                       'npv': perf.npv(y_ts, yp),
                       'ppv': perf.ppv(y_ts, yp)}
            cv_rows.append(row)
cvmetrics_df = pd.DataFrame(cv_rows)
cvmetrics_df[['Iteration', 'Fold', 'nf']] = cvmetrics_df[['Iteration', 'Fold', 'nf']].astype(int)
cvmetrics_df.to_csv(f"{OUTFILE}_allmetrics.txt", sep='\t', index=False, float_format="%.3f")
# write all rankings
np.savetxt(OUTFILE + "_ranking.csv.gz", RANKING, fmt='%d', delimiter='\t')
# average values
avg_df = cvmetrics_df.groupby(['nf']).mean()
# confidence intervals
if is_multiclass:
AMCC, AAUC = [avg_df[metric].values for metric in ['mcc', 'auc']]
MCC_CI, AUC_CI = ([] for i in range(2))
else:
AMCC, ASENS, ASPEC, AACC, AAUC, ANPV, APPV = [avg_df[metric].values for metric in ['mcc', 'sens', 'spec', 'acc', 'auc', 'npv', 'ppv']]
MCC_CI, SENS_CI, SPEC_CI, ACC_CI, AUC_CI, NPV_CI, PPV_CI = ([] for i in range(7))
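# per feature-set size, bootstrap a confidence interval (95% by default in the
# bootstrapped package) for the mean of each metric over the CV_N x CV_K fold values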
for nf in FSTEPS:
res = bs.bootstrap(cvmetrics_df['mcc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
MCC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['auc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
AUC_CI.append([res.lower_bound, res.upper_bound])
if not is_multiclass:
res = bs.bootstrap(cvmetrics_df['sens'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
SENS_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['spec'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
SPEC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['acc'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
ACC_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['npv'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
NPV_CI.append([res.lower_bound, res.upper_bound])
res = bs.bootstrap(cvmetrics_df['ppv'][cvmetrics_df['nf']==nf].values, stat_func=bs_stats.mean)
PPV_CI.append([res.lower_bound, res.upper_bound])
# Borda list
BORDA_ID, _, BORDA_POS = borda_count(RANKING)
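# BORDA_ID: feature indices ordered by Borda consensus over all CV rankings;
# BORDA_POS: their mean positions (0-based) across rankings, reported below as MEAN_POS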
# optimal number of features (yielding max MCC)
opt_feats = FSTEPS[np.argmax(AMCC)]
# Canberra stability indicator
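# RANKING rows list feature indices best-first; argsort turns each row into
# per-feature positions, which canberra_stability compares over the top-ss entries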
STABILITY = []
PR = np.argsort(RANKING)
for ss in FSTEPS:
    STABILITY.append(canberra_stability(PR, ss))
n_cols = 7 if is_multiclass else 22
metrics_array = np.empty((len(FSTEPS), n_cols))
for j,s in enumerate(FSTEPS):
if is_multiclass:
metrics_array[j] = np.array([s, AMCC[j], MCC_CI[j][0], MCC_CI[j][1],
AAUC[j], AUC_CI[j][0], AUC_CI[j][1]])
else:
metrics_array[j] = np.array([s, AMCC[j], MCC_CI[j][0], MCC_CI[j][1], ASENS[j], SENS_CI[j][0], SENS_CI[j][1],
ASPEC[j], SPEC_CI[j][0], SPEC_CI[j][1], AACC[j], ACC_CI[j][0], ACC_CI[j][1],
AAUC[j], AUC_CI[j][0], AUC_CI[j][1], ANPV[j], NPV_CI[j][0], NPV_CI[j][1],
APPV[j], PPV_CI[j][0], PPV_CI[j][1]])
if is_multiclass:
metrics_df = pd.DataFrame(metrics_array, columns=["nf", "mcc", "mcc_min", "mcc_max", "auc", "auc_min", "auc_max"])
else:
metrics_df = pd.DataFrame(metrics_array, columns=["nf", "mcc", "mcc_min", "mcc_max", "sens", "sens_min", "sens_max", "spec", "spec_min", "spec_max", "acc", "acc_min", "acc_max", "auc", "auc_min", "auc_max", "npv", "npv_min", "npv_max", "ppv", "ppv_min", "ppv_max"])
metrics_df['nf'] = metrics_df['nf'].astype(int)
metrics_df.to_csv(f"{OUTFILE}_metrics.txt", sep='\t', index=False, float_format="%.3f")
stability_w.writerow(["STEP", "STABILITY"])
for j, s in enumerate(FSTEPS):
stability_w.writerow( [s, STABILITY[j]] )
stabilityf.close()
# assemble the Borda consensus table directly (DataFrame.append() was removed in pandas 2.0)
borda_df = pd.DataFrame({'FEATURE_ID': BORDA_ID,
                         'FEATURE_NAME': [var_names[i] for i in BORDA_ID],
                         'MEAN_POS': np.asarray(BORDA_POS) + 1})
borda_df.to_csv(f"{OUTFILE}_featurelist.txt", sep='\t', index=False, float_format="%.3f")
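# the .log file below doubles as the configuration file that the companion
# validation script (second file in this commit) reads as its CONFIGFILE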
logf = open(OUTFILE + ".log", 'w')
config = ConfigParser.RawConfigParser()
config.add_section("SOFTWARE VERSIONS")
config.set("SOFTWARE VERSIONS", os.path.basename(__file__), __version__)
config.set("SOFTWARE VERSIONS", "Python", sys.version.replace('\n', ''))
config.set("SOFTWARE VERSIONS", "Numpy", np.__version__)
config.add_section("CV PARAMETERS")
config.set("CV PARAMETERS", "Folds", CV_K)
config.set("CV PARAMETERS", "Iterations", CV_N)
config.add_section("INPUT")
config.set("INPUT", "Data", os.path.realpath( DATAFILE ))
config.set("INPUT", "Labels", os.path.realpath( LABELSFILE ))
config.set("INPUT", "Classifier", "RandomForest")
config.set("INPUT", "n_estimators", 500)
config.set("INPUT", "Rank_method", RANK_METHOD)
config.set("INPUT", "Random_labels", random_labels)
config.add_section("OUTPUT")
config.set("OUTPUT", "Metrics", os.path.realpath( OUTFILE + "_metrics.txt" ))
config.set("OUTPUT", "Borda", os.path.realpath( OUTFILE + "_featurelist.txt" ))
config.set("OUTPUT", "Internal", os.path.realpath( OUTFILE + "_internal.txt" ))
config.set("OUTPUT", "Stability", os.path.realpath( OUTFILE + "_stability.txt" ))
config.set("OUTPUT", "MCC", np.max(AMCC))
config.set("OUTPUT", "N_feats", opt_feats)
config.write(logf)
logf.close()
## This code is written by Marco Chierici <chierici@fbk.eu>, Alessandro Zandona' <zandona@fbk.eu>.
## Based on code previously written by Davide Albanese.
## Requires Python >= 3, mlpy >= 3.5
import argparse
import configparser as ConfigParser
import os.path
import sys
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import LabelEncoder
from extract_topfeats import extract_feats
from input_output import load_data
parser = argparse.ArgumentParser(description='Run a validation experiment using Random Forest as classifier.')
parser.add_argument('CONFIGFILE', type=str, help='Training experiment configuration file')
parser.add_argument('TSFILE', type=str, help='Validation datafile')
parser.add_argument('OUTDIR', type=str, help='Output directory')
parser.add_argument('--tslab', type=str, default=None, help='Validation labels, if available')
parser.add_argument('--nf', type=int, default=None, help='Custom number of top features')
__author__ = 'Marco Chierici, Alessandro Zandona'
__date__ = '15 December 2016'
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
CONFIGFILE = args.CONFIGFILE
TSFILE = args.TSFILE
OUTDIR = args.OUTDIR
TSLABELSFILE = args.tslab
NFEATS = args.nf
# create OUTDIR if not present (mirrors the training script)
os.makedirs(OUTDIR, exist_ok=True)
config = ConfigParser.RawConfigParser()
config.read(CONFIGFILE)
if not config.has_section('INPUT'):
print("%s is not a valid configuration file." % CONFIGFILE)
sys.exit(3)
TRFILE = config.get("INPUT", "Data")
LABELSFILE = config.get("INPUT", "Labels")
RANK = config.get("OUTPUT", "Borda")
if NFEATS is None:
NFEATS = config.getint("OUTPUT", "N_feats")
BASEFILE = os.path.splitext(TRFILE)[0]
OUTFILE = os.path.join(OUTDIR, os.path.basename(BASEFILE))
# extract the top-ranked NFEATS features from TRAINING set
TR_TOPFEATS = OUTFILE + '_top%s_tr.txt' % NFEATS
extract_feats(TRFILE, RANK, NFEATS, TR_TOPFEATS)
# extract the top-ranked NFEATS features from VALIDATION set
TS_TOPFEATS = OUTFILE + '_top%s_ts.txt' % NFEATS
extract_feats(TSFILE, RANK, NFEATS, TS_TOPFEATS)
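# extract_feats writes the reduced data matrices (top NFEATS features only),
# which are loaded back below for training and prediction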
# initialize LabelEncoder
le = LabelEncoder()
# load data
sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
y_tr = pd.read_csv(LABELSFILE, sep='\t', header=None).values.ravel()  # 1D for LabelEncoder
y_tr = le.fit_transform(y_tr)
sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)
# load the TS labels if available
if TSLABELSFILE is not None:
    y_ts = pd.read_csv(TSLABELSFILE, sep='\t', header=None).values.ravel()
    y_ts = le.transform(y_ts)
# prediction
forest = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=0)
forest.fit(x_tr, y_tr)
p_tr = forest.predict(x_tr)
p_ts = forest.predict(x_ts)
# decode labels back
p_tr_dec = le.inverse_transform(p_tr)
p_ts_dec = le.inverse_transform(p_ts)
prob_tr = forest.predict_proba(x_tr)
prob_ts = forest.predict_proba(x_ts)
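# predict_proba columns follow forest.classes_, i.e. the LabelEncoder's encoded
# classes; the output files below assume the binary case (class 0, class 1)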
print("MCC on train: %.3f" % (matthews_corrcoef(y_tr, p_tr)))
if TSLABELSFILE is not None:
print("MCC on validation: %.3f" % (matthews_corrcoef(y_ts, p_ts)))
# write output files
# save MCC_train and MCC_validation
with open(OUTFILE + "_MCC_scores.txt", "w") as fout:
fout.write("MCC_train\t%.5f\n" % (matthews_corrcoef(y_tr, p_tr)))
fout.write("MCC_validation\t%.5f\n" % (matthews_corrcoef(y_ts, p_ts)))
with open(OUTFILE + "_TEST_pred_tr.txt", "w") as fout:
for i in range(len(sample_names_tr)):
fout.write("%s\t%s\n" % (sample_names_tr[i], p_tr_dec[i]))
with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
for i in range(len(sample_names_ts)):
fout.write("%s\t%s\n" % (sample_names_ts[i], p_ts_dec[i]))
np.savetxt(OUTFILE + "_TEST_signature.txt",
np.array(var_names_tr).reshape(-1,1),
fmt='%s', delimiter='\t')
with open(OUTFILE + "_TEST_prob_tr.txt", "w") as fout:
fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
for i in range(len(sample_names_tr)):
fout.write("%s\t%f\t%f\n" % (sample_names_tr[i], prob_tr[i,0], prob_tr[i,1]))
with open(OUTFILE + "_TEST_prob_ts.txt", "w") as fout:
fout.write("SAMPLE\tCLASS 0\tCLASS 1\n")
for i in range(len(sample_names_ts)):
fout.write("%s\t%f\t%f\n" % (sample_names_ts[i], prob_ts[i,0], prob_ts[i,1]))