Commit a128b05f authored by Marco Chierici
Browse files

Added LabelEncoder support

parent 13d5f927
......@@ -11,6 +11,8 @@ import sys
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.preprocessing import LabelEncoder
import performance as perf
from extract_topfeats import extract_feats
......@@ -59,13 +61,18 @@ extract_feats(TRFILE, RANK, NFEATS, TR_TOPFEATS)
TS_TOPFEATS = OUTFILE + '_top%s_ts.txt' % NFEATS
extract_feats(TSFILE, RANK, NFEATS, TS_TOPFEATS)
# initialize LabelEncoder
le = LabelEncoder()
# load data
sample_names_tr, var_names_tr, x_tr = load_data(TR_TOPFEATS)
y_tr = np.loadtxt(LABELSFILE, dtype=np.int, delimiter='\t')
y_tr = le.fit_transform(y_tr)
sample_names_ts, var_names_ts, x_ts = load_data(TS_TOPFEATS)
# load the TS labels if available
if TSLABELSFILE is not None:
y_ts = np.loadtxt(TSLABELSFILE, dtype=np.int, delimiter='\t')
y_ts = le.transform(y_ts)
# prediction
forest = RandomForestClassifier(n_estimators=500, criterion='gini', random_state=0)
......@@ -74,26 +81,30 @@ forest.fit(x_tr, y_tr)
p_tr = forest.predict(x_tr)
p_ts = forest.predict(x_ts)
# decode labels back
p_tr_dec = le.inverse_transform(p_tr)
p_ts_dec = le.inverse_transform(p_ts)
prob_tr = forest.predict_proba(x_tr)
prob_ts = forest.predict_proba(x_ts)
print("MCC on train: %.3f" % (perf.KCCC_discrete(y_tr, p_tr)))
print("MCC on train: %.3f" % (matthews_corrcoef(y_tr, p_tr)))
if TSLABELSFILE is not None:
print("MCC on validation: %.3f" % (perf.KCCC_discrete(y_ts, p_ts)))
print("MCC on validation: %.3f" % (matthews_corrcoef(y_ts, p_ts)))
# write output files
# save MCC_train and MCC_validation
with open(OUTFILE + "_MCC_scores.txt", "w") as fout:
fout.write("MCC_train\t%.5f\n" % (perf.KCCC_discrete(y_tr, p_tr)))
fout.write("MCC_validation\t%.5f\n" % (perf.KCCC_discrete(y_ts, p_ts)))
fout.write("MCC_train\t%.5f\n" % (matthews_corrcoef(y_tr, p_tr)))
fout.write("MCC_validation\t%.5f\n" % (matthews_corrcoef(y_ts, p_ts)))
with open(OUTFILE + "_TEST_pred_tr.txt", "w") as fout:
for i in range(len(sample_names_tr)):
fout.write("%s\t%i\n" % (sample_names_tr[i], p_tr[i]))
fout.write("%s\t%i\n" % (sample_names_tr[i], p_tr_dec[i]))
with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
for i in range(len(sample_names_ts)):
fout.write("%s\t%i\n" % (sample_names_ts[i], p_ts[i]))
fout.write("%s\t%i\n" % (sample_names_ts[i], p_ts_dec[i]))
np.savetxt(OUTFILE + "_TEST_signature.txt",
np.array(var_names_tr).reshape(-1,1),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment