Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
MPBA
INF
Commits
d4dc5aa8
Commit
d4dc5aa8
authored
May 13, 2020
by
Marco Chierici
Browse files
Add precision, recall; Black formatting
parent
add8a9fa
Changes
1
Hide whitespace changes
Inline
Side-by-side
sklearn_validation.py
View file @
d4dc5aa8
...
...
@@ -13,53 +13,59 @@ import numpy as np
import
pandas
as
pd
from
sklearn
import
preprocessing
,
svm
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn.metrics
import
(
accuracy_score
,
make_scorer
,
matthews_corrcoef
,
roc_auc_score
)
from
sklearn.metrics
import
(
accuracy_score
,
make_scorer
,
matthews_corrcoef
,
roc_auc_score
,
)
from
sklearn.model_selection
import
GridSearchCV
,
StratifiedShuffleSplit
from
sklearn.pipeline
import
Pipeline
import
performance
as
perf
from
extract_topfeats
import
extract_feats
from
input_output
import
load_data
parser
=
argparse
.
ArgumentParser
(
description
=
'
Run a validation experiment using LibLinear.
'
description
=
"
Run a validation experiment using LibLinear.
"
)
parser
.
add_argument
(
'
CONFIGFILE
'
,
type
=
str
,
help
=
'
Training experiment configuration file
'
"
CONFIGFILE
"
,
type
=
str
,
help
=
"
Training experiment configuration file
"
)
parser
.
add_argument
(
'
TSFILE
'
,
type
=
str
,
help
=
'
Validation datafile
'
)
parser
.
add_argument
(
'
OUTDIR
'
,
type
=
str
,
help
=
'
Output directory
'
)
parser
.
add_argument
(
"
TSFILE
"
,
type
=
str
,
help
=
"
Validation datafile
"
)
parser
.
add_argument
(
"
OUTDIR
"
,
type
=
str
,
help
=
"
Output directory
"
)
parser
.
add_argument
(
'
--tslab
'
,
type
=
str
,
default
=
None
,
help
=
'
Validation labels, if available
'
"
--tslab
"
,
type
=
str
,
default
=
None
,
help
=
"
Validation labels, if available
"
)
parser
.
add_argument
(
'
--nf
'
,
type
=
int
,
default
=
None
,
help
=
'
Custom number of top features
'
"
--nf
"
,
type
=
int
,
default
=
None
,
help
=
"
Custom number of top features
"
)
__author__
=
'
Marco Chierici, Alessandro Zandona
'
__date__
=
'
15 December 2016
'
__author__
=
"
Marco Chierici, Alessandro Zandona
"
__date__
=
"
15 December 2016
"
if
len
(
sys
.
argv
)
==
1
:
parser
.
print_help
()
sys
.
exit
(
1
)
args
=
parser
.
parse_args
()
CONFIGFILE
=
vars
(
args
)[
'
CONFIGFILE
'
]
TSFILE
=
vars
(
args
)[
'
TSFILE
'
]
OUTDIR
=
vars
(
args
)[
'
OUTDIR
'
]
TSLABELSFILE
=
vars
(
args
)[
'
tslab
'
]
NFEATS
=
vars
(
args
)[
'
nf
'
]
CONFIGFILE
=
vars
(
args
)[
"
CONFIGFILE
"
]
TSFILE
=
vars
(
args
)[
"
TSFILE
"
]
OUTDIR
=
vars
(
args
)[
"
OUTDIR
"
]
TSLABELSFILE
=
vars
(
args
)[
"
tslab
"
]
NFEATS
=
vars
(
args
)[
"
nf
"
]
# number of Montecarlo CV cycles (for SVM tuning)
TUN_CV_K
=
10
# fraction of the dataset to keep apart as test split (for SVM tuning)
TUN_CV_P
=
0.5
# list of parameters for LSVM tuning
TUN_PARAMS_LSVM
=
[{
'
svm__C
'
:
[
10
**
int
(
k
)
for
k
in
np
.
arange
(
-
2
,
3
)]}]
TUN_PARAMS_LSVM
=
[{
"
svm__C
"
:
[
10
**
int
(
k
)
for
k
in
np
.
arange
(
-
2
,
3
)]}]
config
=
ConfigParser
.
RawConfigParser
()
config
.
read
(
CONFIGFILE
)
if
not
config
.
has_section
(
'
INPUT
'
):
if
not
config
.
has_section
(
"
INPUT
"
):
print
(
"%s is not a valid configuration file."
%
CONFIGFILE
)
sys
.
exit
(
3
)
...
...
@@ -74,10 +80,10 @@ BASEFILE = os.path.splitext(TRFILE)[0]
OUTFILE
=
os
.
path
.
join
(
OUTDIR
,
os
.
path
.
basename
(
BASEFILE
))
# extract the top-ranked NFEATS features from TRAINING set
TR_TOPFEATS
=
OUTFILE
+
'
_top%s_tr.txt
'
%
NFEATS
TR_TOPFEATS
=
OUTFILE
+
"
_top%s_tr.txt
"
%
NFEATS
extract_feats
(
TRFILE
,
RANK
,
NFEATS
,
TR_TOPFEATS
)
# extract the top-ranked NFEATS features from VALIDATION set
TS_TOPFEATS
=
OUTFILE
+
'
_top%s_ts.txt
'
%
NFEATS
TS_TOPFEATS
=
OUTFILE
+
"
_top%s_ts.txt
"
%
NFEATS
extract_feats
(
TSFILE
,
RANK
,
NFEATS
,
TS_TOPFEATS
)
# initialize LabelEncoder
...
...
@@ -85,7 +91,7 @@ le = preprocessing.LabelEncoder()
# load data
sample_names_tr
,
var_names_tr
,
x_tr
=
load_data
(
TR_TOPFEATS
)
y_tr
=
pd
.
read_csv
(
LABELSFILE
,
sep
=
'
\t
'
,
header
=
None
,
dtype
=
str
).
values
y_tr
=
pd
.
read_csv
(
LABELSFILE
,
sep
=
"
\t
"
,
header
=
None
,
dtype
=
str
).
values
y_tr
=
le
.
fit_transform
(
y_tr
)
sample_names_ts
,
var_names_ts
,
x_ts
=
load_data
(
TS_TOPFEATS
)
# load the TS labels if available
...
...
@@ -93,20 +99,21 @@ if TSLABELSFILE is not None:
y_ts
=
pd
.
read_csv
(
TSLABELSFILE
,
header
=
None
,
dtype
=
str
).
values
y_ts
=
le
.
transform
(
y_ts
)
is_multiclass
=
len
(
le
.
classes_
)
>
2
# define classifier
if
MODEL_TYPE
==
"randomForest"
:
model
=
RandomForestClassifier
(
n_estimators
=
500
,
criterion
=
'
gini
'
,
random_state
=
0
)
model
=
RandomForestClassifier
(
n_estimators
=
500
,
criterion
=
"
gini
"
,
random_state
=
0
)
elif
MODEL_TYPE
==
"LSVM"
:
SCALING
=
config
.
get
(
"INPUT"
,
"Scaling"
)
if
SCALING
==
'
std
'
:
if
SCALING
==
"
std
"
:
scaler
=
preprocessing
.
StandardScaler
()
elif
SCALING
==
'
minmax
'
:
elif
SCALING
==
"
minmax
"
:
scaler
=
preprocessing
.
MinMaxScaler
(
feature_range
=
(
-
1
,
1
))
scorer
=
make_scorer
(
matthews_corrcoef
)
# tuning Pipeline
tun_classif
=
svm
.
SVC
(
class_weight
=
"balanced"
,
kernel
=
"linear"
)
pipeline_steps
=
[(
'
scaler
'
,
scaler
),
(
'
svm
'
,
tun_classif
)]
pipeline_steps
=
[(
"
scaler
"
,
scaler
),
(
"
svm
"
,
tun_classif
)]
tuning_pipeline
=
Pipeline
(
pipeline_steps
)
# LSVM tuning
tuncv
=
StratifiedShuffleSplit
(
...
...
@@ -141,6 +148,22 @@ if TSLABELSFILE is not None:
with
open
(
OUTFILE
+
"_MCC_scores.txt"
,
"w"
)
as
fout
:
fout
.
write
(
"MCC_train
\t
%.5f
\n
"
%
(
matthews_corrcoef
(
y_tr
,
p_tr
)))
fout
.
write
(
"MCC_validation
\t
%.5f
\n
"
%
(
matthews_corrcoef
(
y_ts
,
p_ts
)))
# additional metrics
if
not
is_multiclass
:
# sensitivity, or recall
fout
.
write
(
"SENS_train
\t
%.5f
\n
"
%
(
perf
.
sensitivity
(
y_tr
,
p_tr
)))
fout
.
write
(
"SENS_validation
\t
%.5f
\n
"
%
(
perf
.
sensitivity
(
y_ts
,
p_ts
)))
# positive predictive value, or precision
fout
.
write
(
"PPV_train
\t
%.5f
\n
"
%
(
perf
.
ppv
(
y_tr
,
p_tr
)))
fout
.
write
(
"PPV_validation
\t
%.5f
\n
"
%
(
perf
.
ppv
(
y_ts
,
p_ts
)))
else
:
# sensitivity, or recall
fout
.
write
(
"SENS_train
\t
%.5f
\n
"
%
(
np
.
nan
))
fout
.
write
(
"SENS_validation
\t
%.5f
\n
"
%
(
np
.
nan
))
# positive predictive value, or precision
fout
.
write
(
"PPV_train
\t
%.5f
\n
"
%
(
np
.
nan
))
fout
.
write
(
"PPV_validation
\t
%.5f
\n
"
%
(
np
.
nan
))
with
open
(
OUTFILE
+
"_TEST_pred_tr.txt"
,
"w"
)
as
fout
:
for
i
in
range
(
len
(
sample_names_tr
)):
...
...
@@ -153,8 +176,8 @@ with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
np
.
savetxt
(
OUTFILE
+
"_TEST_signature.txt"
,
np
.
array
(
var_names_tr
).
reshape
(
-
1
,
1
),
fmt
=
'
%s
'
,
delimiter
=
'
\t
'
,
fmt
=
"
%s
"
,
delimiter
=
"
\t
"
,
)
if
MODEL_TYPE
==
"randomForest"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment