Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
MPBA
INF
Commits
7a219819
Commit
7a219819
authored
Jan 28, 2020
by
Alessia Marcolini
Browse files
Fix missing import and black formatting
parent
7cb397cb
Changes
2
Show whitespace changes
Inline
Side-by-side
sklearn_training.py
View file @
7a219819
...
...
@@ -14,12 +14,12 @@ import bootstrapped.stats_functions as bs_stats
import
numpy
as
np
import
pandas
as
pd
from
mlpy
import
borda_count
,
canberra_stability
from
sklearn
import
preprocessing
from
sklearn
import
preprocessing
,
svm
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn
import
svm
from
sklearn.metrics
import
make_scorer
,
accuracy_score
,
matthews_corrcoef
,
roc_auc_score
from
sklearn.model_selection
import
(
StratifiedKFold
,
StratifiedShuffleSplit
,
train_test_split
,
GridSearchCV
)
from
sklearn
.metrics
import
(
accuracy_score
,
make_scorer
,
matthews_corrcoef
,
roc_auc_score
)
from
sklearn.model_selection
import
(
GridSearchCV
,
StratifiedKFold
,
StratifiedShuffleSplit
,
train_test_split
)
from
sklearn.multiclass
import
OneVsRestClassifier
from
sklearn.pipeline
import
Pipeline
...
...
sklearn_validation.py
View file @
7a219819
...
...
@@ -11,28 +11,34 @@ import sys
import
numpy
as
np
import
pandas
as
pd
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn
import
svm
from
sklearn.metrics
import
make_scorer
,
accuracy_score
,
matthews_corrcoef
,
roc_auc_score
from
sklearn.model_selection
import
StratifiedShuffleSplit
,
GridSearchCV
from
sklearn
import
preprocessing
from
sklearn
import
preprocessing
,
svm
from
sklearn.metrics
import
(
accuracy_score
,
make_scorer
,
matthews_corrcoef
,
roc_auc_score
)
from
sklearn.model_selection
import
GridSearchCV
,
StratifiedShuffleSplit
from
sklearn.pipeline
import
Pipeline
import
performance
as
perf
from
extract_topfeats
import
extract_feats
from
input_output
import
load_data
parser
=
argparse
.
ArgumentParser
(
description
=
'Run a validation experiment using LibLinear.'
)
parser
.
add_argument
(
'CONFIGFILE'
,
type
=
str
,
help
=
'Training experiment configuration file'
)
parser
=
argparse
.
ArgumentParser
(
description
=
'Run a validation experiment using LibLinear.'
)
parser
.
add_argument
(
'CONFIGFILE'
,
type
=
str
,
help
=
'Training experiment configuration file'
)
parser
.
add_argument
(
'TSFILE'
,
type
=
str
,
help
=
'Validation datafile'
)
parser
.
add_argument
(
'OUTDIR'
,
type
=
str
,
help
=
'Output directory'
)
parser
.
add_argument
(
'--tslab'
,
type
=
str
,
default
=
None
,
help
=
'Validation labels, if available'
)
parser
.
add_argument
(
'--nf'
,
type
=
int
,
default
=
None
,
help
=
'Custom number of top features'
)
parser
.
add_argument
(
'--tslab'
,
type
=
str
,
default
=
None
,
help
=
'Validation labels, if available'
)
parser
.
add_argument
(
'--nf'
,
type
=
int
,
default
=
None
,
help
=
'Custom number of top features'
)
__author__
=
'Marco Chierici, Alessandro Zandona'
__date__
=
'15 December 2016'
if
len
(
sys
.
argv
)
==
1
:
if
len
(
sys
.
argv
)
==
1
:
parser
.
print_help
()
sys
.
exit
(
1
)
...
...
@@ -48,7 +54,7 @@ TUN_CV_K = 10
# fraction of the dataset to keep apart as test split (for SVM tuning)
TUN_CV_P
=
0.5
# list of parameters for LSVM tuning
TUN_PARAMS_LSVM
=
[{
'svm__C'
:
[
10
**
int
(
k
)
for
k
in
np
.
arange
(
-
2
,
3
)]}]
TUN_PARAMS_LSVM
=
[{
'svm__C'
:
[
10
**
int
(
k
)
for
k
in
np
.
arange
(
-
2
,
3
)]}]
config
=
ConfigParser
.
RawConfigParser
()
config
.
read
(
CONFIGFILE
)
...
...
@@ -80,12 +86,14 @@ le = preprocessing.LabelEncoder()
sample_names_tr
,
var_names_tr
,
x_tr
=
load_data
(
TR_TOPFEATS
)
y_tr
=
pd
.
read_csv
(
LABELSFILE
,
sep
=
'
\t
'
,
header
=
None
).
values
y_tr
=
le
.
fit_transform
(
y_tr
)
print
(
y_tr
.
shape
)
sample_names_ts
,
var_names_ts
,
x_ts
=
load_data
(
TS_TOPFEATS
)
# load the TS labels if available
if
TSLABELSFILE
is
not
None
:
y_ts
=
pd
.
read_csv
(
TSLABELSFILE
,
sep
=
'
\t
'
,
header
=
None
).
values
y_ts
=
pd
.
read_csv
(
TSLABELSFILE
,
sep
=
'
\t
'
,
header
=
None
,
dtype
=
str
).
values
y_ts
=
le
.
transform
(
y_ts
)
# define classifier
if
MODEL_TYPE
==
"randomForest"
:
model
=
RandomForestClassifier
(
n_estimators
=
500
,
criterion
=
'gini'
,
random_state
=
0
)
...
...
@@ -94,15 +102,19 @@ elif MODEL_TYPE == "LSVM":
if
SCALING
==
'std'
:
scaler
=
preprocessing
.
StandardScaler
()
elif
SCALING
==
'minmax'
:
scaler
=
preprocessing
.
MinMaxScaler
(
feature_range
=
(
-
1
,
1
))
scaler
=
preprocessing
.
MinMaxScaler
(
feature_range
=
(
-
1
,
1
))
scorer
=
make_scorer
(
matthews_corrcoef
)
# tuning Pipeline
tun_classif
=
svm
.
SVC
(
class_weight
=
"balanced"
,
kernel
=
"linear"
)
pipeline_steps
=
[(
'scaler'
,
scaler
),
(
'svm'
,
tun_classif
)]
tuning_pipeline
=
Pipeline
(
pipeline_steps
)
# LSVM tuning
tuncv
=
StratifiedShuffleSplit
(
n_splits
=
TUN_CV_K
,
test_size
=
TUN_CV_P
,
random_state
=
0
)
model
=
GridSearchCV
(
tuning_pipeline
,
param_grid
=
TUN_PARAMS_LSVM
,
cv
=
tuncv
,
scoring
=
scorer
)
tuncv
=
StratifiedShuffleSplit
(
n_splits
=
TUN_CV_K
,
test_size
=
TUN_CV_P
,
random_state
=
0
)
model
=
GridSearchCV
(
tuning_pipeline
,
param_grid
=
TUN_PARAMS_LSVM
,
cv
=
tuncv
,
scoring
=
scorer
)
# train
model
.
fit
(
x_tr
,
y_tr
)
...
...
@@ -138,17 +150,24 @@ with open(OUTFILE + "_TEST_pred_ts.txt", "w") as fout:
for
i
in
range
(
len
(
sample_names_ts
)):
fout
.
write
(
"%s
\t
%s
\n
"
%
(
sample_names_ts
[
i
],
p_ts_dec
[
i
]))
np
.
savetxt
(
OUTFILE
+
"_TEST_signature.txt"
,
np
.
array
(
var_names_tr
).
reshape
(
-
1
,
1
),
fmt
=
'%s'
,
delimiter
=
'
\t
'
)
np
.
savetxt
(
OUTFILE
+
"_TEST_signature.txt"
,
np
.
array
(
var_names_tr
).
reshape
(
-
1
,
1
),
fmt
=
'%s'
,
delimiter
=
'
\t
'
,
)
if
MODEL_TYPE
==
"randomForest"
:
with
open
(
OUTFILE
+
"_TEST_prob_tr.txt"
,
"w"
)
as
fout
:
fout
.
write
(
"SAMPLE
\t
CLASS 0
\t
CLASS 1
\n
"
)
for
i
in
range
(
len
(
sample_names_tr
)):
fout
.
write
(
"%s
\t
%f
\t
%f
\n
"
%
(
sample_names_tr
[
i
],
prob_tr
[
i
,
0
],
prob_tr
[
i
,
1
]))
fout
.
write
(
"%s
\t
%f
\t
%f
\n
"
%
(
sample_names_tr
[
i
],
prob_tr
[
i
,
0
],
prob_tr
[
i
,
1
])
)
with
open
(
OUTFILE
+
"_TEST_prob_ts.txt"
,
"w"
)
as
fout
:
fout
.
write
(
"SAMPLE
\t
CLASS 0
\t
CLASS 1
\n
"
)
for
i
in
range
(
len
(
sample_names_ts
)):
fout
.
write
(
"%s
\t
%f
\t
%f
\n
"
%
(
sample_names_ts
[
i
],
prob_ts
[
i
,
0
],
prob_ts
[
i
,
1
]))
fout
.
write
(
"%s
\t
%f
\t
%f
\n
"
%
(
sample_names_ts
[
i
],
prob_ts
[
i
,
0
],
prob_ts
[
i
,
1
])
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment