Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
MPBA Radiomics
RADLER
Commits
a714181f
Commit
a714181f
authored
Oct 28, 2019
by
Alessia Marcolini
Browse files
Radiomics pipeline repo
parent
32ce2db1
Changes
38
Hide whitespace changes
Inline
Side-by-side
radiomics_pipeline/.gitignore
0 → 100755
View file @
a714181f
.ipynb_checkpoints/
.project
.pydevproject
.spyproject
__pycache__
# Compiled python modules.
*.pyc
# Setuptools distribution folder.
/dist/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
data
data/
results
results/
experiments/
experiments
radiomics_pipeline/.ipynb_checkpoints/split_HN-checkpoint.py
deleted
100755 → 0
View file @
32ce2db1
PATH
=
'/home/bizzego/projects/networks_dami/radiomics_pipeline'
import
numpy
as
np
import
pandas
as
pd
#%%
FEATDIR
=
f
'
{
PATH
}
/data/HN/features'
FILENAME
=
'merged_features_B_UNCORR_selected'
FEATURE_FILE
=
f
'
{
FEATDIR
}
/selected/
{
FILENAME
}
.csv'
features
=
pd
.
read_csv
(
FEATURE_FILE
,
index_col
=
0
)
#%%
subjects
=
features
.
index
.
values
is_train
=
[
x
.
split
(
'-'
)[
1
]
in
[
'HGJ'
,
'CHUS'
]
for
x
in
subjects
]
is_test
=
[
x
.
split
(
'-'
)[
1
]
in
[
'HMR'
,
'CHUM'
]
for
x
in
subjects
]
features_train
=
features
.
iloc
[
is_train
,
:]
features_test
=
features
.
iloc
[
is_test
,
:]
#%%
features_train
.
to_csv
(
f
'
{
FEATDIR
}
/selected_TT/
{
FILENAME
}
_TRAIN.csv'
)
features_test
.
to_csv
(
f
'
{
FEATDIR
}
/selected_TT/
{
FILENAME
}
_TEST.csv'
)
radiomics_pipeline/deep_features/deep_extraction.py
0 → 100755
View file @
a714181f
PATH
=
'/home/bizzego/projects/radiomics_pipeline'
import
numpy
as
np
import
pandas
as
pd
import
torch
from
dap.datasets.numpydatasets
import
NumpyCSVDataset
,
augment_3D
from
networks
import
CNN3D_NET
#%%
DATASET
=
'OPBG'
LAB_COL
=
'group'
MODEL_NAME
=
f
'DAP_bbox_T2-FLAIR_
{
LAB_COL
}
_CNN3D'
SIZE
=
64
#%%
DATADIR
=
f
'
{
PATH
}
/data/
{
DATASET
}
/bbox_T2-FLAIR_noseg'
LABELS_FILE
=
f
'
{
PATH
}
/data/
{
DATASET
}
/labels.csv'
OUTDIR
=
f
'
{
PATH
}
/data/
{
DATASET
}
/features'
OUTFILE
=
'deep_features.csv'
#%%
dataset
=
NumpyCSVDataset
(
DATADIR
,
LABELS_FILE
,
LAB_COL
,
'multi'
,
SIZE
,
augmentation_function
=
augment_3D
,
name
=
f
'bbox_
{
LAB_COL
}
'
)
model_file
=
f
'
{
PATH
}
/data/
{
DATASET
}
/predictions/
{
MODEL_NAME
}
/components/torch_model.th'
model
=
torch
.
load
(
model_file
)
#%%
deep_features
=
[]
sample_names
=
[]
with
torch
.
autograd
.
no_grad
():
for
i
,
sample
in
enumerate
(
dataset
):
name
=
sample
[
'sample'
]
image
=
sample
[
'data'
]
image
=
image
.
unsqueeze
(
0
)
out
=
model
.
extract_features
(
image
)
deep_features
.
append
(
out
.
numpy
())
sample_names
.
append
(
name
)
deep_features
=
np
.
array
(
deep_features
)
deep_features_pd
=
pd
.
DataFrame
(
deep_features
[:,
0
,:],
index
=
sample_names
)
#%% SAVE
print
(
deep_features_pd
.
shape
)
deep_features_pd
.
to_csv
(
f
'
{
OUTDIR
}
/
{
OUTFILE
}
'
)
radiomics_pipeline/deep_features/deep_extraction_transfer.py
0 → 100755
View file @
a714181f
PATH
=
'/home/bizzego/projects/radiomics_pipeline'
import
numpy
as
np
import
pandas
as
pd
import
torch
from
dap.datasets.numpydatasets
import
NumpyCSVDataset
,
augment_3D
from
networks
import
CNN3D_NET
#%%
model_file
=
f
'
{
PATH
}
/data/BraTS/predictions/DAP_bbox_lab_3classes_CNN3D/components/torch_model.th'
DATASET
=
'OPBG'
LAB_COL
=
'group'
SIZE
=
64
#%%
DATADIR
=
f
'
{
PATH
}
/data/
{
DATASET
}
/bbox_T2-FLAIR_noseg'
LABELS_FILE
=
f
'
{
PATH
}
/data/
{
DATASET
}
/labels.csv'
OUTDIR
=
f
'
{
PATH
}
/data/
{
DATASET
}
/features'
OUTFILE
=
'transfer_features_FLAIR.csv'
#%%
dataset
=
NumpyCSVDataset
(
DATADIR
,
LABELS_FILE
,
LAB_COL
,
'multi'
,
SIZE
,
augmentation_function
=
augment_3D
,
name
=
f
'bbox_
{
LAB_COL
}
'
)
model
=
torch
.
load
(
model_file
)
'''
#%%
deep_features = []
sample_names = []
with torch.autograd.no_grad():
for i,sample in enumerate(dataset):
name = sample['sample']
image = sample['data']
image = image.unsqueeze(0)
out_t2 = model.features_1(image[:,0,:,:,:].unsqueeze(1)).view(1, -1).numpy()
out_flair = model.features_3(image[:,1,:,:,:].unsqueeze(1)).view(1, -1).numpy()
out = np.hstack([out_t2, out_flair])
deep_features.append(out)
sample_names.append(name)
'''
deep_features
=
[]
sample_names
=
[]
with
torch
.
autograd
.
no_grad
():
for
i
,
sample
in
enumerate
(
dataset
):
name
=
sample
[
'sample'
]
image
=
sample
[
'data'
]
image
=
image
.
unsqueeze
(
0
)
#out_t2 = model.features_1(image[:,0,:,:,:].unsqueeze(1)).view(1, -1).numpy()
out_flair
=
model
.
features_3
(
image
[:,
1
,:,:,:].
unsqueeze
(
1
)).
view
(
1
,
-
1
).
numpy
()
#out = np.hstack([out_t2, out_flair])
deep_features
.
append
(
out_flair
)
sample_names
.
append
(
name
)
deep_features
=
np
.
array
(
deep_features
)
deep_features_pd
=
pd
.
DataFrame
(
deep_features
[:,
0
,:],
index
=
sample_names
)
#%% SAVE
print
(
deep_features_pd
.
shape
)
deep_features_pd
.
to_csv
(
f
'
{
OUTDIR
}
/
{
OUTFILE
}
'
)
radiomics_pipeline/deep_features/networks.py
0 → 100755
View file @
a714181f
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 11:31:52 2018
@author: andrea
"""
import
torch.nn
as
nn
import
torch
def
get_conv3d_block
(
n_features
,
dropout
):
block
=
nn
.
Sequential
(
nn
.
Conv3d
(
1
,
20
,
3
,
padding
=
1
),
nn
.
BatchNorm3d
(
20
),
nn
.
ReLU
(
True
),
nn
.
MaxPool3d
(
2
,
stride
=
2
),
nn
.
Dropout3d
(
dropout
),
nn
.
Conv3d
(
20
,
40
,
3
,
padding
=
1
),
nn
.
BatchNorm3d
(
40
),
nn
.
ReLU
(
True
),
nn
.
MaxPool3d
(
2
,
stride
=
2
),
nn
.
Dropout3d
(
dropout
/
2
),
nn
.
Conv3d
(
40
,
n_features
,
3
),
nn
.
BatchNorm3d
(
n_features
),
nn
.
ReLU
(
True
),
nn
.
MaxPool3d
(
2
,
stride
=
2
),
nn
.
Dropout3d
(
dropout
/
4
),
nn
.
AdaptiveAvgPool3d
(
1
))
return
(
block
)
def
get_fully_connect
(
n_features
,
n_channels
,
n_classes
):
block
=
nn
.
Sequential
(
nn
.
Linear
(
n_features
*
n_channels
,
50
),
nn
.
ReLU
(),
nn
.
Dropout
(
0.1
),
nn
.
Linear
(
50
,
n_classes
),
nn
.
Softmax
(
1
))
return
(
block
)
class
CNN3D_NET
(
nn
.
Module
):
def
__init__
(
self
,
**
params
):
super
(
CNN3D_NET
,
self
).
__init__
()
self
.
n_classes
=
params
.
get
(
'n_classes'
,
2
)
self
.
n_channels
=
params
.
get
(
'n_channels'
,
1
)
self
.
dropout
=
params
.
get
(
'dropout'
,
0.5
)
self
.
n_features
=
params
.
get
(
'n_features'
,
80
)
for
ID_CH
in
range
(
self
.
n_channels
):
setattr
(
self
,
f
'features_
{
ID_CH
}
'
,
get_conv3d_block
(
self
.
n_features
,
self
.
dropout
))
self
.
linear
=
get_fully_connect
(
self
.
n_features
,
self
.
n_channels
,
self
.
n_classes
)
def
forward
(
self
,
x
):
out_features
=
[]
for
ID_CH
in
range
(
self
.
n_channels
):
x_channel
=
x
[:,
ID_CH
,
:,:,:].
unsqueeze
(
1
)
conv_block
=
getattr
(
self
,
f
'features_
{
ID_CH
}
'
)
out_channel
=
conv_block
(
x_channel
).
view
(
x
.
shape
[
0
],
-
1
)
out_features
.
append
(
out_channel
)
out_merged
=
torch
.
cat
(
out_features
,
dim
=
1
)
out
=
self
.
linear
(
out_merged
)
return
(
out
)
def
extract_features
(
self
,
x
):
out_features
=
[]
for
ID_CH
in
range
(
self
.
n_channels
):
x_channel
=
x
[:,
ID_CH
,
:,:,:].
unsqueeze
(
1
)
conv_block
=
getattr
(
self
,
f
'features_
{
ID_CH
}
'
)
out_channel
=
conv_block
(
x_channel
).
view
(
x
.
shape
[
0
],
-
1
)
out_features
.
append
(
out_channel
)
out_merged
=
torch
.
cat
(
out_features
,
dim
=
1
)
return
(
out_merged
)
def
initialize_weights
(
self
):
for
m
in
self
.
modules
():
if
isinstance
(
m
,
nn
.
Linear
):
nn
.
init
.
kaiming_normal_
(
m
.
weight
,
mode
=
'fan_out'
,
nonlinearity
=
'relu'
)
nn
.
init
.
constant_
(
m
.
bias
,
0
)
elif
isinstance
(
m
,
nn
.
Conv3d
):
nn
.
init
.
kaiming_normal_
(
m
.
weight
,
mode
=
'fan_out'
,
nonlinearity
=
'relu'
)
elif
isinstance
(
m
,
nn
.
BatchNorm3d
):
nn
.
init
.
constant_
(
m
.
weight
,
1
)
nn
.
init
.
constant_
(
m
.
bias
,
0
)
'''
class FeatureNet(nn.Module):
def __init__(self, **params):
super(FeatureNet, self).__init__()
self.n_feat = params['n_feat'] if 'n_feat' in params else 1000
self.n_classes = params['n_classes'] if 'n_classes' in params else 2
self.n_hidden = params['n_hidden'] if 'n_hidden' in params else 1000
self.dropout = params['dropout'] if 'dropout' in params else 0.5
self.fc1 = nn.Sequential(nn.Linear(self.n_feat, self.n_hidden),
nn.Dropout(self.dropout),
nn.ReLU())
self.fc2 = nn.Sequential(nn.Linear(self.n_hidden, 100),
nn.Dropout(self.dropout/2),
nn.ReLU())
self.fc3 = nn.Sequential(nn.Linear(100, self.n_classes),
nn.Dropout(self.dropout/4),
nn.Softmax(dim=1))
def forward(self, x):
out = self.fc3(self.fc2(self.fc1(x)))
# print(self.get_params())
return(out)
def extract_features(self, x):
out = self.fc2(self.fc1(x))
return(out)
def get_params(self):
return({'n_classes':self.n_classes, 'n_hidden':self.n_hidden, 'dropout':self.dropout})
def initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
nn.init.constant_(m.bias, 0)
'''
\ No newline at end of file
radiomics_pipeline/deep_features/networks_OPBG.py
0 → 100755
View file @
a714181f
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 11:31:52 2018
@author: andrea
"""
import
torch.nn
as
nn
import
torch
def
get_conv3d_block
(
n_features
,
dropout
):
block
=
nn
.
Sequential
(
nn
.
Conv3d
(
1
,
20
,
3
),
nn
.
BatchNorm3d
(
20
),
nn
.
ReLU
(
True
),
nn
.
MaxPool3d
(
2
,
stride
=
2
),
nn
.
Dropout3d
(
dropout
),
nn
.
Conv3d
(
20
,
40
,
3
),
nn
.
BatchNorm3d
(
40
),
nn
.
ReLU
(
True
),
nn
.
MaxPool3d
(
2
,
stride
=
2
),
nn
.
Dropout3d
(
dropout
/
2
),
nn
.
Conv3d
(
40
,
80
,
3
),
nn
.
BatchNorm3d
(
80
),
nn
.
ReLU
(
True
),
nn
.
MaxPool3d
(
2
,
stride
=
2
),
nn
.
Dropout3d
(
dropout
/
4
),
nn
.
Conv3d
(
80
,
n_features
,
3
),
nn
.
BatchNorm3d
(
n_features
),
nn
.
ReLU
(
True
),
nn
.
MaxPool3d
(
2
,
stride
=
2
),
nn
.
Dropout3d
(
dropout
/
4
),
nn
.
AdaptiveAvgPool3d
(
1
))
return
(
block
)
def
get_fully_connect
(
n_features
,
n_channels
,
n_classes
):
block
=
nn
.
Sequential
(
nn
.
Linear
(
n_features
*
n_channels
,
50
),
nn
.
ReLU
(),
nn
.
Dropout
(
0.1
),
nn
.
Linear
(
50
,
n_classes
),
nn
.
Softmax
(
1
))
return
(
block
)
class
CNN3D_NET
(
nn
.
Module
):
def
__init__
(
self
,
**
params
):
super
(
CNN3D_NET
,
self
).
__init__
()
self
.
n_classes
=
params
.
get
(
'n_classes'
,
2
)
self
.
n_channels
=
params
.
get
(
'n_channels'
,
1
)
self
.
dropout
=
params
.
get
(
'dropout'
,
0.5
)
self
.
n_features
=
params
.
get
(
'n_features'
,
160
)
for
ID_CH
in
range
(
self
.
n_channels
):
setattr
(
self
,
f
'features_
{
ID_CH
}
'
,
get_conv3d_block
(
self
.
n_features
,
self
.
dropout
))
self
.
linear
=
get_fully_connect
(
self
.
n_features
,
self
.
n_channels
,
self
.
n_classes
)
def
forward
(
self
,
x
):
out_features
=
[]
for
ID_CH
in
range
(
self
.
n_channels
):
x_channel
=
x
[:,
ID_CH
,
:,:,:].
unsqueeze
(
1
)
conv_block
=
getattr
(
self
,
f
'features_
{
ID_CH
}
'
)
out_channel
=
conv_block
(
x_channel
).
view
(
x
.
shape
[
0
],
-
1
)
out_features
.
append
(
out_channel
)
out_merged
=
torch
.
cat
(
out_features
,
dim
=
1
)
out
=
self
.
linear
(
out_merged
)
return
(
out
)
def
extract_features
(
self
,
x
):
out_features
=
[]
for
ID_CH
in
range
(
self
.
n_channels
):
x_channel
=
x
[:,
ID_CH
,
:,:,:].
unsqueeze
(
1
)
conv_block
=
getattr
(
self
,
f
'features_
{
ID_CH
}
'
)
out_channel
=
conv_block
(
x_channel
).
view
(
x
.
shape
[
0
],
-
1
)
out_features
.
append
(
out_channel
)
out_merged
=
torch
.
cat
(
out_features
,
dim
=
1
)
return
(
out_merged
)
def
initialize_weights
(
self
):
for
m
in
self
.
modules
():
if
isinstance
(
m
,
nn
.
Linear
):
nn
.
init
.
kaiming_normal_
(
m
.
weight
,
mode
=
'fan_out'
,
nonlinearity
=
'relu'
)
nn
.
init
.
constant_
(
m
.
bias
,
0
)
elif
isinstance
(
m
,
nn
.
Conv3d
):
nn
.
init
.
kaiming_normal_
(
m
.
weight
,
mode
=
'fan_out'
,
nonlinearity
=
'relu'
)
elif
isinstance
(
m
,
nn
.
BatchNorm3d
):
nn
.
init
.
constant_
(
m
.
weight
,
1
)
nn
.
init
.
constant_
(
m
.
bias
,
0
)
'''
class FeatureNet(nn.Module):
def __init__(self, **params):
super(FeatureNet, self).__init__()
self.n_feat = params['n_feat'] if 'n_feat' in params else 1000
self.n_classes = params['n_classes'] if 'n_classes' in params else 2
self.n_hidden = params['n_hidden'] if 'n_hidden' in params else 1000
self.dropout = params['dropout'] if 'dropout' in params else 0.5
self.fc1 = nn.Sequential(nn.Linear(self.n_feat, self.n_hidden),
nn.Dropout(self.dropout),
nn.ReLU())
self.fc2 = nn.Sequential(nn.Linear(self.n_hidden, 100),
nn.Dropout(self.dropout/2),
nn.ReLU())
self.fc3 = nn.Sequential(nn.Linear(100, self.n_classes),
nn.Dropout(self.dropout/4),
nn.Softmax(dim=1))
def forward(self, x):
out = self.fc3(self.fc2(self.fc1(x)))
# print(self.get_params())
return(out)
def extract_features(self, x):
out = self.fc2(self.fc1(x))
return(out)
def get_params(self):
return({'n_classes':self.n_classes, 'n_hidden':self.n_hidden, 'dropout':self.dropout})
def initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
nn.init.constant_(m.bias, 0)
'''
\ No newline at end of file
radiomics_pipeline/deep_features/train_deep.py
0 → 100755
View file @
a714181f
PATH
=
'/home/bizzego/projects/radiomics_pipeline'
import
numpy
as
np
import
dap
import
os
import
torch
torch
.
set_num_threads
(
20
)
import
SimpleITK
as
sitk
import
dicom_utils.augmentation
as
aug
from
dap.datasets.numpydatasets
import
NumpyCSVDataset
,
augment_3D
from
dap.metrics
import
BinaryMetrics
,
MulticlassMetrics
from
dap.models.deepmodel
import
DeepLearningModel
from
dap.crossval
import
kfold_split
import
dicom_utils.processing
as
dup
from
networks_OPBG
import
CNN3D_NET
#%%
DATASET
=
'OPBG'
LAB_COL
=
'group'
N_CLASSES
=
4
SIZE
=
64
N_CHANNELS
=
2
#%%
DATADIR
=
f
'
{
PATH
}
/data/
{
DATASET
}
/bbox_T2-FLAIR_noseg'
OUT_DIR
=
f
'
{
PATH
}
/data/
{
DATASET
}
/predictions'
LABELS_FILE
=
f
'
{
PATH
}
/data/
{
DATASET
}
/labels.csv'
dataset
=
NumpyCSVDataset
(
DATADIR
,
LABELS_FILE
,
LAB_COL
,
'multi'
,
SIZE
,
augmentation_function
=
augment_3D
,
name
=
f
'bbox_T2-FLAIR_
{
LAB_COL
}
'
)
##%
#dataset.INFO['mode'] = 'train'
#test_sample = dataset[100]
#image = sitk.GetImageFromArray(test_sample['data'].numpy()[0,:,:,:])
#viz.multi_slice_viewer(image)
#%
metrics
=
MulticlassMetrics
(
reference
=
'MCC'
)
model
=
DeepLearningModel
(
CNN3D_NET
,
params
=
{
'device'
:
'cpu'
,
'n_classes'
:
N_CLASSES
,
'n_channels'
:
N_CHANNELS
,
'n_features'
:
160
},
name
=
'CNN3D_OPBG'
)
dap_settings
=
{
'stratified_test'
:
True
,
'ratio_test'
:
0.2
,
'stratified_valid'
:
True
,
'ratio_valid'
:
0.2
,
'fold_method'
:
kfold_split
,
'cv_n'
:
1
,
'one_fold'
:
True
}
DAP
=
dap
.
DAP
(
dap_settings
,
dataset
,
model
,
metrics
)
#%%
parameters
=
{
'epochs'
:[
1000
],
'batch_size'
:[
4
,
8
],
'dropout'
:[
0.5
,
0.1
]}
param_grid
=
dap
.
dap
.
create_param_grid
(
parameters
)
DAP
.
fit
(
param_grid
)
#%%
DAP
.
predict_on_test
()
#%%
DAP
.
save
(
OUT_DIR
,
lite
=
False
)
radiomics_pipeline/feature_selection/.ipynb_checkpoints/feature_selection-checkpoint.py
deleted
100755 → 0
View file @
32ce2db1
PATH
=
'/home/bizzego/projects/networks_dami/radiomics_pipeline'
import
numpy
as
np
import
pandas
as
pd
import
itertools
from
sklearn.svm
import
SVC
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from
sklearn.preprocessing
import
StandardScaler
,
Imputer
from
sklearn.feature_selection
import
RFECV
,
mutual_info_classif
,
f_classif
,
SelectKBest
from
sklearn.metrics
import
matthews_corrcoef
as
mcc
,
make_scorer
,
auc
,
precision_score
def
create_param_grid
(
params
):
keys
,
values
=
zip
(
*
params
.
items
())
experiments
=
[
dict
(
zip
(
keys
,
v
))
for
v
in
itertools
.
product
(
*
values
)]
grid
=
dict
([(
i
,
d
)
for
i
,
d
in
enumerate
(
experiments
)])
return
(
grid
)
#%%
DATASET
=
'HN'
FEATURE_NAME
=
'merged_features_B_UNCORR'
FEATUREFILE
=
f
'selected/
{
FEATURE_NAME
}
.csv'
LAB_COL
=
'Locoregional'
OUTFILE
=
f
'selected/
{
FEATURE_NAME
}
_selected.csv'
#%%
PREPROCESS
=
True
REMOVE_CORRELATED
=
True
UNIVARIATE_RANKING
=
True
RFE_RANKING
=
True
UNIV_TESTER
=
f_classif
N_FEAT_UNIV
=
1000
model
=
SVC
()
model_params
=
{
'C'
:[
1
],
'kernel'
:[
'linear'
]}
# model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']}
N_FEAT_RFE
=
1000
#%%
FEATDIR
=
f
'
{
PATH
}
/data/
{
DATASET
}
/features'
LABELFILE
=
f
'
{
PATH
}
/data/
{
DATASET
}
/labels.csv'
#%%
features
=
pd
.
read_csv
(
f
'
{
FEATDIR
}
/
{
FEATUREFILE
}
'
,
index_col
=
0
)
labels
=
pd
.
read_csv
(
LABELFILE
,
index_col
=
0
)[
LAB_COL
].
to_frame
()
features
=
pd
.
merge
(
features
,
labels
,
left_index
=
True
,
right_index
=
True
)
labels
=
features
.
pop
(
LAB_COL
).
to_frame
()
X
=
features
.
values
y
=
labels
.
values
[:,
0
]
print
(
X
.
shape
)
#%% PREPROCESS
if
PREPROCESS
:
#% remove nan
imputer
=
Imputer
()
X
=
imputer
.
fit_transform
(
X
)
#% scale
scaler
=
StandardScaler
()
X
=
scaler
.
fit_transform
(
X
)
print
(
'Done preprocessing'
)
print
(
X
.
shape
,
features
.
shape
)
#%% REMOVE CORRELATED
if
REMOVE_CORRELATED
:
corr_matrix
=
abs
(
np
.
corrcoef
(
X
,
rowvar
=
False
))
corr_matrix
[
np
.
where
(
np
.
tril
(
np
.
ones
(
corr_matrix
.
shape
),
k
=
1
).
astype
(
np
.
bool
)
==
True
)]
=
0
idx_to_keep
=
[(
corr_matrix
[:,
idx
]
<=
0.95
).
all
()
for
idx
in
range
(
corr_matrix
.
shape
[
1
])]
X
=
X
[:,
idx_to_keep
]
features
=
features
.
iloc
[:,
idx_to_keep
]
print
(
'Remove correlated'
)
print
(
X
.
shape
,
features
.
shape
)
#%% UNIVARIATE RANKING
if
UNIVARIATE_RANKING
:
selector
=
SelectKBest
(
UNIV_TESTER
,
k
=
'all'
)
selector
.
fit
(
X
,
y
)
scores
=
selector
.
scores_
idx_sorted
=
np
.
argsort
(
scores
)[::
-
1
]
if
N_FEAT_UNIV
==
0
:
X
=
X
[:,
idx_sorted
]
features
=
features
.
iloc
[:,
idx_sorted
]
else
:
X
=
X
[:,
idx_sorted
[:
N_FEAT_UNIV
]]
features
=
features
.
iloc
[:,
idx_sorted
[:
N_FEAT_UNIV
]]
print
(
'Done univariate'
)
print
(
X
.
shape
,
features
.
shape
)