Commit d1d66c16 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

ipynb -> py + better root path handling

parent fdcaf7a8
PATH = '/home/bizzego/projects/radiomics_pipeline' # To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %% Change working directory from the workspace root to the ipynb file location. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
# Best-effort switch of the working directory from the workspace root to the
# folder that held the original notebook, so relative paths keep resolving.
try:
    os.chdir(os.path.join(os.getcwd(), 'radiomics_pipeline/feature_selection'))
    print(os.getcwd())
except OSError:
    # The target folder is missing or inaccessible (e.g. the script is already
    # being run from inside it): keep the current directory. Only OS-level
    # failures are ignored, so Ctrl-C and real bugs still surface.
    pass
# %%
from IPython import get_ipython
# %%
get_ipython().run_line_magic('HN_env', '')
# %%
import os

# Project root: the directory two levels above the current working directory,
# stored as an absolute path.
PATH = os.path.abspath(
    os.path.join(os.curdir, '../..')
)
# %%
print(PATH)
# %%
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import itertools import itertools
...@@ -18,17 +44,18 @@ def create_param_grid(params): ...@@ -18,17 +44,18 @@ def create_param_grid(params):
grid = dict([(i, d) for i, d in enumerate(experiments)]) grid = dict([(i, d) for i, d in enumerate(experiments)])
return(grid) return(grid)
#%%
DATASET = 'OPBG'
FEATURE_NAME = 'merged_features_A2'
FEATUREFILE = f'{FEATURE_NAME}.csv'
LAB_COL = 'strat_01'
# %%
FEATURE_NAME = 'merged_radiomics_features_F_SVC_Locoregional.csv_features_noTx_F_SVC_Locoregional.csv' #'radiomics_features'#'features_noTx' #Tstage_binary_augmented_noTx_branch_wise_20191025-160304'
FEATUREFILE = f'{FEATURE_NAME}.csv'
LAB_COL = 'Locoregional'
EXP = 'LR_noTx_branch_wise_free_aug_CT_20191027-124913'
OUTFILE = f'{FEATURE_NAME}_F_SVC_{LAB_COL}.csv' OUTFILE = f'{FEATURE_NAME}_F_SVC_{LAB_COL}.csv'
#%% #%%
PREPROCESS = True PREPROCESS = True
REMOVE_CORRELATED = False REMOVE_CORRELATED = True
UNIVARIATE_RANKING = True UNIVARIATE_RANKING = True
RFE_RANKING = True RFE_RANKING = True
...@@ -40,8 +67,8 @@ model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']} # ...@@ -40,8 +67,8 @@ model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']} #
N_FEAT_RFE = 1000 N_FEAT_RFE = 1000
#%% #%%
FEATDIR = f'{PATH}/data/{DATASET}/features' FEATDIR = f'{PATH}/experiments/{EXP}/features/' #'{PATH}/data/' f'{PATH}/data/'
LABELFILE = f'{PATH}/data/{DATASET}/labels_os.csv' LABELFILE = f'{PATH}/data/clinical_data_noTx.csv'
#%% #%%
features = pd.read_csv(f'{FEATDIR}/{FEATUREFILE}', index_col =0) features = pd.read_csv(f'{FEATDIR}/{FEATUREFILE}', index_col =0)
...@@ -133,3 +160,10 @@ if RFE_RANKING: ...@@ -133,3 +160,10 @@ if RFE_RANKING:
print(features.shape) print(features.shape)
features.to_csv(f'{FEATDIR}/{OUTFILE}') features.to_csv(f'{FEATDIR}/{OUTFILE}')
print('features saved')
# %%
%% Cell type:code id: tags:
``` python
%HN_env
```
%% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
import os

# Project root: the directory two levels above the current working directory,
# stored as an absolute path.
PATH = os.path.abspath(
    os.path.join(os.curdir, '../..')
)
```
%% Cell type:code id: tags:
``` python
PATH
```
%% Output
'/home/utente/bussola/networks_dami'
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import itertools
from sklearn.svm import SVC
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.feature_selection import RFECV, mutual_info_classif, f_classif, SelectKBest
from sklearn.metrics import matthews_corrcoef as mcc, make_scorer, auc, precision_score
def create_param_grid(params):
    """Expand a dict of candidate values into an indexed grid of settings.

    Args:
        params: Mapping from parameter name to an iterable of candidate
            values, e.g. ``{'C': [0.1, 1], 'kernel': ['linear']}``.

    Returns:
        A dict mapping consecutive integers (starting at 0) to one parameter
        combination each, in ``itertools.product`` order (the last parameter
        varies fastest). An empty ``params`` yields ``{0: {}}``, i.e. a single
        experiment with no overrides, instead of raising ``ValueError``.
    """
    keys = list(params)
    combinations = itertools.product(*(params[key] for key in keys))
    return {i: dict(zip(keys, combo)) for i, combo in enumerate(combinations)}
```
%% Cell type:code id: tags:
``` python
# Feature-selection pipeline: load a feature table and its labels, optionally
# impute/scale, drop highly correlated columns, rank features with a
# univariate test, refine the ranking with cross-validated RFE over a grid of
# SVC settings, and save the reduced feature table as CSV.

# SimpleImputer replaces the deprecated sklearn.preprocessing.Imputer.
from sklearn.impute import SimpleImputer

# Input/output naming.
FEATURE_NAME = 'features_noTx'
FEATUREFILE = f'{FEATURE_NAME}.csv'
LAB_COL = 'Locoregional'
EXP = 'LR_noTx_branch_wise_free_aug_20191027-003918'
OUTFILE = f'{FEATURE_NAME}_F_SVC_{LAB_COL}.csv'

#%%
# Switches for the individual pipeline stages.
PREPROCESS = True
REMOVE_CORRELATED = True
UNIVARIATE_RANKING = True
RFE_RANKING = True

UNIV_TESTER = f_classif
N_FEAT_UNIV = 1000  # 0 keeps every feature (reordered by score only)

model = SVC()
model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']}
N_FEAT_RFE = 1000  # 0 keeps every feature (reordered by RFE rank only)

#%%
FEATDIR = f'{PATH}/experiments/{EXP}/features/'
LABELFILE = f'{PATH}/data/clinical_data_noTx.csv'

#%%
features = pd.read_csv(f'{FEATDIR}/{FEATUREFILE}', index_col=0)
labels = pd.read_csv(LABELFILE, index_col=0)[LAB_COL].to_frame()
# Inner join on the index keeps only the samples present in both tables;
# popping the label column afterwards leaves features and labels row-aligned.
features = pd.merge(features, labels, left_index=True, right_index=True)
labels = features.pop(LAB_COL).to_frame()

X = features.values
y = labels.values[:, 0]
print(X.shape)

#%% PREPROCESS
if PREPROCESS:
    # Fill missing values with the imputer's default strategy.
    # NOTE(review): if the imputer drops a column that is entirely NaN, X and
    # `features` would no longer have matching columns - confirm the inputs
    # never contain such columns.
    imputer = SimpleImputer()
    X = imputer.fit_transform(X)

    # Standardise every column.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    print('Done preprocessing')
    print(X.shape, features.shape)

#%% REMOVE CORRELATED
if REMOVE_CORRELATED:
    corr_matrix = np.abs(np.corrcoef(X, rowvar=False))
    # Zero the lower triangle, the diagonal and the first super-diagonal so
    # that each column is only compared against earlier columns.
    # NOTE(review): k=1 also masks the correlation between directly adjacent
    # columns (j-1, j), so such pairs are never removed - confirm whether k=0
    # was intended.
    masked = np.tril(np.ones(corr_matrix.shape), k=1).astype(bool)
    corr_matrix[masked] = 0
    # Keep a column only if none of its remaining correlations exceeds 0.95.
    idx_to_keep = (corr_matrix <= 0.95).all(axis=0)
    X = X[:, idx_to_keep]
    features = features.iloc[:, idx_to_keep]
    print('Remove correlated')
    print(X.shape, features.shape)

#%% UNIVARIATE RANKING
if UNIVARIATE_RANKING:
    selector = SelectKBest(UNIV_TESTER, k='all')
    selector.fit(X, y)
    scores = selector.scores_
    # Highest score first.
    idx_sorted = np.argsort(scores)[::-1]
    idx_selected = idx_sorted if N_FEAT_UNIV == 0 else idx_sorted[:N_FEAT_UNIV]
    X = X[:, idx_selected]
    features = features.iloc[:, idx_selected]
    print('Done univariate')
    print(X.shape, features.shape)

#%% RFE RANKING
if RFE_RANKING:
    param_grid = create_param_grid(model_params)
    rfe_max_scores = []
    rfe_rankings = []
    mcc_scorer = make_scorer(mcc)

    # Visit the settings in ascending grid-index order so that position k in
    # the result lists corresponds to param_grid[k].
    for _, params in sorted(param_grid.items()):
        model.set_params(**params)
        print(params)
        rfe_selector = RFECV(model, step=1, cv=5, n_jobs=32, scoring=mcc_scorer)
        rfe_selector = rfe_selector.fit(X, y)
        # NOTE(review): `grid_scores_` is the attribute of the scikit-learn
        # release this file targets; newer releases expose the scores through
        # `cv_results_` instead - revisit when upgrading.
        rfe_max_scores.append(np.max(rfe_selector.grid_scores_))
        rfe_rankings.append(rfe_selector.ranking_)

    #%%
    # Pick the setting whose best cross-validated score is highest. Indexing
    # model_params['C'] by grid position works because 'C' is the only
    # parameter with more than one candidate value.
    idx_best_model = np.argmax(rfe_max_scores)
    print(model_params['C'][idx_best_model])
    print(rfe_max_scores)

    rfe_ranking = rfe_rankings[idx_best_model]
    # Rank 1 is best, so ascending order puts the selected features first.
    idx_sorted = np.argsort(rfe_ranking)
    idx_selected = idx_sorted if N_FEAT_RFE == 0 else idx_sorted[:N_FEAT_RFE]
    X = X[:, idx_selected]
    features = features.iloc[:, idx_selected]
    print('Done RFE')
    print(X.shape, features.shape)

#%%
print(features.shape)
features.to_csv(f'{FEATDIR}/{OUTFILE}')
print('features saved')
```
%% Output
(2445, 512)
Done preprocessing
(2445, 512) (2445, 512)
Remove correlated
(2445, 512) (2445, 512)
Done univariate
(2445, 512) (2445, 512)
{'C': 0.001, 'kernel': 'linear'}
/home/utente/anaconda3/envs/dappertf/lib/python3.6/site-packages/sklearn/utils/deprecation.py:66: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.
warnings.warn(msg, category=DeprecationWarning)
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment