Commit d1d66c16 authored by Alessia Marcolini
Browse files

ipynb -> py + better root path handling

parent fdcaf7a8
PATH = '/home/bizzego/projects/radiomics_pipeline'
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %% Change working directory from the workspace root to the ipynb file location. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
try:
    # Best-effort: move from the workspace root into the notebook's folder.
    os.chdir(os.path.join(os.getcwd(), 'radiomics_pipeline/feature_selection'))
    print(os.getcwd())
except OSError:
    # The folder is missing or not accessible from the current directory:
    # keep the current working directory. Only OSError is swallowed so that
    # unrelated failures (e.g. KeyboardInterrupt) are no longer hidden.
    pass
# %%
from IPython import get_ipython
# %%
get_ipython().run_line_magic('HN_env', '')
# %%
import os
PATH = os.path.abspath(os.path.join(os.path.curdir,'../..'))
# %%
print(PATH)
# %%
import numpy as np
import pandas as pd
import itertools
......@@ -18,17 +44,18 @@ def create_param_grid(params):
grid = dict([(i, d) for i, d in enumerate(experiments)])
return(grid)
#%%
DATASET = 'OPBG'
FEATURE_NAME = 'merged_features_A2'
FEATUREFILE = f'{FEATURE_NAME}.csv'
LAB_COL = 'strat_01'
# %%
FEATURE_NAME = 'merged_radiomics_features_F_SVC_Locoregional.csv_features_noTx_F_SVC_Locoregional.csv' #'radiomics_features'#'features_noTx' #Tstage_binary_augmented_noTx_branch_wise_20191025-160304'
FEATUREFILE = f'{FEATURE_NAME}.csv'
LAB_COL = 'Locoregional'
EXP = 'LR_noTx_branch_wise_free_aug_CT_20191027-124913'
OUTFILE = f'{FEATURE_NAME}_F_SVC_{LAB_COL}.csv'
#%%
PREPROCESS = True
REMOVE_CORRELATED = False
REMOVE_CORRELATED = True
UNIVARIATE_RANKING = True
RFE_RANKING = True
......@@ -40,8 +67,8 @@ model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']} #
N_FEAT_RFE = 1000
#%%
FEATDIR = f'{PATH}/data/{DATASET}/features'
LABELFILE = f'{PATH}/data/{DATASET}/labels_os.csv'
FEATDIR = f'{PATH}/experiments/{EXP}/features/' #'{PATH}/data/' f'{PATH}/data/'
LABELFILE = f'{PATH}/data/clinical_data_noTx.csv'
#%%
features = pd.read_csv(f'{FEATDIR}/{FEATUREFILE}', index_col =0)
......@@ -133,3 +160,10 @@ if RFE_RANKING:
print(features.shape)
features.to_csv(f'{FEATDIR}/{OUTFILE}')
print('features saved')
# %%
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<text style=\"color: green;\">Success</text>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%HN_env"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"PATH = os.path.abspath(os.path.join(os.path.curdir,'../..'))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/utente/bussola/networks_dami'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PATH"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import itertools\n",
"\n",
"from sklearn.svm import SVC\n",
"#from sklearn.ensemble import RandomForestClassifier\n",
"#from sklearn.linear_model import LogisticRegression\n",
"\n",
"from sklearn.preprocessing import StandardScaler, Imputer\n",
"from sklearn.feature_selection import RFECV, mutual_info_classif, f_classif, SelectKBest\n",
"from sklearn.metrics import matthews_corrcoef as mcc, make_scorer, auc, precision_score\n",
"\n",
"def create_param_grid(params):\n",
" keys, values = zip(*params.items())\n",
" experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]\n",
" grid = dict([(i, d) for i, d in enumerate(experiments)])\n",
" return(grid)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2445, 512)\n",
"Done preprocessing\n",
"(2445, 512) (2445, 512)\n",
"Remove correlated\n",
"(2445, 512) (2445, 512)\n",
"Done univariate\n",
"(2445, 512) (2445, 512)\n",
"{'C': 0.001, 'kernel': 'linear'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/utente/anaconda3/envs/dappertf/lib/python3.6/site-packages/sklearn/utils/deprecation.py:66: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.\n",
" warnings.warn(msg, category=DeprecationWarning)\n"
]
}
],
"source": [
"FEATURE_NAME = 'features_noTx' #Tstage_binary_augmented_noTx_branch_wise_20191025-160304'\n",
"FEATUREFILE = f'{FEATURE_NAME}.csv'\n",
"LAB_COL = 'Locoregional'\n",
"EXP = 'LR_noTx_branch_wise_free_aug_20191027-003918'\n",
"OUTFILE = f'{FEATURE_NAME}_F_SVC_{LAB_COL}.csv'\n",
"\n",
"#%%\n",
"PREPROCESS = True\n",
"REMOVE_CORRELATED = True\n",
"UNIVARIATE_RANKING = True\n",
"RFE_RANKING = True\n",
"\n",
"UNIV_TESTER = f_classif\n",
"N_FEAT_UNIV = 1000\n",
"\n",
"model = SVC()\n",
"model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']} # model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']}\n",
"N_FEAT_RFE = 1000\n",
"\n",
"#%%\n",
"FEATDIR = f'{PATH}/experiments/{EXP}/features/' #{PATH}/data/'\n",
"LABELFILE = f'{PATH}/data/clinical_data_noTx.csv'\n",
"\n",
"#%%\n",
"features = pd.read_csv(f'{FEATDIR}/{FEATUREFILE}', index_col =0)\n",
"labels = pd.read_csv(LABELFILE, index_col =0)[LAB_COL].to_frame()\n",
"features = pd.merge(features, labels, left_index=True, right_index=True)\n",
"\n",
"labels = features.pop(LAB_COL).to_frame()\n",
"\n",
"X = features.values\n",
"y = labels.values[:,0]\n",
"\n",
"print(X.shape)\n",
"#%% PREPROCESS\n",
"if PREPROCESS:\n",
" #% remove nan\n",
" imputer = Imputer()\n",
" X = imputer.fit_transform(X)\n",
" \n",
" #% scale\n",
" scaler = StandardScaler()\n",
" X = scaler.fit_transform(X)\n",
" print('Done preprocessing')\n",
" print(X.shape, features.shape)\n",
" \n",
"#%% REMOVE CORRELATED\n",
"if REMOVE_CORRELATED:\n",
" corr_matrix = abs(np.corrcoef(X, rowvar=False))\n",
" corr_matrix[np.where(np.tril(np.ones(corr_matrix.shape), k=1).astype(np.bool) == True)] = 0\n",
" idx_to_keep = [(corr_matrix[:,idx]<=0.95).all() for idx in range(corr_matrix.shape[1])]\n",
" X = X[:,idx_to_keep]\n",
" features = features.iloc[:,idx_to_keep]\n",
" print('Remove correlated')\n",
" print(X.shape, features.shape)\n",
"\n",
"#%% UNIVARIATE RANKING\n",
"if UNIVARIATE_RANKING:\n",
" selector = SelectKBest(UNIV_TESTER, k='all')\n",
" selector.fit(X, y)\n",
" scores = selector.scores_\n",
" idx_sorted = np.argsort(scores)[::-1]\n",
" \n",
" if N_FEAT_UNIV == 0:\n",
" X = X[:,idx_sorted]\n",
" features = features.iloc[:,idx_sorted]\n",
" else:\n",
" X = X[:,idx_sorted[:N_FEAT_UNIV]]\n",
" features = features.iloc[:,idx_sorted[:N_FEAT_UNIV]]\n",
" \n",
" print('Done univariate')\n",
" print(X.shape, features.shape)\n",
"\n",
" \n",
"#%% RFE RANKING\n",
"if RFE_RANKING:\n",
" param_grid = create_param_grid(model_params)\n",
" \n",
" rfe_max_scores = []\n",
" rfe_rankings = []\n",
" \n",
" model_name = 'svc'\n",
" for i in range(len(param_grid)):\n",
" model.set_params(**param_grid[i])\n",
" print(param_grid[i])\n",
" rfe_selector = RFECV(model, step=1, cv=5, n_jobs=32, scoring=make_scorer(mcc))\n",
" rfe_selector = rfe_selector.fit(X, y)\n",
" \n",
" rfe_max_scores.append(np.max(rfe_selector.grid_scores_))\n",
" rfe_rankings.append(rfe_selector.ranking_)\n",
" \n",
" #%%\n",
" idx_best_model = np.argmax(rfe_max_scores)\n",
" print(model_params['C'][idx_best_model])\n",
" print(rfe_max_scores)\n",
" \n",
" rfe_ranking = rfe_rankings[idx_best_model]\n",
" idx_sorted = np.argsort(rfe_ranking)\n",
" \n",
" if N_FEAT_RFE == 0:\n",
" X = X[:,idx_sorted]\n",
" features = features.iloc[:,idx_sorted]\n",
" else:\n",
" X = X[:,idx_sorted[:N_FEAT_RFE]]\n",
" features = features.iloc[:,idx_sorted[:N_FEAT_RFE]]\n",
"\n",
" print('Done RFE')\n",
" print(X.shape, features.shape)\n",
"\n",
"#%%\n",
"print(features.shape)\n",
"features.to_csv(f'{FEATDIR}/{OUTFILE}')\n",
"\n",
"print('features saved')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (dappertf)",
"language": "python",
"name": "dappertf"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:code id: tags:
``` python
%HN_env
```
%%%% Output: execute_result
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
import os
PATH = os.path.abspath(os.path.join(os.path.curdir,'../..'))
```
%% Cell type:code id: tags:
``` python
PATH
```
%%%% Output: execute_result
'/home/utente/bussola/networks_dami'
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import itertools
from sklearn.svm import SVC
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.feature_selection import RFECV, mutual_info_classif, f_classif, SelectKBest
from sklearn.metrics import matthews_corrcoef as mcc, make_scorer, auc, precision_score
def create_param_grid(params):
    """Enumerate every combination of the hyper-parameter values in ``params``.

    Parameters
    ----------
    params : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    dict
        Maps a running integer index (starting at 0) to one ``{name: value}``
        combination. Combinations follow ``itertools.product`` order, i.e. the
        last parameter varies fastest. An empty ``params`` yields ``{0: {}}``
        (a single combination with no overrides) instead of raising
        ``ValueError``.
    """
    keys = list(params)
    value_lists = [params[key] for key in keys]
    experiments = (
        dict(zip(keys, combo)) for combo in itertools.product(*value_lists)
    )
    return dict(enumerate(experiments))
```
%% Cell type:code id: tags:
``` python
# Input feature table (FEATDIR/FEATUREFILE), label column, experiment folder
# and the name of the CSV written at the end of the script.
FEATURE_NAME = 'features_noTx' #Tstage_binary_augmented_noTx_branch_wise_20191025-160304'
FEATUREFILE = f'{FEATURE_NAME}.csv'
LAB_COL = 'Locoregional'
EXP = 'LR_noTx_branch_wise_free_aug_20191027-003918'
OUTFILE = f'{FEATURE_NAME}_F_SVC_{LAB_COL}.csv'
#%%
# Switches enabling each stage of the selection pipeline below.
PREPROCESS = True
REMOVE_CORRELATED = True
UNIVARIATE_RANKING = True
RFE_RANKING = True
# Univariate stage: scoring function and number of top-scored features kept
# (0 keeps all features, only reordered by score).
UNIV_TESTER = f_classif
N_FEAT_UNIV = 1000
# RFE stage: estimator, hyper-parameter values expanded by create_param_grid,
# and number of best-ranked features kept (0 keeps all, reordered by rank).
model = SVC()
model_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']}
N_FEAT_RFE = 1000
#%%
# Features are read from (and the result written to) the experiment's
# features folder; labels come from the clinical data table.
FEATDIR = f'{PATH}/experiments/{EXP}/features/' #{PATH}/data/'
LABELFILE = f'{PATH}/data/clinical_data_noTx.csv'
#%%
# Load the feature table and the single label column; both CSVs use their
# first column as the index.
features = pd.read_csv(f'{FEATDIR}/{FEATUREFILE}', index_col=0)
labels = pd.read_csv(LABELFILE, index_col=0)[[LAB_COL]]

# Inner-join on the index so that only rows present in both tables remain,
# then split the label column back out of the merged table.
features = features.merge(labels, left_index=True, right_index=True)
labels = features.pop(LAB_COL).to_frame()

X, y = features.values, labels[LAB_COL].values
print(X.shape)
#%% PREPROCESS
if PREPROCESS:
    # Imputer is deprecated since scikit-learn 0.20 and removed in 0.22 (see
    # the DeprecationWarning it emits); SimpleImputer is the replacement named
    # by that warning.
    from sklearn.impute import SimpleImputer

    # Fill missing values using SimpleImputer's default strategy.
    imputer = SimpleImputer()
    X = imputer.fit_transform(X)

    # Standardise the features with StandardScaler's default settings.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    print('Done preprocessing')
    print(X.shape, features.shape)
#%% REMOVE CORRELATED
if REMOVE_CORRELATED:
    # Absolute correlation between every pair of feature columns. atleast_2d
    # covers the single-feature case, where corrcoef returns a scalar.
    corr_matrix = np.atleast_2d(np.abs(np.corrcoef(X, rowvar=False)))
    # Keep only the strict upper triangle, so column j holds its correlation
    # with the earlier features i < j. The previous mask used tril(k=1), which
    # also blanked the first super-diagonal and therefore never compared a
    # feature with its immediate predecessor; it also relied on the np.bool
    # alias, which newer NumPy releases no longer provide.
    corr_matrix = np.triu(corr_matrix, k=1)
    # Drop a feature when it is correlated above 0.95 with any earlier one.
    # NaN correlations (e.g. from constant columns) compare False, so those
    # features are dropped as well.
    idx_to_keep = (corr_matrix <= 0.95).all(axis=0)
    X = X[:, idx_to_keep]
    features = features.iloc[:, idx_to_keep]
    print('Remove correlated')
    print(X.shape, features.shape)
#%% UNIVARIATE RANKING
if UNIVARIATE_RANKING:
    # k='all' keeps every feature: the selector is used only for its scores.
    selector = SelectKBest(UNIV_TESTER, k='all')
    selector.fit(X, y)
    scores = selector.scores_
    # argsort places NaN scores last, so the reversal below would rank them
    # FIRST. Map NaN to -inf so such features end up at the bottom instead.
    # NOTE(review): NaN scores presumably come from constant features -- confirm.
    ranking_scores = np.where(np.isnan(scores), -np.inf, scores)
    idx_sorted = np.argsort(ranking_scores)[::-1]

    # N_FEAT_UNIV == 0 means "keep everything, only reorder by score".
    idx_selected = idx_sorted if N_FEAT_UNIV == 0 else idx_sorted[:N_FEAT_UNIV]
    X = X[:, idx_selected]
    features = features.iloc[:, idx_selected]

    print('Done univariate')
    print(X.shape, features.shape)
#%% RFE RANKING
if RFE_RANKING:
    param_grid = create_param_grid(model_params)

    rfe_max_scores = []
    rfe_rankings = []

    model_name = 'svc'
    # One cross-validated RFE run per hyper-parameter combination, visited in
    # grid-index order so the result lists line up with param_grid's keys.
    for _, params in sorted(param_grid.items()):
        model.set_params(**params)
        print(params)
        rfe_selector = RFECV(model, step=1, cv=5, n_jobs=32, scoring=make_scorer(mcc))
        rfe_selector = rfe_selector.fit(X, y)

        # NOTE(review): grid_scores_ is reported as removed from newer
        # scikit-learn releases in favour of cv_results_ -- confirm against
        # the installed version before upgrading.
        rfe_max_scores.append(np.max(rfe_selector.grid_scores_))
        rfe_rankings.append(rfe_selector.ranking_)

    #%%
    idx_best_model = np.argmax(rfe_max_scores)
    # Look the winning C up through the grid: indexing model_params['C']
    # directly is only right while C is the sole multi-valued parameter.
    print(param_grid[int(idx_best_model)]['C'])
    print(rfe_max_scores)

    rfe_ranking = rfe_rankings[idx_best_model]
    idx_sorted = np.argsort(rfe_ranking)

    # N_FEAT_RFE == 0 means "keep everything, only reorder by RFE rank".
    idx_selected = idx_sorted if N_FEAT_RFE == 0 else idx_sorted[:N_FEAT_RFE]
    X = X[:, idx_selected]
    features = features.iloc[:, idx_selected]

    print('Done RFE')
    print(X.shape, features.shape)
#%%
# Write the feature table to OUTFILE inside FEATDIR (the folder the input
# features were read from).
print(features.shape)
features.to_csv(f'{FEATDIR}/{OUTFILE}')
print('features saved')
```
%%%% Output: stream
(2445, 512)
Done preprocessing
(2445, 512) (2445, 512)
Remove correlated
(2445, 512) (2445, 512)
Done univariate
(2445, 512) (2445, 512)
{'C': 0.001, 'kernel': 'linear'}
%%%% Output: stream
/home/utente/anaconda3/envs/dappertf/lib/python3.6/site-packages/sklearn/utils/deprecation.py:66: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.
warnings.warn(msg, category=DeprecationWarning)
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment