Commit 0bd7b229 authored by Alessia Marcolini

Remove unused files

parent 8a2ddba5
%% Cell type:code id: tags:
``` python
import os
PATH = os.getcwd()
import sys
import numpy as np
import pandas as pd
import SimpleITK as sitk
from dicom_utils import augmentation as aug, processing as dup
from dataset import NumpyCSVDataset
```
%% Cell type:code id: tags:
``` python
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
```
%% Cell type:code id: tags:
``` python
SIZE = 64
DATASETDIR = f"{PATH}/data/processed/bbox_fixed2_64_TRAIN"
# FIRST SOLUTION
#OUTPUT_DIR = f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned'
#os.makedirs(f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned', exist_ok=False)
# SECOND SOLUTION
OUTPUT_DIR = f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_buildingclones'
#os.makedirs(f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_buildingclones', exist_ok=False)
dataset = NumpyCSVDataset(DATASETDIR, f"{PATH}/data/labels.csv", "Locoregional", SIZE, mode="test")
```
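%% Cell type:markdown id: tags:
A quick way to inspect what the dataset returns (a small sketch; the sample layout is described in the comments below):
%% Cell type:code id: tags:
``` python
# peek at one sample: subject name, 3D image shape, and label
sample = dataset[0]
print(sample['sample'], sample['data'].shape, sample['target'])
```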
%% Cell type:markdown id: tags:
### Solution
%% Cell type:code id: tags:
``` python
# The idea is to copy the negative files only once, and copy the "positive" files several times,
# so as to create multiple clones of the positive files.
# To do this:
# 1. when you find a positive file, repeat the cloning operation K times,
#    where K is the imbalance ratio n_NEGATIVES / n_POSITIVES
# 2. append a suffix to the subject name: if you create many copies of the same file
#    the names must differ, otherwise you overwrite and are back where you started
# Mind the difference between:
# > sample: a dictionary-like object returned by NumpyCSVDataset.
#   It contains: sample['sample']: the subject name
#                sample['data']: the 3D numpy array holding the 3D image, i.e. the grey value of each pixel
#                sample['target']: the label (0 or 1)
# > file XXX.npy: a file storing the 3D image (= sample['data']) in binary format
# > dataset: a NumpyCSVDataset object: essentially a list of dictionaries (see sample), one per file in DATASETDIR
#   In practice, the NumpyCSVDataset class takes care of loading the .npy files from DATASETDIR,
#   inferring the subject name (from the file name),
#   looking up the subject's label (in the labels.csv file),
#   and bundling everything into a dictionary that ends up in dataset
# FIND K:
labels = dataset._labels.values
idx_positive = np.where(labels==1)[0]
K = int((len(labels) - len(idx_positive))/len(idx_positive))
for i in range(len(dataset)):
    sample = dataset[i]
    SUB = sample['sample']
    print(SUB)
    image_orig = sample['data']
    label = sample['target']
    ratio = 1 if label == 0 else K  # if negative copy once (ratio=1), otherwise copy K times (ratio=K)
    # CREATE A COUNTER TO USE AS SUFFIX
    id_image = 0
    for j in range(ratio):
        # image_aug = augment_3D(image_orig, 'train', SIZE)  # skipped: we do not want augmentation here
        np.save(f'{OUTPUT_DIR}/{SUB}_{id_image}.npy', image_orig)  # suffix added to the file name
        id_image += 1
```
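%% Cell type:markdown id: tags:
As a rough sanity check (a sketch, assuming every `np.save` above succeeded), the cloned folder should contain the negatives once plus each positive K times:
%% Cell type:code id: tags:
``` python
# expected file count: n_negatives * 1 + n_positives * K
n_negative = len(labels) - len(idx_positive)
print(len(os.listdir(OUTPUT_DIR)), n_negative + K * len(idx_positive))
```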
%% Cell type:markdown id: tags:
## ================
%% Cell type:markdown id: tags:
FIRST SOLUTION
%% Cell type:markdown id: tags:
Copy each file from one folder to another
%% Cell type:code id: tags:
``` python
# this is wrong because 'sample' is a dictionary that contains, among other things, the numpy array with the image:
# image = sample['data'] is what you should save: np.save(f'{OUTPUT_DIR}/{subject}.npy', image)
for i in range(len(dataset)):
    sample = dataset[i]
    subject = sample['sample']
    np.save(f'{OUTPUT_DIR}/{subject}.npy', sample)
```
%% Cell type:markdown id: tags:
Clone only the positive samples (one extra copy each)
%% Cell type:code id: tags:
``` python
labels = dataset._labels.values
idx_positive = np.where(labels==1)[0]
for j in range(len(idx_positive)):
    sample = dataset[idx_positive[j]]
    image_orig = sample['data']
    subject = sample['sample']
    sample['target'] = int(sample['target'])
    # This one is correct
    np.save(f'{OUTPUT_DIR}/{subject}_cl3.npy', image_orig)
    # this one is wrong (same mistake as above):
    # since it uses the same file name, the fact that you saved the file correctly above becomes useless
    np.save(f'{OUTPUT_DIR}/{subject}_cl3.npy', sample)
```
%% Cell type:markdown id: tags:
SECOND SOLUTION
%% Cell type:markdown id: tags:
"building" new files
%% Cell type:code id: tags:
``` python
for i in range(len(dataset)):
    it = list(dataset[i].items())
    sample = dataset[i]
    subject = sample['sample']
    data = it[0]
    target = it[1]
    sample = it[2]
    dic = {data, target, sample}  # note: this builds a set of (key, value) tuples, not a dict
    # wrong: you are saving a dictionary-like object, as above
    np.save(f'{OUTPUT_DIR}/{subject}.npy', dic)
```
%% Cell type:markdown id: tags:
CHECK SOLUTIONS
%% Cell type:code id: tags:
``` python
#newdataset = NumpyCSVDataset( f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned', f"{PATH}/data/labels.csv" , "Locoregional", SIZE , mode="test")
newdataset = NumpyCSVDataset( f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_buildingclones', f"{PATH}/data/labels.csv" , "Locoregional", SIZE , mode="test")
```
%% Cell type:code id: tags:
``` python
idx_positive[0]  # 5
dataset[5]['target']
```
%% Cell type:code id: tags:
``` python
newdataset[ 5 ]['target']
```
%%%% Output: error
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-9-1c6cfa41156c> in <module>()
----> 1 newdataset[ 5 ]['target']
~/projects/networks_dami/dataset.py in __getitem__(self, idx, no_data)
46 data = np.load(data_file)
47
---> 48 data = self.augmentation(data, self.mode, self.size) #qst è numpy
49 data = torch.Tensor(data) #qst è tensor - la network vuole tensor
50 output = {'data': data, 'target': label, 'sample': sample}
~/projects/networks_dami/dataset.py in augment_3D(image, mode, size)
8
9 def augment_3D(image, mode, size):
---> 10 N_CHANNELS = image.shape[0]
11 image_seq = [sitk.GetImageFromArray(image[i,:,:,:]) for i in range(N_CHANNELS)]
12
IndexError: tuple index out of range
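%% Cell type:markdown id: tags:
The IndexError above is consistent with the mistake flagged earlier: clones saved with `np.save(..., sample)` store the whole dictionary as a 0-d object array, so `image.shape` is an empty tuple and `image.shape[0]` fails.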
%% Cell type:code id: tags:
``` python
cloned = NumpyCSVDataset(f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned/bla',
                         f"{PATH}/data/labels.csv", "Locoregional", SIZE, mode="test")
```
%% Cell type:code id: tags:
``` python
cloned.size
```
%% Cell type:markdown id: tags:
FROM THE COMMAND LINE:
%% Cell type:code id: tags:
``` python
'''
dsalvalai@Starscream:~$ cp -r bbox_fixed2_64_TRAIN bbox_fixed2_64_TRAIN_cloned
dsalvalai@Starscream:~$ cp HN-CHUS-003.npy HN-CHUS-003_cl1.npy
'''
```
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score as acc, precision_score as prec, recall_score as rec, matthews_corrcoef as mcc
from sklearn.metrics import confusion_matrix
#%%
y = pd.read_csv('/home/bizzego/Downloads/TT/TT_merged_features_B_UNCORR_selected_SVMlinear/predictions_test.csv')
y_true = y['true'].values
y_pred = y['pred'].values
#%%
print(classification_report(y_true, y_pred))
print(acc(y_true, y_pred), mcc(y_true, y_pred))
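#%%
# confusion_matrix, prec and rec are imported above but unused; a minimal sketch
# printing them alongside the other metrics (rows = true class, cols = predicted):
print(confusion_matrix(y_true, y_pred))
print(prec(y_true, y_pred), rec(y_true, y_pred))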
# %%
from IPython import get_ipython
# %% [markdown]
# ## Deep features extraction
# %%
get_ipython().run_line_magic('HN_env', '')
# %%
import os
PATH = os.path.abspath(os.path.curdir)
# %%
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# %% [markdown]
# ### Import
# %%
import os
import sys
from tqdm import tqdm
import numpy as np
import pandas as pd
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from networks import CiompiDO
from dataset import NumpyCSVDataset, augment_3D_HN
# %%
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
multigpu = True
# %%
DATASETDIR = '/thunderdisk/HN/processed/bbox_64_augmented_LR'
EXPERIMENT_DIR = f'{PATH}/experiments'
# %%
MODEL_NAME = 'LR_noTx_branch_wise_free_aug_CT_20191027-124913'
SIZE = 64
OUTDIR = f'{EXPERIMENT_DIR}/{MODEL_NAME}/features/'
OUTFILE = 'features_noTx_AUG.csv'
os.makedirs(OUTDIR, exist_ok=True)
# %%
dataset = NumpyCSVDataset(DATASETDIR , f'{PATH}/data/clinical_data_noTx.csv' , 'Locoregional', SIZE , mode='test')
loader = DataLoader(dataset, batch_size=8, num_workers=12, pin_memory=True, shuffle=False, drop_last=False)
model_weights = f'{EXPERIMENT_DIR}/{MODEL_NAME}/weights.pth'
# %%
model = CiompiDO(n_classes=2, n_channels=1, modality='CT')
if multigpu:
    model = nn.DataParallel(model.to(device))
    model = model.module  # unwrap so the state_dict keys match the bare model

model.load_state_dict(torch.load(model_weights))
# %%
deep_features = []
sample_names = []
labels = []
patients = []
with torch.no_grad():
    for batch in tqdm(loader):
        names_batch = [name.split('.')[0] for name in batch['filename']]
        images_batch = batch['data'].to(device)
        labels_batch = batch['target']
        patients_batch = batch['patient']

        out = model.extract_features(images_batch.cuda())

        deep_features.append(out.data.cpu().numpy())
        sample_names.append(names_batch)
        labels.append(labels_batch)
        patients.append(patients_batch)
# %%
deep_features = np.concatenate(deep_features)
sample_names = np.concatenate(sample_names)
labels = np.concatenate(labels)
patients = np.concatenate(patients)
# %%
len(labels)
# %%
print(deep_features.shape, len(sample_names),len(labels))
# %%
deep_features_pd = pd.DataFrame(deep_features, index=sample_names)
deep_features_pd['label'] = labels
deep_features_pd['patient'] = patients
#%% SAVE
print(deep_features_pd.shape)
deep_features_pd.to_csv(f'{OUTDIR}/{OUTFILE}')
# %%
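# quick check (a sketch): read the saved features back and confirm the shape
check = pd.read_csv(f'{OUTDIR}/{OUTFILE}', index_col=0)
print(check.shape)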
%% Cell type:code id: tags:
``` python
import numpy as np
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```
%% Cell type:code id: tags:
``` python
n=torch.tensor([1]).to(device)
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
### Set Path
%% Cell type:code id: tags:
``` python
%reload_ext autoreload
%autoreload 2
#PATH = '/home/dsalvalai/projects/networks_dami'
import os
PATH = os.getcwd()
print(PATH)
```
%%%% Output: stream
/home/dsalvalai/projects/networks_dami
%% Cell type:markdown id: tags:
TRANSFER LEARNING EXPERIMENT RESPECTING COHORTS.
NO DATA AUGMENTATION AND NO CLONING OF POSITIVE SAMPLES.
TRAINING COHORTS: HGJ & CHUS
%% Cell type:markdown id: tags:
### Import packages
%% Cell type:code id: tags:
``` python
import sys
import torch
import pickle
from torch.utils.data import DataLoader
import torch.nn as nn
import numpy as np
import os
from networks import Ciompi
from dataset import NumpyCSVDataset
from sklearn.metrics import matthews_corrcoef as mcor, accuracy_score as acc, recall_score as recall, precision_score as precision
```
%% Cell type:code id: tags:
``` python
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
#os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```
%% Cell type:code id: tags:
``` python
DATASETDIR = f"{PATH}/data/processed/bbox_fixed2_64_TRAIN" # fine tuning
#DATASETDIR = f"{PATH}/data/processed/bbox_fixed2_64_TEST" # prediction
EXPERIMENT_DIR = f"{PATH}/experiments"
```
%% Cell type:markdown id: tags:
### Settings
%% Cell type:code id: tags:
``` python
EXPERIMENT_NAME = 'multiCto01'
settings = {
    'model': Ciompi,
    'batch_size': 32,
    'lr': 1e-4,
    'epochs': 100,
    'optim': torch.optim.Adam,
    'K': 0.25,  # fraction of samples held out for testing
    'n_classes': 2,
    'seed': 1234
}
#os.makedirs(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}', exist_ok=False)
```
%% Cell type:code id: tags:
``` python
MODEL = settings['model']
BATCH_SIZE = settings['batch_size']
LR = settings['lr']
EPOCHS = settings['epochs']
OPTIMIZER = settings['optim']
K = settings['K']
N_CLASSES = settings['n_classes']
SEED = settings['seed']
```
%% Cell type:markdown id: tags:
### Data Handlers
%% Cell type:markdown id: tags:
Train-Test split indexes
%% Cell type:code id: tags:
``` python
np.random.seed(SEED)
n_samples = len(os.listdir(DATASETDIR))
indexes = np.arange(n_samples)
np.random.shuffle(indexes)
k_idx = int(K*n_samples)
idx_test = indexes[:k_idx]
idx_train = indexes[k_idx:]
```
%% Cell type:markdown id: tags:
Create train-test datasets
%% Cell type:code id: tags:
``` python
dataset_test = NumpyCSVDataset(DATASETDIR, f"{PATH}/data/labels.csv", "Locoregional", 64, mode="test")
dataset_test._indexes = idx_test

dataset_train = NumpyCSVDataset(DATASETDIR, f"{PATH}/data/labels.csv", "Locoregional", 64, mode="train")
dataset_train._indexes = idx_train
```
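%% Cell type:markdown id: tags:
A small check (a sketch) that the split is a real partition: the two index sets should be disjoint and together cover all samples.
%% Cell type:code id: tags:
``` python
# train and test indexes must not overlap and must cover every sample
assert len(set(idx_train) & set(idx_test)) == 0
assert len(idx_train) + len(idx_test) == n_samples
```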
%% Cell type:markdown id: tags:
Create loaders
%% Cell type:code id: tags:
``` python
loader_test = DataLoader(dataset_test, batch_size=int(BATCH_SIZE/2), num_workers=1, shuffle=True)
loader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, num_workers=24, pin_memory=True, shuffle=True)
```
%% Cell type:markdown id: tags:
Compute weights
%% Cell type:code id: tags:
``` python
labels = dataset_train.get_labels()
weights = [1, (len(labels) - np.sum(labels)) / np.sum(labels)]  # or [1, 1] to disable reweighting
print(weights)
settings['weights'] = weights
weights = torch.Tensor(weights).to(device)
```
%%%% Output: stream
[1, 5.545454545454546]
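%% Cell type:markdown id: tags:
The weights penalize errors on the minority (positive) class more heavily. A minimal sketch of passing them to the loss (an assumption: the training loop itself is not shown in this notebook):
%% Cell type:code id: tags:
``` python
# class-weighted cross-entropy: positive-class mistakes cost ~5.5x more
criterion = nn.CrossEntropyLoss(weight=weights)
```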
%% Cell type:markdown id: tags:
### Initialize
%% Cell type:markdown id: tags:
Model
%% Cell type:code id: tags:
``` python
model = MODEL()

# load pretrained weights and adapt the classification head:
# the 'cohortsTstage-augm' checkpoint presumably has more output units,
# so keep only the first two rows of the final layer's weights/biases
state_dict = torch.load(f'{EXPERIMENT_DIR}/cohortsTstage-augm/weights.pth')
state_dict['linear.2.weight'] = state_dict['linear.2.weight'][0:2]
state_dict['linear.2.bias'] = state_dict['linear.2.bias'][0:2]

model.load_state_dict(state_dict)  # fine tuning
model.to(device)
```
%%%% Output: execute_result
Ciompi(
(CT_branch): Sequential(
(0): BatchNorm3d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): Conv3d(1, 32, kernel_size=(5, 5, 5), stride=(1, 1, 1))
(2): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): ReLU()
(4): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(5): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1))
(6): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(7): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1))
(8): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(9): ReLU()
(10): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(11): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
(12): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(13): ReLU()
(14): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(15): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
(16): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(17): ReLU()
(18): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(19): AdaptiveAvgPool3d(output_size=1)
)
(PT_branch): Sequential(
(0): BatchNorm3d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): Conv3d(1, 32, kernel_size=(5, 5, 5), stride=(1, 1, 1))
(2): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): ReLU()
(4): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(5): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1))
(6): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(7): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1))
(8): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(9): ReLU()
(10): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(11): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
(12): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)