Commit 608db4ad authored by Nicole Bussola's avatar Nicole Bussola Committed by Alessia Marcolini
Browse files

implement patient-wise splitting protocol for augmented dataset

parent e751d783
%% Cell type:markdown id: tags:
### NOTEBOOK INFO
Train on T-stage
Train/Test split: 80/20
## Training network for feature extraction
%% Cell type:markdown id: tags:
### Set Path
%% Cell type:code id: tags:
``` python
# IPython magics: auto-reload edited project modules without restarting the kernel.
%reload_ext autoreload
%autoreload 2
import os
# Notebook working directory; used below to build experiment and log paths.
PATH = os.getcwd()
print(PATH)
```
%% Output
/home/utente/bussola/networks_dami
%% Cell type:markdown id: tags:
### Import packages
%% Cell type:code id: tags:
``` python
import datetime
import gc
import pickle
import sys
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import matthews_corrcoef as mcor, accuracy_score as acc, recall_score as recall, precision_score as precision, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from networks import CiompiDO, ResNet50_3d
from dataset import NumpyCSVDataset, augment_3D_HN
from split import train_test_indexes_patient_wise
```
%% Cell type:code id: tags:
``` python
# Optionally pin specific GPUs before CUDA initialises.
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
#os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
# Fall back to CPU when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# When True, the model is wrapped in nn.DataParallel further below.
multigpu = True
```
%% Cell type:code id: tags:
``` python
# Pre-cropped 64**3 bounding-box volumes with offline augmentation already applied.
# (The earlier duplicate assignment to the non-augmented bbox_fixed2_64 folder was
# dead code — it was immediately overwritten — and its comment did not match this
# directory; both removed. The f-prefixes carried no placeholders.)
DATASET_DIR = "/thunderdisk/HN/processed/bbox_64_augmented/"
# Per-experiment outputs (weights, metrics, plots) are written under here.
EXPERIMENT_DIR = f"{PATH}/experiments"
# Med3D (MedicalNet) ResNet-50 weights, used when settings['pretrained'] == 'Med3D'.
PRETRAINED_MED3D_WEIGHTS = '/thunderdisk/HN/MedicalNet_pytorch_files/pretrain/resnet_50.pth'
```
%% Cell type:markdown id: tags:
### Settings
%% Cell type:code id: tags:
``` python
# Tag every run with a timestamp so repeated runs never collide on disk.
EXPERIMENT_NAME = 'prova' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Experiment configuration. The original cell contained duplicated 'model' and
# 'pretrained' entries (and the line `'pretrained': True` was missing its
# trailing comma, a syntax error); the later, intended values are kept.
settings = {
    'model': CiompiDO,
    'batch_size': 16,
    'lr': 1e-4,
    'epochs': 1,
    'optim': torch.optim.Adam,
    'K': 0.2,                    # held-out test fraction
    'n_classes': 2,              # TSTAGE: binary T-stage target
    'seed': 1234,
    'dropout': 0.5,
    'size': 64,                  # cubic input side length (voxels)
    'pretrained': 'branch-wise',
}
# '' disables pretraining; 'Med3D' loads MedicalNet weights;
# 'branch-wise' loads per-modality (CT / PET) branch weights.
assert settings['pretrained'] in ['Med3D', 'branch-wise', '']

# exist_ok=False: fail loudly instead of overwriting a previous experiment.
os.makedirs(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}', exist_ok=False)
```
%% Cell type:code id: tags:
``` python
# Unpack the experiment configuration into module-level constants, one per key.
(MODEL, BATCH_SIZE, LR, EPOCHS, OPTIMIZER, K,
 N_CLASSES, SEED, DROPOUT, SIZE, PRETRAINED) = (
    settings[key]
    for key in ('model', 'batch_size', 'lr', 'epochs', 'optim', 'K',
                'n_classes', 'seed', 'dropout', 'size', 'pretrained')
)
```
%% Cell type:markdown id: tags:
### Tensorboard settings
%% Cell type:code id: tags:
``` python
def new_run_log_dir(experiment_name):
    """Return a per-experiment TensorBoard run directory under PATH/tb-runs.

    Creates the parent 'tb-runs' folder if needed; the run folder itself is
    created lazily by SummaryWriter.
    """
    log_dir = os.path.join(PATH, 'tb-runs')
    # exist_ok=True replaces the racy os.path.exists()-then-makedirs check
    # (TOCTOU: another process could create the dir between test and create).
    os.makedirs(log_dir, exist_ok=True)
    return os.path.join(log_dir, experiment_name)

log_dir = new_run_log_dir(EXPERIMENT_NAME)
print(f'Tensorboard folder: {log_dir}')
writer = SummaryWriter(log_dir)
```
%% Output
Tensorboard folder: /home/utente/bussola/networks_dami/tb-runs/prova20191022-182638
%% Cell type:markdown id: tags:
### Data Handlers
%% Cell type:markdown id: tags:
Train-Test split 80/20 indexes (patient-wise, stratified)
%% Cell type:code id: tags:
``` python
# Per-patient clinical metadata CSV; 'noTx' suggests Tx-stage rows were already
# removed upstream — TODO confirm against the data-prep script.
clinical_data = f'{PATH}/data/clinical_data_noTx.csv'
# Column holding the binary T-stage label to predict.
target_column = 'T-stage_binary'
```
%% Cell type:code id: tags:
``` python
np.random.seed(SEED)

# Patient-wise, stratified train/test split: all augmented crops of a patient
# land on the same side, so no patient leaks between train and test.
# The previous sample-wise StratifiedShuffleSplit block was dead code — its
# indices were immediately overwritten by this call — and has been removed.
dataset = NumpyCSVDataset(DATASET_DIR, clinical_data, target_column, SIZE, mode='train')
# test_size uses K (settings['K'] == 0.2) for consistency with the config.
idx_train, idx_test = train_test_indexes_patient_wise(dataset, test_size=K, seed=SEED, stratify=True)
```
%% Cell type:markdown id: tags:
Create train-test datasets
%% Cell type:code id: tags:
``` python
# Each dataset was constructed twice with identical arguments (the first
# instance was discarded); the duplicates are removed.
# mode='test' disables random augmentation at item-fetch time.
dataset_test = NumpyCSVDataset(DATASET_DIR, clinical_data, target_column, SIZE, mode='test', augmentation_function=augment_3D_HN)
dataset_test._indexes = idx_test

# Train set keeps on-the-fly augmentation (mode='train').
dataset_train = NumpyCSVDataset(DATASET_DIR, clinical_data, target_column, SIZE, mode='train', augmentation_function=augment_3D_HN)
dataset_train._indexes = idx_train
```
%% Cell type:markdown id: tags:
Check class balance
%% Cell type:code id: tags:
``` python
# Print the per-class frequency on each split (test first, then train) to
# verify that stratification preserved the class balance.
labels_test = dataset_test.get_labels()
labels_train = dataset_train.get_labels()
for split_labels in (labels_test, labels_train):
    classes, counts = np.unique(split_labels, return_counts=True)
    print(np.c_[classes, counts / len(split_labels)])
```
%% Cell type:markdown id: tags:
Create loaders
%% Cell type:code id: tags:
``` python
# Test batches are half-size since eval also holds training activations in GPU RAM.
# NOTE(review): shuffle=True on the test loader only randomises batch order for
# the in-training loss estimate; the final metrics iterate the dataset directly.
loader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE//2, num_workers=12, shuffle=True)
loader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, num_workers=12, pin_memory=True, shuffle=True)
```
%% Cell type:markdown id: tags:
Compute weights
%% Cell type:code id: tags:
``` python
# Inverse-frequency class weights for the loss: the rarest class gets weight
# 1.0 and more frequent classes proportionally less (n_min / n_class).
labels = dataset_train.get_labels()
_, counts_per_class = np.unique(labels, return_counts=True)
proportional_weights = counts_per_class.min() / counts_per_class
weights = torch.Tensor(proportional_weights).to(device)
```
%% Cell type:markdown id: tags:
### Initialize Model
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
Model
``` python
model = MODEL(n_classes=N_CLASSES, dropout=DROPOUT)
if multigpu:
    model = nn.DataParallel(model.to(device))
    # NOTE(review): taking .module immediately unwraps the DataParallel again,
    # so the wrapper is discarded and this would train single-GPU — confirm
    # intent. (Superseded anyway: the next cell rebuilds `model` from scratch.)
    model = model.module
```
%% Cell type:code id: tags:
``` python
# Build the dual-branch model, wrap it for multi-GPU, then optionally load
# pretrained weights: either Med3D whole-network weights or per-modality
# branch weights from two earlier single-modality runs.
model = MODEL(n_classes=N_CLASSES, dropout=DROPOUT)
model = nn.DataParallel(model.to(device))
#print(model.state_dict()['module.conv1.weight'][1,0,0,0])

# NOTE(review): `model` is a DataParallel here, and initialize_weights() /
# CT_branch / PT_branch below belong to the wrapped module — verify these
# attributes are reachable through the wrapper (plain nn.DataParallel does
# not forward arbitrary attributes of the inner module).
model.initialize_weights()
#model.initialize_weights()

if PRETRAINED:
    if PRETRAINED == 'Med3D':
        pretrained_dict = torch.load(PRETRAINED_MED3D_WEIGHTS)['state_dict']
        model_dict = model.state_dict()
        # discard layers not present in destination network or with different shape
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if
                           (k in model_dict) and (model_dict[k].shape == pretrained_dict[k].shape)}
        # Copy tensors in place through the state_dict views.
        for name in model.state_dict().keys():
            if name in pretrained_dict.keys():
                #print(name)
                model.state_dict()[name].copy_(pretrained_dict[name])
        #print(model.state_dict()['module.conv1.weight'][1,0,0,0])
    elif PRETRAINED == 'branch-wise':
        # Checkpoints from earlier CT-only and PET-only training runs.
        pretrained_CT_dict = torch.load(f'{EXPERIMENT_DIR}/Tstage_grouped_noTx_CT_20191021-143133/weights.pth')
        pretrained_PT_dict = torch.load(f'{EXPERIMENT_DIR}/Tstage_binary_PET_noTx_20191022-124046/weights.pth')
        model_dict = model.state_dict()
        # Keep only tensors that exist in this model with an identical shape.
        pretrained_CT_dict = {k: v for k, v in pretrained_CT_dict.items() if
                              (k in model_dict) and (model_dict[k].shape == pretrained_CT_dict[k].shape)}
        pretrained_PT_dict = {k: v for k, v in pretrained_PT_dict.items() if
                              (k in model_dict) and (model_dict[k].shape == pretrained_PT_dict[k].shape)}
        # Checkpoint keys carry a 'module.' prefix when multigpu — presumably
        # because they were saved from DataParallel-wrapped models; verify.
        to_add = 'module.' if multigpu else ''
        for name in model.CT_branch.state_dict().keys():
            name_complete = to_add + 'CT_branch.' + name
            #print(name_complete)
            if name_complete in pretrained_CT_dict.keys():
                print(name)
                model.CT_branch.state_dict()[name].copy_(pretrained_CT_dict[name_complete])
        for name in model.PT_branch.state_dict().keys():
            name_complete = to_add + 'PT_branch.' + name
            #print(name_complete)
            if name_complete in pretrained_PT_dict.keys():
                print(name)
                model.PT_branch.state_dict()[name].copy_(pretrained_PT_dict[name_complete])
```
%% Cell type:markdown id: tags:
Optimizer
%% Cell type:code id: tags:
``` python
# Use the optimizer class chosen in `settings` (was hard-coded to Adam, which
# silently ignored the 'optim' entry / OPTIMIZER constant).
optimizer = OPTIMIZER(model.parameters(), lr=LR)
```
%% Cell type:code id: tags:
``` python
#[x.shape for x in model.parameters()]
```
%% Cell type:markdown id: tags:
Loss
%% Cell type:code id: tags:
``` python
# Class-weighted cross-entropy to counter the train-set class imbalance.
criterion = nn.CrossEntropyLoss(weight=weights)
```
%% Cell type:code id: tags:
``` python
# Map the original (possibly non-contiguous) label values to contiguous
# 0..C-1 class indices, as required by CrossEntropyLoss.
dictionary = {original: index
              for index, original in enumerate(np.unique(labels_train))}
dictionary
```
%% Cell type:markdown id: tags:
### Train
%% Cell type:code id: tags:
``` python
# Main training loop: optimise on loader_train, periodically evaluate the
# average loss on loader_test, and stream both curves to TensorBoard.
model.train() # Set model to training mode
global_i = 0          # global step counter for TensorBoard x-axis
losses_tr = []        # train loss recorded at each evaluation point
losses_ts = []        # matching test-set average loss
last_loss_test = -1   # sentinel until the first evaluation happens
iteration = 0
start_time = time.time()
for epoch in range(EPOCHS):
    #print(epoch)
    if epoch % 10 == 0: #save checkpoint
        torch.save(model.state_dict(), f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/checkpoint_{epoch}.pth')
    for j, data in enumerate(loader_train):
        global_i += 1
        # Crude throughput gauge: elapsed wall time every 10 batches.
        if j%10 == 0:
            print(time.time() - start_time)
            start_time = time.time()
        optimizer.zero_grad()
        images_tr = data['data'].to(device)
        # Remap raw label values to 0..C-1 indices via `dictionary`.
        labels_tr = torch.LongTensor([dictionary[i] for i in data['target']]).to(device)
        outputs_tr = model(images_tr).to(device)
        # backward
        loss = criterion(outputs_tr, labels_tr)
        loss.backward()
        optimizer.step()
        # check test set twice per epoch (mid-epoch and end of epoch)
        # NOTE(review): int(len(loader_train) / 2) is 0 when the train loader
        # yields a single batch, making the modulo raise ZeroDivisionError —
        # confirm the loader always has at least two batches.
        if j % int(len(loader_train) / 2) == 0 and j != 0:
            model.eval()
            with torch.no_grad():
                losses_sum = 0
                num_samples_test = 0
                for data_test in loader_test:
                    images_ts = data_test['data'].to(device)
                    labels_ts = torch.LongTensor([dictionary[i] for i in data_test['target']]).to(device)
                    outputs_ts = model.forward(images_ts)
                    loss_test_sum = criterion(outputs_ts, labels_ts).item()
                    losses_sum += loss_test_sum
                    num_samples_test += 1
                # Average test loss over batches (not individual samples).
                loss_test_avg = losses_sum / num_samples_test
                writer.add_scalar(f'{EXPERIMENT_NAME}/test_loss', loss_test_avg, global_i)
                writer.flush()
                # Best-checkpoint saving is currently disabled.
                #is_best = loss_val_avg < last_loss_val
                #if is_best:
                #    torch.save(model.state_dict(),
                #               f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/checkpoint_best_{epoch}.pth')
                last_loss_test = loss_test_avg
                losses_tr.append(loss.item())
                losses_ts.append(loss_test_avg)
                del images_ts, labels_ts
            iteration += 1
        # Free per-batch GPU references before the next iteration.
        del images_tr, labels_tr
        gc.collect()
        model.train()
        # sys.stdout.write
        writer.add_scalar(f'{EXPERIMENT_NAME}/train_loss', loss.item(), global_i)
        writer.flush()
        sys.stdout.write('\r Epoch {} of {} [{:.2f}%] - loss TR/TS: {:.4f} / {:.4f} - {}'.format(
            epoch + 1, EPOCHS, 100 * j / len(loader_train), loss.item(),
            last_loss_test, optimizer.param_groups[0]['lr']))
```
%% Cell type:markdown id: tags:
### Predict on Train
%% Cell type:code id: tags:
``` python
# Re-score the full training split with augmentation switched off.
model.eval()
dataset_train.mode = 'test' #no augmentation

preds_tr = []
trues_tr = []
probs_tr = []
filenames_tr = []

with torch.no_grad():
    for sample in dataset_train:
        volume = sample["data"].unsqueeze(0).to(device)
        logits = model(volume)
        # Predicted class = index of the largest logit.
        preds_tr.append(logits.argmax(dim=1).cpu().numpy())
        # Ground truth remapped through the same label dictionary as training.
        trues_tr.append(dictionary[sample["target"]])
        probs_tr.append(logits.cpu().numpy())
        filenames_tr.append(sample['sample'])

probs_tr = np.concatenate(probs_tr)
preds_tr = np.concatenate(preds_tr)
trues_tr = np.array(trues_tr)
filenames_tr = np.array(filenames_tr)

# Summary metrics on the training split (weighted precision/recall).
MCC_tr = mcor(trues_tr, preds_tr)
ACC_tr = acc(trues_tr, preds_tr)
prec_tr = precision(trues_tr, preds_tr, average='weighted')
rec_tr = recall(trues_tr, preds_tr, average='weighted')

print("MCC train", MCC_tr, "ACC train", ACC_tr)
print("precision train", prec_tr, "recall train", rec_tr )
train_metrics = np.array([MCC_tr, ACC_tr, prec_tr, rec_tr])
```
%% Cell type:markdown id: tags:
### Predict on Test
%% Cell type:code id: tags:
``` python
# Final evaluation on the held-out test split.
model.eval()

preds_ts = []
trues_ts = []
probs_ts = []
filenames_ts = []

with torch.no_grad():
    for sample in dataset_test:
        volume = sample["data"].unsqueeze(0).to(device)
        logits = model(volume)
        # Predicted class = index of the largest logit.
        preds_ts.append(logits.argmax(dim=1).cpu().numpy())
        trues_ts.append(dictionary[sample["target"]])
        probs_ts.append(logits.cpu().numpy())
        filenames_ts.append(sample['sample'])

probs_ts = np.concatenate(probs_ts)
preds_ts = np.concatenate(preds_ts)
trues_ts = np.array(trues_ts)
filenames_ts = np.array(filenames_ts)

# Summary metrics on the test split (weighted precision/recall).
MCC_ts = mcor(trues_ts, preds_ts)
ACC_ts = acc(trues_ts, preds_ts)
prec_ts = precision(trues_ts, preds_ts, average='weighted')
rec_ts = recall(trues_ts, preds_ts, average='weighted')

print("MCC test", MCC_ts, "ACC test", ACC_ts)
print("precision test", prec_ts, "recall test", rec_ts )
test_metrics = np.array([MCC_ts, ACC_ts, prec_ts, rec_ts])
```
%% Cell type:markdown id: tags:
## Save results
%% Cell type:markdown id: tags:
Save settings
%% Cell type:code id: tags:
``` python
# Persist the experiment configuration alongside its outputs for reproducibility.
with open(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/settings.pkl', 'wb') as f:
    pickle.dump(settings, f, pickle.HIGHEST_PROTOCOL)
```
%% Cell type:markdown id: tags:
Save losses
%% Cell type:code id: tags:
``` python
# Convert the recorded loss histories to arrays and save them.
# The intermediate `losses_vl` name was a leftover from a validation split and
# broke the _tr/_ts naming convention used everywhere else; renamed to match.
losses_tr = np.array(losses_tr)
losses_ts = np.array(losses_ts)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/losses_tr.npy', losses_tr)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/losses_ts.npy', losses_ts)
```
%% Cell type:markdown id: tags:
Plot losses
%% Cell type:code id: tags:
``` python
# Plot train vs test loss curves and save to the experiment folder.
# (Legend label 'valid' is kept as-is to match historical runs.)
plt.figure(figsize=(20,10))
plt.plot(losses_tr, color='blue')
plt.plot(losses_ts, color='orange')
plt.legend(['train', 'valid'])
# savefig() has no `close`/`verbose` keyword arguments — they were at best
# silently ignored and raise on recent matplotlib; the explicit plt.close()
# below already releases the figure.
plt.savefig(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/losses.png')
plt.close()
```
%% Cell type:markdown id: tags:
Save predictions, ground truth, probabilities and filenames
%% Cell type:code id: tags:
``` python
# Persist raw predictions, ground truth, class scores and sample filenames for
# both splits so metrics can be recomputed offline.
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/preds_tr.npy', preds_tr)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/trues_tr.npy', trues_tr)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/probs_tr.npy', probs_tr)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/filenames_tr.npy', filenames_tr)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/preds_ts.npy', preds_ts)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/trues_ts.npy', trues_ts)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/probs_ts.npy', probs_ts)
np.save(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/filenames_ts.npy', filenames_ts)
```
%% Cell type:markdown id: tags:
Save metrics
%% Cell type:code id: tags:
``` python
# Row 0 = train metrics, row 1 = test metrics; columns: MCC, ACC, precision, recall.
metrics_out = np.stack([train_metrics, test_metrics], 0)
np.savetxt(f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/metrics_out.txt', metrics_out)
```
%% Cell type:markdown id: tags:
Save model weights
%% Cell type:code id: tags:
``` python
# NOTE(review): `model` is DataParallel-wrapped here, so the saved keys carry a
# 'module.' prefix — loaders (e.g. the branch-wise cell above) must handle it.
torch.save(model.state_dict(), f'{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/weights.pth')
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment