Commit 456b7009 authored by Alessia Marcolini

Use Path everywhere and update according to new dataset structure

parent 4b5d8d22
#%% [markdown]
# ## Training network for feature extraction

# %%
import datetime
import gc
import os
import pickle
import sys
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    matthews_corrcoef as mcor,
    accuracy_score as acc,
    recall_score as recall,
    precision_score as precision,
    confusion_matrix,
)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from dataset import NumpyCSVDataset, augment_3D_HN
from networks import CiompiDO, ResNet50_3d
from split import train_test_indexes_patient_wise

PATH = Path(os.getcwd())
print(PATH)
#%%
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multigpu = True

# In[ ]:
DATASET = 'HN_val'
BBOX_SUBDATASET = 'bbox_64'
DATASET_DIR = PATH / 'data' / DATASET / 'processed' / 'bbox' / BBOX_SUBDATASET
EXPERIMENT_DIR = PATH / 'experiment'
PRETRAINED_MED3D_WEIGHTS = PATH / 'pretrained_weights' / 'resnet_50.pth'
PRETRAINED_T_STAGE = EXPERIMENT_DIR / 'Tstage_4_noTx_CT_20191114-163418' / 'weights.pth'
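# Assumed on-disk layout behind these paths (inferred from this script, not verified):
#   data/HN_val/processed/bbox/bbox_64/        cropped 64**3 volumes (likely .npy files)
#   data/HN_val/processed/clinical_HN_val.csv  clinical/label table (defined further down)
#   experiment/<run-name>/                     checkpoints, weights.pth, settings.pkl
#   pretrained_weights/resnet_50.pth           Med3D pretrained weights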
# %%
### Settings
EXPERIMENT_NAME = "prova" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
settings = {
    "model": CiompiDO,
@@ -94,15 +61,12 @@ settings = {
    "pretrained": "",
}

assert settings["split"] in ["vallieres", "8020"]
assert not settings["split"] == "vallieres" or DATASET == 'HN_val'
assert settings["pretrained"] in ["Med3D", "branch-wise", "T-stage", ""]

os.makedirs(EXPERIMENT_DIR / EXPERIMENT_NAME, exist_ok=False)
# %%
MODEL = settings["model"]
BATCH_SIZE = settings["batch_size"]
LR = settings["lr"]
@@ -117,16 +81,15 @@ SIZE = settings["size"]
PRETRAINED = settings["pretrained"]
# %%
# ### Tensorboard settings
def new_run_log_dir(experiment_name):
    log_dir = PATH / "tb-runs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    run_log_dir = log_dir / experiment_name
    return run_log_dir

@@ -134,152 +97,93 @@ log_dir = new_run_log_dir(EXPERIMENT_NAME)
print(f"Tensorboard folder: {log_dir}")
writer = SummaryWriter(log_dir)
# %%
# ### Data Handlers
clinical_file = PATH / 'data' / DATASET / 'processed' / f'clinical_{DATASET}.csv'
target_column = "T-stage_grouped"

# %%
np.random.seed(SEED)
dataset_train = NumpyCSVDataset(
    data_dir=DATASET_DIR,
    clinical_file=clinical_file,
    label_col=target_column,
    size=SIZE,
    mode='train',
    seed=SEED,
)
dataset_test = NumpyCSVDataset(
    data_dir=DATASET_DIR,
    clinical_file=clinical_file,
    label_col=target_column,
    size=SIZE,
    mode='test',
    seed=SEED,
)
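# NumpyCSVDataset (from dataset.py) is used here as follows: volumes are read from
# `data_dir`, labels come from `label_col` in `clinical_file`, and `mode` selects
# between augmented training samples and deterministic test samples (see
# `dataset_train.mode = "test"  # no augmentation` further down). This summary is
# inferred from the call sites in this script, not from dataset.py itself.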
# %%
# Create train-test datasets
if SPLIT == "vallieres":
    idx_train = [
        i
        for i, f in enumerate(dataset_train.patients)
        if f.split("-")[1] in ["CHUS", "HGJ"]
    ]
    idx_test = [
        i
        for i, f in enumerate(dataset_test.patients)
        if f.split("-")[1] in ["HMR", "CHUM"]
    ]
else:
    idx_train, idx_test = train_test_indexes_patient_wise(
        dataset_train, test_size=K, stratify=True
    )

dataset_train.indices = np.array(idx_train)
dataset_test.indices = np.array(idx_test)
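# Assumption: patient identifiers look like "HN-CHUS-001", so f.split("-")[1] yields the
# treatment centre code; the Vallieres split then trains on CHUS/HGJ patients and tests
# on HMR/CHUM patients. Example: "HN-HGJ-032".split("-")[1] == "HGJ".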
# %%
# Check class balance
labels_train = dataset_train.labels
labels_test = dataset_test.labels

c, n = np.unique(labels_train, return_counts=True)
print(np.c_[c, n / len(labels_train)])

c, n = np.unique(labels_test, return_counts=True)
print(np.c_[c, n / len(labels_test)])
# %%
# Create loaders
loader_train = DataLoader(
    dataset_train, batch_size=BATCH_SIZE, num_workers=12, pin_memory=True, shuffle=True
)
loader_test = DataLoader(
    dataset_test, batch_size=BATCH_SIZE, num_workers=12, shuffle=False
)
# %%
# Compute weights
labels_train = dataset_train.labels

# class_sample_count = np.array([len(np.where( labels == t )[0]) for t in np.unique( labels )])
_, class_sample_count = np.unique(labels_train, return_counts=True)
n_min = np.min(class_sample_count)
weights = (
    n_min / class_sample_count
)  # proportional version: using n_min instead of 1 keeps the weights close to 1
weights = torch.Tensor(weights).to(device)
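# Worked example (hypothetical counts): with class_sample_count = [20, 50, 80],
# n_min = 20 and weights = [1.0, 0.4, 0.25], so the rarest class keeps weight 1
# and more frequent classes are down-weighted proportionally.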
# %%
# ### Initialize Model
model = MODEL(n_classes=N_CLASSES, n_channels=2, modality="CT/PET", dropout=DROPOUT)
if multigpu:
    model = nn.DataParallel(model.to(device))
    model = model.module
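# Note: `.module` returns the network wrapped by DataParallel, so after this block
# `model` is the plain module already moved to `device`; the DataParallel wrapper
# itself is not kept.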
# %%
# model.initialize_weights()
if PRETRAINED == "Med3D":
    pretrained_dict = torch.load(PRETRAINED_MED3D_WEIGHTS)["state_dict"]
@@ -299,10 +203,14 @@ if PRETRAINED == "Med3D":
elif PRETRAINED == "branch-wise":
    pretrained_CT_dict = torch.load(
        EXPERIMENT_DIR
        / 'Tstage_grouped_noTx_CT_valieres_20191029-173736'
        / 'checkpoint_290.pth'
    )
    pretrained_PT_dict = torch.load(
        EXPERIMENT_DIR
        / 'Tstage_grouped_noTx_PET_valieres_20191029-195338'
        / 'checkpoint_290.pth'
    )

    model_dict = model.state_dict()
@@ -351,42 +259,13 @@ elif PRETRAINED == "T-stage":
        # print(name)
        model.state_dict()[name].copy_(pretrained_dict[name])
# %%
# Optimizer and criterion
optimizer = OPTIMIZER(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(weight=weights)
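# With `weight=weights`, CrossEntropyLoss rescales each sample's loss by the weight of
# its true class, so errors on under-represented classes count more; the weights were
# computed above as n_min / class_sample_count.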
# %%
# ### Train
model.train()  # Set model to training mode

global_i = 0
@@ -403,7 +282,7 @@ for epoch in range(EPOCHS):
    if epoch % 10 == 0:  # save checkpoint
        torch.save(
            model.state_dict(),
            EXPERIMENT_DIR / EXPERIMENT_NAME / f'checkpoint_{epoch}.pth',
        )

    for j, data in enumerate(loader_train):
@@ -416,7 +295,7 @@ for epoch in range(EPOCHS):
        optimizer.zero_grad()

        images_tr = data["data"].to(device)
        labels_tr = torch.LongTensor(data["target"]).to(device)
        outputs_tr = model(images_tr).to(device)

        # backward
@@ -436,9 +315,7 @@ for epoch in range(EPOCHS):
        for data_test in loader_test:
            images_ts = data_test["data"].to(device)
            labels_ts = torch.LongTensor(data_test["target"]).to(device)

            outputs_ts = model.forward(images_ts)
@@ -453,6 +330,8 @@ for epoch in range(EPOCHS):
            )
            writer.flush()
        # TODO: fix best model check
        # is_best = loss_val_avg < last_loss_val
        # if is_best:
        #     torch.save(model.state_dict(),
@@ -484,12 +363,8 @@ for epoch in range(EPOCHS):
        )
    )
# %%
### Predict on Train
model.eval()
dataset_train.mode = "test"  # no augmentation
@@ -507,7 +382,7 @@ with torch.no_grad():
        preds_tr.append(pred.data.cpu().numpy())
        # trues.append(label)
        trues_tr.append(label)
        probs_tr.append(output.data.cpu().numpy())
        filenames_tr.append(data["filename"])
@@ -532,11 +407,8 @@ train_metrics = [
]

# %%
# ### Predict on Test
model.eval()
preds_ts = []
@@ -552,7 +424,7 @@ with torch.no_grad():
        _, pred = torch.max(output, 1)
        preds_ts.append(pred.data.cpu().numpy())
        trues_ts.append(label)
        probs_ts.append(output.data.cpu().numpy())
        filenames_ts.append(data["filename"])
@@ -570,71 +442,48 @@ print("MCC test", round(MCC_ts, 3), "ACC test", round(ACC_ts, 3))
print("precision test", round(prec_ts, 3), "recall test", round(rec_ts, 3))
test_metrics = [round(MCC_ts, 3), round(ACC_ts, 3), round(prec_ts, 3), round(rec_ts, 3)]
# %%
# ## Save results

# Save settings
with open(EXPERIMENT_DIR / EXPERIMENT_NAME / 'settings.pkl', 'wb') as f:
    pickle.dump(settings, f, pickle.HIGHEST_PROTOCOL)

# Save losses
losses_tr = np.array(losses_tr)
losses_vl = np.array(losses_ts)