Commit 456b7009 authored by Alessia Marcolini's avatar Alessia Marcolini

Use Path everywhere and update according to new dataset structure

parent 4b5d8d22
#!/usr/bin/env python
# coding: utf-8
#%% [markdown]
# ## Training network for feature extraction
# %%
import datetime
import gc
import os
import pickle
import sys
import time
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef as mcor
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from dataset import NumpyCSVDataset, augment_3D_HN
from networks import CiompiDO, ResNet50_3d
from split import train_test_indexes_patient_wise
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
PATH = Path(os.getcwd())
print(PATH)
#%%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multigpu = True
# %%
DATASET = 'HN_val'
BBOX_SUBDATASET = 'bbox_64'
DATASET_DIR = PATH / 'data' / DATASET / 'processed' / 'bbox' / BBOX_SUBDATASET
EXPERIMENT_DIR = PATH / 'experiment'
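# pathlib composes paths with the `/` operator, so DATASET_DIR above is equivalent to
# PATH.joinpath('data', DATASET, 'processed', 'bbox', BBOX_SUBDATASET) (illustrative).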
PRETRAINED_MED3D_WEIGHTS = PATH / 'pretrained_weights' / 'resnet_50.pth'
PRETRAINED_T_STAGE = EXPERIMENT_DIR / 'Tstage_4_noTx_CT_20191114-163418' / 'weights.pth'
# %%
### Settings
EXPERIMENT_NAME = "prova" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
settings = {
    "model": CiompiDO,
......@@ -94,15 +61,12 @@ settings = {
    "pretrained": "",
}
assert settings["split"] in ["valieres", "8020"]
assert settings["split"] in ["vallieres", "8020"]
assert not settings["splits"] == "vallieres" or DATASET == 'HN_val'
assert settings["pretrained"] in ["Med3D", "branch-wise", "T-stage", ""]
os.makedirs(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}", exist_ok=False)
# In[ ]:
os.makedirs(EXPERIMENT_DIR / EXPERIMENT_NAME, exist_ok=False)
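# exist_ok=False makes the run fail fast (FileExistsError) instead of silently
# reusing or overwriting the output folder of an earlier experiment with the same name.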
# %%
MODEL = settings["model"]
BATCH_SIZE = settings["batch_size"]
LR = settings["lr"]
......@@ -117,16 +81,15 @@ SIZE = settings["size"]
PRETRAINED = settings["pretrained"]
# %%
# ### Tensorboard settings
def new_run_log_dir(experiment_name):
    log_dir = PATH / "tb-runs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    run_log_dir = log_dir / experiment_name
    return run_log_dir
......@@ -134,152 +97,93 @@ log_dir = new_run_log_dir(EXPERIMENT_NAME)
print(f"Tensorboard folder: {log_dir}")
writer = SummaryWriter(log_dir)
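# Scalars logged through `writer` (e.g. writer.add_scalar('loss/train', value, step))
# can then be inspected with: tensorboard --logdir <PATH>/tb-runs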
# %%
# ### Data Handlers
clinical_file = PATH / 'data' / DATASET / 'processed' / f'clinical_{DATASET}.csv'
target_column = "T-stage_grouped"
# %%
np.random.seed(SEED)
dataset_train = NumpyCSVDataset(
    data_dir=DATASET_DIR,
    clinical_file=clinical_file,
    label_col=target_column,
    size=SIZE,
    mode='train',
    seed=SEED,
)
dataset_test = NumpyCSVDataset(
    data_dir=DATASET_DIR,
    clinical_file=clinical_file,
    label_col=target_column,
    size=SIZE,
    mode='test',
    seed=SEED,
)
# %%
# Create train-test datasets
if SPLIT == "vallieres":
    # in this particular case `dataset_train._files_full` and
    # `dataset_train.get_files()` return the same files
    idx_train = [
        i
        for i, f in enumerate(dataset_train.patients)
        if f.split("-")[1] in ["CHUS", "HGJ"]
    ]
    idx_test = [
        i
        for i, f in enumerate(dataset_test.patients)
        if f.split("-")[1] in ["HMR", "CHUM"]
    ]
else:
    idx_train, idx_test = train_test_indexes_patient_wise(
        dataset_train, test_size=K, stratify=True
    )

dataset_train.indices = np.array(idx_train)
dataset_test.indices = np.array(idx_test)
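# `train_test_indexes_patient_wise` (from split.py) is expected to split at the
# patient level, so no patient contributes samples to both sets (with optional
# stratification by label). A minimal sketch of the idea, assuming a
# `patient_of_sample` array mapping each sample index to its patient id
# (illustrative, not the actual implementation):
#
#   unique_patients = np.unique(patient_of_sample)
#   held_out = np.random.choice(unique_patients,
#                               size=int(len(unique_patients) * K), replace=False)
#   idx_test = np.flatnonzero(np.isin(patient_of_sample, held_out))
#   idx_train = np.flatnonzero(~np.isin(patient_of_sample, held_out))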
# %%
# Check class balance
labels_train = dataset_train.labels
labels_test = dataset_test.labels
c, n = np.unique(labels_train, return_counts=True)
print(np.c_[c, n / len(labels_train)])
c, n = np.unique(labels_test, return_counts=True)
print(np.c_[c, n / len(labels_test)])
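# Each print pairs the class values with their relative frequencies, e.g.
# (illustrative) a 45/55 split prints roughly [[0 0.45] [1 0.55]] -- useful for
# spotting imbalance before weighting the loss below.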
# %%
# Create loaders
loader_train = DataLoader(
    dataset_train, batch_size=BATCH_SIZE, num_workers=12, pin_memory=True, shuffle=True
)
loader_test = DataLoader(
    dataset_test, batch_size=BATCH_SIZE, num_workers=12, shuffle=False
)
# %%
# Compute weights
labels_train = dataset_train.labels
# class_sample_count = np.array([len(np.where( labels == t )[0]) for t in np.unique( labels )])
_, class_sample_count = np.unique(labels_train, return_counts=True)
n_min = np.min(class_sample_count)
weights = (
    n_min / class_sample_count
)  # proportional version: use n_min instead of 1 so the weights are ~1
weights = torch.Tensor(weights).to(device)
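# Worked example (illustrative): with class counts [30, 10, 20], n_min = 10 and
# weights = [10/30, 10/10, 10/20] = [0.333, 1.0, 0.5] -- the rarest class gets
# weight 1 and more frequent classes are down-weighted proportionally.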
# %%
# ### Initialize Model
model = MODEL(n_classes=N_CLASSES, n_channels=2, modality="CT/PET", dropout=DROPOUT)
if multigpu:
    model = nn.DataParallel(model.to(device))
    model = model.module
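# NOTE: unwrapping with `.module` right away gives back the plain model (handy for
# the per-layer weight copies below) but drops the DataParallel wrapper; to actually
# run forward() on multiple GPUs one would keep a reference to the wrapper, e.g.:
#   parallel_model = nn.DataParallel(model.to(device))
#   outputs = parallel_model(images)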
# %%
# model.initialize_weights()
if PRETRAINED == "Med3D":
pretrained_dict = torch.load(PRETRAINED_MED3D_WEIGHTS)["state_dict"]
......@@ -299,10 +203,14 @@ if PRETRAINED == "Med3D":
elif PRETRAINED == "branch-wise":
    pretrained_CT_dict = torch.load(
        EXPERIMENT_DIR
        / 'Tstage_grouped_noTx_CT_valieres_20191029-173736'
        / 'checkpoint_290.pth'
    )
    pretrained_PT_dict = torch.load(
        EXPERIMENT_DIR
        / 'Tstage_grouped_noTx_PET_valieres_20191029-195338'
        / 'checkpoint_290.pth'
    )
    model_dict = model.state_dict()
......@@ -351,42 +259,13 @@ elif PRETRAINED == "T-stage":
        # print(name)
        model.state_dict()[name].copy_(pretrained_dict[name])
# %%
# Optimizer and criterion
optimizer = OPTIMIZER(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(weight=weights)
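# With `weight=weights`, CrossEntropyLoss scales each sample's loss by the weight of
# its target class, so errors on rare classes contribute proportionally more to the
# gradient (see the proportional weights computed above).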
# In[ ]:
NEW_LABELS = list(range(len(list(np.unique(labels_train)))))
dictionary = dict(zip(list(np.unique(labels_train)), NEW_LABELS))
dictionary
# %%
# ### Train
model.train() # Set model to training mode
global_i = 0
......@@ -403,7 +282,7 @@ for epoch in range(EPOCHS):
    if epoch % 10 == 0:  # save checkpoint
        torch.save(
            model.state_dict(),
            EXPERIMENT_DIR / EXPERIMENT_NAME / f'checkpoint_{epoch}.pth',
        )

    for j, data in enumerate(loader_train):
......@@ -416,7 +295,7 @@ for epoch in range(EPOCHS):
        optimizer.zero_grad()

        images_tr = data["data"].to(device)
        labels_tr = torch.LongTensor(data["target"]).to(device)
        outputs_tr = model(images_tr).to(device)

        # backward
......@@ -436,9 +315,7 @@ for epoch in range(EPOCHS):
        for data_test in loader_test:
            images_ts = data_test["data"].to(device)
            labels_ts = torch.LongTensor(data_test["target"]).to(device)

            outputs_ts = model.forward(images_ts)
......@@ -453,6 +330,8 @@ for epoch in range(EPOCHS):
)
writer.flush()
# TODO: fix best model check
# is_best = loss_val_avg < last_loss_val
# if is_best:
# torch.save(model.state_dict(),
......@@ -484,12 +363,8 @@ for epoch in range(EPOCHS):
)
)
# %%
### Predict on Train
model.eval()
dataset_train.mode = "test" # no augmentation
......@@ -507,7 +382,7 @@ with torch.no_grad():
        preds_tr.append(pred.data.cpu().numpy())
        trues_tr.append(label)
        probs_tr.append(output.data.cpu().numpy())
        filenames_tr.append(data["filename"])
......@@ -532,11 +407,8 @@ train_metrics = [
]
# %%
# ### Predict on Test
model.eval()
preds_ts = []
......@@ -552,7 +424,7 @@ with torch.no_grad():
        _, pred = torch.max(output, 1)
        preds_ts.append(pred.data.cpu().numpy())
        trues_ts.append(label)
        probs_ts.append(output.data.cpu().numpy())
        filenames_ts.append(data["filename"])
......@@ -570,71 +442,48 @@ print("MCC test", round(MCC_ts, 3), "ACC test", round(ACC_ts, 3))
print("precision test", round(prec_ts, 3), "recall test", round(rec_ts, 3))
test_metrics = [round(MCC_ts, 3), round(ACC_ts, 3), round(prec_ts, 3), round(rec_ts, 3)]
# %%
# ## Save results
# Save settings
with open(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/settings.pkl", "wb") as f:
with open(EXPERIMENT_DIR / EXPERIMENT_NAME / 'settings.pkl', 'wb') as f:
pickle.dump(settings, f, pickle.HIGHEST_PROTOCOL)
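# NOTE: `settings` stores a class object (settings["model"] = CiompiDO), which pickle
# saves by reference, so loading this file later requires the `networks` module to be
# importable, e.g.:
#   with open(EXPERIMENT_DIR / EXPERIMENT_NAME / 'settings.pkl', 'rb') as f:
#       settings = pickle.load(f)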
# Save losses
losses_tr = np.array(losses_tr)
losses_vl = np.array(losses_ts)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/losses_tr.npy", losses_tr)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/losses_ts.npy", losses_vl)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'losses_tr.npy', losses_tr)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'losses_ts.npy', losses_vl)
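# The loss curves can be reloaded later with, e.g.:
#   losses_tr = np.load(EXPERIMENT_DIR / EXPERIMENT_NAME / 'losses_tr.npy')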
# %%
# Plot losses
plt.figure(figsize=(20, 10))
plt.plot(losses_tr, color="blue")
plt.plot(losses_ts, color="orange")
plt.legend(["train", "valid"])
plt.savefig(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/losses.png", close=True, verbose=True)
plt.savefig(EXPERIMENT_DIR / EXPERIMENT_NAME / 'losses.png', close=True, verbose=True)
plt.close()
# %%
# Save predictions, ground truth, probabilities and filenames
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/preds_tr.npy", preds_tr)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/trues_tr.npy", trues_tr)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/probs_tr.npy", probs_tr)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/filenames_tr.npy", filenames_tr)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/preds_ts.npy", preds_ts)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/trues_ts.npy", trues_ts)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/probs_ts.npy", probs_ts)
np.save(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/filenames_ts.npy", filenames_ts)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'preds_tr.npy', preds_tr)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'trues_tr.npy', trues_tr)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'probs_tr.npy', probs_tr)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'filenames_tr.npy', filenames_tr)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'preds_ts.npy', preds_ts)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'trues_ts.npy', trues_ts)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'probs_ts.npy', probs_ts)
np.save(EXPERIMENT_DIR / EXPERIMENT_NAME / 'filenames_ts.npy', filenames_ts)
# %%
# Save metrics
metrics_out = pd.DataFrame(
    (train_metrics, test_metrics),
    columns=["MCC", "ACC", "prec", "rec"],
    index=["train", "test"],
)
metrics_out.to_csv(f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/metrics_out.csv")
metrics_out.to_csv(EXPERIMENT_DIR / EXPERIMENT_NAME / 'metrics_out.csv')
# Save model weights
torch.save(model.state_dict(), f"{EXPERIMENT_DIR}/{EXPERIMENT_NAME}/weights.pth")
torch.save(model.state_dict(), EXPERIMENT_DIR / EXPERIMENT_NAME / 'weights.pth')
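# To restore the trained weights later (illustrative):
#   model = MODEL(n_classes=N_CLASSES, n_channels=2, modality="CT/PET", dropout=DROPOUT)
#   model.load_state_dict(torch.load(EXPERIMENT_DIR / EXPERIMENT_NAME / 'weights.pth'))
#   model.eval()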