Commit 705f2690 authored by Nicole Bussola

full workflow with new data

parent be2e7160
@@ -3,16 +3,23 @@
__pycache__/
.spyproject/
.vscode
.venv
data
images*/
data/
experiments
augmented_plots/
experiments/
files/
dicom_utils/
NBIA_data_retriever_CLI
out*
err*
*.tar.gz
tb-runs/
tmp*/
*.csv
*.pkl
*.pth
*.deb
\ No newline at end of file
# %% [markdown]
# ## Augment dataset class-wise
# %%
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from dataset import NumpyCSVDataset, augment_3D_HN
import SimpleITK as sitk
from config import get_project_root
# %%
PROJECT_ROOT = get_project_root()
DATASET = 'HN_val'
......
import os
from config import get_project_root
import pandas as pd
PROJECT_ROOT = get_project_root()
DATASET_NAME = 'HN_val_QIN_HN1'
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
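# Each branch below prepares the clinical table for one dataset and writes the
# result to its `processed/` folder. QIN, HN1 and HN_val are reshaped to a shared
# schema (patient, Age, Death, locoregional, Primary Site, Sex, filename,
# T-stage_binary, T-stage_grouped, ROI_name, ROI_modality), T sub-stages 4a/4b
# are collapsed into '4' where present, and HN_val_QIN_HN1 concatenates the three
# harmonized tables.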
if DATASET_NAME == 'QIN':
clinical = pd.read_csv(PROCESSED_DATA_PATH / 'QIN clinical.csv')
for i in range(clinical.shape[0]):
if clinical.loc[i, ['Final T']].item() in ['4a', '4b']:
clinical.loc[i, ['Final T']] = '4'
series_data = pd.read_csv(RAW_DATA_PATH / 'tcia_original_metadata_QIN.csv')
df = pd.merge(clinical, series_data, left_on='PatientID', right_on='Subject ID')
df = pd.DataFrame(
df,
columns=[
'PatientID',
'Patient Birth Date',
'Study Date',
'Gender',
'Final Site',
'Final T',
'Date of Death',
'Cause of Death',
'Date of Recurrence',
'Location of First Recurrence',
],
dtype='str',
)
df.drop_duplicates(keep="first", inplace=True)
df.rename(
columns={
"PatientID": "patient",
'Final Site': 'Primary Site',
'Final T': 'T-stage_grouped',
'Gender': 'Sex',
},
inplace=True,
)
df['Age'] = ""
df['Death'] = ""
df['locoregional'] = ""
df['T-stage_binary'] = ""
df["filename"] = ""
df['ROI_name'] = "GTV-1"
df['ROI_modality'] = "CT"
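# Derive the harmonized fields row by row: Age as study year minus birth year,
# binary Death / locoregional flags from the presence of the corresponding dates,
# T-stage binarized at T3+, T-stage_grouped shifted to start at 0, and the
# expected `<patient>.npy` filename.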
for i, row in df.iterrows():
study_date = row['Study Date']
study_date = study_date.split('/')
study_date = study_date[2]
df.loc[i, 'Age'] = str(int(study_date) - int(row['Patient Birth Date']))
df.loc[i, 'Death'] = 1 if row['Date of Death'] != "nan" else 0
df.loc[i, 'locoregional'] = 1 if row['Date of Recurrence'] != "nan" else 0
df.loc[i, 'T-stage_binary'] = 1 if row['T-stage_grouped'] in ['3', '4'] else 0
df.loc[i, 'T-stage_grouped'] = (
str(int(df.loc[i, 'T-stage_grouped']) - 1)
if row['T-stage_grouped'] != '0'
else 0
)
patient = row['patient']
df.loc[i, 'filename'] = f'{patient}.npy'
df.drop(
columns=[
'Date of Death',
'Cause of Death',
'Date of Recurrence',
'Location of First Recurrence',
'Patient Birth Date',
'Study Date',
],
inplace=True,
)
df = df.reindex(
columns=[
'patient',
'Age',
'Death',
'locoregional',
'Primary Site',
'Sex',
'filename',
'T-stage_binary',
'T-stage_grouped',
'ROI_name',
'ROI_modality',
],
)
df.to_csv(PROCESSED_DATA_PATH / 'clinical_QIN.csv', index=False)
if DATASET_NAME == 'HN1':
clinical = pd.read_csv(PROCESSED_DATA_PATH / 'clinical data PET HN1.csv', sep=';')
series_data = pd.read_csv(RAW_DATA_PATH / 'tcia_original_metadata_HN1.csv')
df = pd.merge(clinical, series_data, left_on='id', right_on='Subject ID')
df = pd.DataFrame(
clinical,
columns=[
'id',
'age_at_diagnosis',
'event_overall_survival',
'event_locoregional_recurrence',
'index_tumour_location',
'biological_sex',
'clin_t',
],
dtype='str',
)
df.drop_duplicates(keep="first", inplace=True)
df.rename(
columns={
'id': 'patient',
'age_at_diagnosis': 'Age',
'event_overall_survival': 'Death',
'event_locoregional_recurrence': 'locoregional',
'index_tumour_location': 'Primary Site',
'biological_sex': 'Sex',
'clin_t': 'T-stage_grouped',
},
inplace=True,
)
df['T-stage_binary'] = ""
df["filename"] = ""
df['ROI_name'] = "GTV-1"
df['ROI_modality'] = "CT"
for i, row in df.iterrows():
df.loc[i, 'T-stage_binary'] = 1 if row['T-stage_grouped'] in ['3', '4'] else 0
df.loc[i, 'T-stage_grouped'] = (
str(int(df.loc[i, 'T-stage_grouped']) - 1)
if row['T-stage_grouped'] != '0'
else 0
)
patient = row['patient']
df.loc[i, 'filename'] = f'{patient}.npy'
df = df.reindex(
columns=[
'patient',
'Age',
'Death',
'locoregional',
'Primary Site',
'Sex',
'filename',
'T-stage_binary',
'T-stage_grouped',
'ROI_name',
'ROI_modality',
],
)
df.to_csv(PROCESSED_DATA_PATH / 'clinical_HN1_PROVA.csv', index=False)
if DATASET_NAME == 'HNSCC':
clinical = pd.read_csv(
PROCESSED_DATA_PATH / 'clinical HNSCC segmented.csv', sep=';'
)
for i in range(clinical.shape[0]):
if clinical.loc[i, ['T']].item() in ['4a', '4b']:
clinical.loc[i, ['T']] = '4'
# series_data = pd.read_csv(PROCESSED_DATA_PATH / 'series-data1620822702516.csv')
# df = pd.merge(clinical, series_data, left_on='PatientID', right_on='Subject ID')
# df.drop('Subject ID')
df = pd.DataFrame(
clinical,
columns=[
'TCIA code',
'Age',
'Sex',
'Site',
'T',
'Overall Survival Censor',
'Loco-regional Control Censor',
],
)
df.to_csv(PROCESSED_DATA_PATH / 'clinical_HNSCC.csv', index=False)
if DATASET_NAME == 'HN_val':
df = pd.read_csv(PROCESSED_DATA_PATH / 'clinical_HN_val.csv')
df = df.reindex(
columns=[
'patient',
'Age',
'Death',
'locoregional',
'Primary Site',
'Sex',
'filename',
'T-stage_binary',
'T-stage_grouped',
'ROI_name',
'ROI_modality',
],
)
df.to_csv(PROCESSED_DATA_PATH / 'clinical_HN_val_reduced.csv', index=False)
if DATASET_NAME == 'HN_val_QIN_HN1':
clinical_HN_val = pd.read_csv(
PROJECT_ROOT / 'data' / 'HN_val' / 'processed' / 'clinical_HN_val_reduced.csv'
)
clinical_QIN = pd.read_csv(
PROJECT_ROOT / 'data' / 'QIN' / 'processed' / 'clinical_QIN.csv'
)
clinical_HN1 = pd.read_csv(
PROJECT_ROOT / 'data' / 'HN1' / 'processed' / 'clinical_HN1.csv'
)
df = pd.concat([clinical_HN_val, clinical_QIN, clinical_HN1], ignore_index=True)
df['cohort'] = ''
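# Tag each row with its source cohort from the filename prefix; the generic 'HN'
# match assigns HN1 first and is then overridden by the more specific HN-CHUM /
# HN-CHUS / HN-HGJ / HN-HMR checks below.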
for i, row in df.iterrows():
if row['filename'].startswith('QIN'):
df.loc[i, 'cohort'] = 'QIN'
if row['filename'].startswith('HN'):
df.loc[i, 'cohort'] = 'HN1'
if row['filename'].startswith('HN-CHUM'):
df.loc[i, 'cohort'] = 'HN-CHUM'
if row['filename'].startswith('HN-CHUS'):
df.loc[i, 'cohort'] = 'HN-CHUS'
if row['filename'].startswith('HN-HGJ'):
df.loc[i, 'cohort'] = 'HN-HGJ'
if row['filename'].startswith('HN-HMR'):
df.loc[i, 'cohort'] = 'HN-HMR'
df.to_csv(PROCESSED_DATA_PATH / 'clinical_HN_val_QIN_HN1.csv', index=False)
# %%
#
import gc
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import SimpleITK as sitk
from tqdm import tqdm
import dicom_utils.dicom_utils as du
import dicom_utils.dicom_utils.visualize as viz
from config import get_project_root
from dicom_utils.dicom_utils import augmentation as aug
from dicom_utils.dicom_utils import processing as dup
from utils import filter_outbounds, sitk_to_numpy
from pydicom.filereader import dcmread
# import pydicom_seg
from highdicom.seg.utils import iter_segments
from matplotlib import pyplot as plt
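# Helper: convert each channel of a (C, Z, Y, X) array to a SimpleITK image,
# resample it to a cubic (size, size, size) grid with dicom_utils' `resample`,
# and stack the channels back into a single NumPy array.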
def resize(image, size):
N_CHANNELS = image.shape[0]
image_seq = [sitk.GetImageFromArray(image[i, :, :, :]) for i in range(N_CHANNELS)]
image_seq = [dup.resample(image, size=(size, size, size)) for image in image_seq]
image_seq = [sitk.GetArrayFromImage(image) for image in image_seq]
image = np.stack(image_seq, axis=0)
return image
# %%
PROJECT_ROOT = get_project_root()
BOX_SIZE = 128 # mm == pixels for voxel size of 1mm3
RESIZE = True
DATASET_NAME = 'QIN'
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
CLINICAL_DATA_FILENAME_CLEAN = f'clinical_{DATASET_NAME}.csv'
DCM_DIR = PROCESSED_DATA_PATH / 'dcm'
BBOX_OUT_DIR = PROCESSED_DATA_PATH / 'bbox'
BBOX_SUBDATASET_DEEP = f'bbox_{BOX_SIZE//2}_deep'
BBOX_SUBDATASET_RAD = 'bbox_rad'
OUTDIR_DEEP = BBOX_OUT_DIR / BBOX_SUBDATASET_DEEP
OUTDIR_RAD = BBOX_OUT_DIR / BBOX_SUBDATASET_RAD
os.makedirs(OUTDIR_DEEP, exist_ok=True)
os.makedirs(OUTDIR_RAD, exist_ok=True)
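# Intensity windows (CT in Hounsfield units, PT presumably in SUV) and the
# target isotropic voxel spacing in mm.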
CT_ranges = [-1050, 3050]
PT_ranges = [0, 50]
VOXEL_SIZE = [1, 1, 1]
# gaussian = sitk.SmoothingRecursiveGaussianImageFilter()
# gaussian.SetSigma(2)
# %%
clinical = pd.read_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN)
available_patients = [
    f for f in os.listdir(DCM_DIR) if os.path.isdir(DCM_DIR / f)
]
clinical = clinical.loc[clinical['patient'].isin(available_patients)]
errors = []
# %%
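# For each patient: load the CT and PT series, locate the tumour extent from the
# manual SEG file, crop a BOX_SIZE box (clipped to the image bounds), resample it
# to 1 mm isotropic voxels and then to BOX_SIZE^3, resample the CT onto the PT
# box grid, stack the two modalities, and save a (BOX_SIZE // 2)-sided cube as
# `<patient>.npy` for the deep-learning branch.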
for i, row in tqdm(clinical.iterrows()):
patient = row['patient']
filename_out = row['filename']
roi_name = row['ROI_name']
roi_modality = row['ROI_modality']
scan_CT = du.load_series(DCM_DIR / patient / 'CT')
print('scans size', scan_CT.GetSize())
# scan_PT = du.load_SUV(DCM_DIR / patient / 'PT')
scan_PT = du.load_series(DCM_DIR / patient / 'PT')
print(scan_PT.GetSize())
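    # The PT dimensions of the first row (index 0) are cached as a reference: the
    # z-window below is scaled by this ratio so the crop spans a comparable extent
    # across scans with different slice counts. Note this assumes the first
    # processed row actually has index 0.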
if i == 0:
firstDimensions = scan_PT.GetSize()
rs_filename = 'tumor segmentation - User1 Manual trial 1.dcm'
# Read SEG Image data set from PS3.10 files on disk
seg_dataset = dcmread(str(DCM_DIR / patient / 'SEG' / rs_filename))
# Iterate over segments and print the information about the frames
# that encode the segment across different image positions
zrange, xrange, yrange = (
[
int(
round(
scan_PT.GetSize()[2] * 0.8
- scan_PT.GetSize()[2] / firstDimensions[2] * BOX_SIZE // 4
)
), # scale the z axis wrt the dimensions of the first image; take the tumour area around the 80% of the image height
int(
round(
scan_PT.GetSize()[2] * 0.8
+ scan_PT.GetSize()[2] / firstDimensions[2] * BOX_SIZE // 4
)
),
],
# [scan_PT.GetSize()[2] - BOX_SIZE // 2, scan_PT.GetSize()[2]],
[np.inf, 0],
[np.inf, 0],
)
for frames, frame_descriptions, description in iter_segments(seg_dataset):
# print('Frame shape', frames.shape)
# with open(str(f'{PROCESSED_DATA_PATH}/interest_2.txt'), 'w') as f:
for k in range(frames.shape[0]):
for i in range(frames.shape[1]):
for j in range(frames.shape[2]):
if frames[k, i, j] != 0:
if i < yrange[0]:
yrange[0] = i
elif i > yrange[1]:
yrange[1] = i
if j < xrange[0]:
xrange[0] = j
elif j > xrange[1]:
xrange[1] = j
# f.write(str(k) + ' ' + str(i) + ' ' + str(j) + '\n')
# print('ranges', xrange, yrange, zrange)
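    # Note: inside the segment loop above, the same in-plane extent could be
    # computed much faster with NumPy (a sketch, not part of the original code):
    #   ys, xs = np.nonzero(frames.any(axis=0))
    #   if ys.size:
    #       yrange = [min(yrange[0], int(ys.min())), max(yrange[1], int(ys.max()))]
    #       xrange = [min(xrange[0], int(xs.min())), max(xrange[1], int(xs.max()))]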
    xcenter = round(np.mean(xrange))
xrange = [
int(
round(
xcenter - scan_PT.GetSize()[0] / scan_CT.GetSize()[0] * (BOX_SIZE // 2)
)
),
int(
round(
xcenter + scan_PT.GetSize()[0] / scan_CT.GetSize()[0] * (BOX_SIZE // 2)
)
),
]
# print(xrange)
if xrange[0] < 0:
xrange = [x - xrange[0] for x in xrange]
elif xrange[1] > frames.shape[2]:
xrange = [x - (xrange[1] - frames.shape[2]) for x in xrange]
print(xrange)
    ycenter = round(np.mean(yrange))
yrange = [
int(
round(
ycenter - scan_PT.GetSize()[1] / scan_CT.GetSize()[1] * (BOX_SIZE // 2)
)
),
int(
round(
ycenter + scan_PT.GetSize()[1] / scan_CT.GetSize()[1] * (BOX_SIZE // 2)
)
),
]
# print(yrange)
if yrange[0] < 0:
yrange = [y - yrange[0] for y in yrange]
elif yrange[1] > frames.shape[1]:
yrange = [y - (yrange[1] - frames.shape[1]) for y in yrange]
print(yrange)
# print(yrange)
if zrange[0] < 0:
zrange = [z - zrange[0] for z in zrange]
elif zrange[1] > scan_PT.GetSize()[2]:
zrange = [z - (zrange[1] - scan_PT.GetSize()[2]) for z in zrange]
print(zrange)
# print('OK here')
# print([xrange[0] : xrange[1]])
# print(yrange[0] : yrange[1])
# print(list(range(zrange[1] - 1, zrange[0] - 1, -1)))
print(scan_PT.GetSize())
scan_PT_box = scan_PT[
xrange[0] : xrange[1],
yrange[0] : yrange[1],
zrange[0] : zrange[1],
]
print('box', scan_PT_box.GetSize())
scan_PT_box = du.processing.resample(scan_PT_box, spacing=VOXEL_SIZE)
print(scan_PT_box.GetSize())
scan_PT_box = du.processing.resample(scan_PT_box, size=[BOX_SIZE] * 3)
print(scan_PT_box.GetSize())
scan_CT_box = sitk.Resample(scan_CT, scan_PT_box)
# print(scan_CT_box.GetSize())
out_deep = np.stack(
[sitk_to_numpy(scan_CT_box), sitk_to_numpy(scan_PT_box)], axis=0
)
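    # `out_deep` is (channel, z, y, x) with channel 0 = CT and channel 1 = PT; it
    # is then downsampled to BOX_SIZE // 2 (here 64) voxels per side before saving.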
out_deep_resized = resize(out_deep, size=BOX_SIZE // 2)
# Save
np.save(OUTDIR_DEEP / filename_out, out_deep_resized)
# mask_box = segmentation_box > 0.5
# mask_box = sitk.Resample(segmentation, scan_CT_box)
# # Save
# os.makedirs(OUTDIR_RAD / patient, exist_ok=False)
# sitk.WriteImage(scan_CT_box, str(OUTDIR_RAD / patient / 'CT.nrrd'))
# sitk.WriteImage(scan_PT_box, str(OUTDIR_RAD / patient / 'PT.nrrd'))
# sitk.WriteImage(mask_box, str(OUTDIR_RAD / patient / 'mask.nrrd'))
# except Exception as e:
# print(f'\nError processing patient: {row["patient"]}')
# print('Exception:', e)
# errors.append(row['patient'])
# %%
print(errors)
-#%%
+# %%
import gc
import os
import sys
@@ -16,6 +16,8 @@ from dicom_utils.dicom_utils import augmentation as aug
from dicom_utils.dicom_utils import processing as dup
from utils import filter_outbounds, sitk_to_numpy
from matplotlib import pyplot as plt
def resize(image, size):
N_CHANNELS = image.shape[0]
@@ -26,7 +28,7 @@ def resize(image, size):
return image
-#%%
+# %%
PROJECT_ROOT = get_project_root()
BOX_SIZE = 128 # mm == pixels for voxel size of 1mm3
@@ -45,8 +47,8 @@ BBOX_SUBDATASET_RAD = f'bbox_rad'
OUTDIR_DEEP = BBOX_OUT_DIR / BBOX_SUBDATASET_DEEP
OUTDIR_RAD = BBOX_OUT_DIR / BBOX_SUBDATASET_RAD
-os.makedirs(OUTDIR_DEEP, exist_ok=False)
-os.makedirs(OUTDIR_RAD, exist_ok=False)
+os.makedirs(OUTDIR_DEEP, exist_ok=True)
+os.makedirs(OUTDIR_RAD, exist_ok=True)
CT_ranges = [-1050, 3050]
@@ -58,7 +60,7 @@ VOXEL_SIZE = [1, 1, 1]
# gaussian = sitk.SmoothingRecursiveGaussianImageFilter()
# gaussian.SetSigma(2)
-#%%
+# %%
clinical = pd.read_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN)
available_patients = patients = [
@@ -68,8 +70,8 @@ available_patients = patients = [
clinical = clinical.loc[clinical['patient'].isin(available_patients)]
errors = []
-#%%
-for i, row in tqdm(clinical.iterrows()):
+# %%
+for i, row in clinical.iterrows():
patient = row['patient']
filename_out = row['filename']
@@ -79,10 +81,9 @@ for i, row in tqdm(clinical.iterrows()):
try:
scan_CT = du.load_series(DCM_DIR / patient / 'CT')
-scan_PT = du.load_SUV(DCM_DIR / patient / 'PT')
+# scan_PT = du.load_SUV(DCM_DIR / patient / 'PT')
+scan_PT = du.load_series(DCM_DIR / patient / 'PT')
rs_filename = os.listdir(DCM_DIR / patient / 'RTSTRUCT')[0]
if roi_modality == 'CT':
segmentation = du.load_roi(
DCM_DIR / patient / 'RTSTRUCT' / rs_filename, roi_name, scan_CT
@@ -93,9 +94,11 @@ )
)
else:
raise ValueError('ROI modality must be `CT` or `PT`.')
# Calculate segmentation mask vertices
print(type(segmentation))
start_mm, stop_mm = du.get_bbox_vertices(segmentation)
# print(start_mm, stop_mm)
center_mm = (np.array(stop_mm) + np.array(start_mm)) / 2
# Calculate vertices for deep learning bbox centered at the center of the segmentation mask
@@ -105,17 +108,22 @@
stop_mm = center_mm + BOX_SIZE // 2 + 5
scan_CT_box = du.extract_volume(scan_CT, start_mm, stop_mm)