Commit 521d1f5a authored by Alessia Marcolini's avatar Alessia Marcolini

Merge branch 'master' into 'pyradiomics_radler'

# Conflicts:
#   README.md
#   env.yml
parents bd126a71 aea99cc7
# %% [markdown]
# ## Augment dataset class-wise
# %%
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from dataset import NumpyCSVDataset, augment_3D_HN
import SimpleITK as sitk
from config import get_project_root
#%%
PROJECT_ROOT = get_project_root()
DATASET = 'HN_val'
BBOX_DIR = PROJECT_ROOT / 'data' / DATASET / 'processed' / 'bbox'
BBOX_SUBDATASET = 'bbox_64'
BBOX_SUBDATASET_AUG = f'{BBOX_SUBDATASET}_aug'
IN_DIR = BBOX_DIR / BBOX_SUBDATASET
OUT_DIR = BBOX_DIR / BBOX_SUBDATASET_AUG
os.makedirs(OUT_DIR, exist_ok=False)
CLINICAL_DATA_PATH = (
    PROJECT_ROOT / 'data' / DATASET / 'processed' / f'clinical_{DATASET}.csv'
)
CLINICAL_DATA_AUG_PATH = (
    PROJECT_ROOT
    / 'data'
    / DATASET
    / 'processed'
    / f'clinical_{BBOX_SUBDATASET_AUG}.csv'
)
LABEL_COLUMN = 'locoregional'
SIZE = 64
dataset = NumpyCSVDataset(IN_DIR, CLINICAL_DATA_PATH, LABEL_COLUMN, SIZE, mode='test')
clinical_original = pd.read_csv(CLINICAL_DATA_PATH)
labels = dataset.labels
idx_to_augment = np.where(labels == 1)[0]
ratio_NP = int((len(labels) - len(idx_to_augment)) / len(idx_to_augment))
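# e.g. 90 negatives and 10 positives give ratio_NP == 9, so each positive
# sample is augmented 9 times, roughly balancing the classes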
print('Augmentation factor: ', ratio_NP)
clinical_augmented_rows = []
#%%
for sample in tqdm(dataset):
    filename = sample['filename']
    filename_no_ext = os.path.splitext(filename)[0]
    image_orig = sample['data']
    label = sample['target']
    # minority-class samples (label == 1) are augmented ratio_NP times
    ratio = 1 if label == 0 else ratio_NP
    clinical_filename = clinical_original[clinical_original['filename'] == filename]
    assert (
        len(clinical_filename) == 1
    ), f'Expected exactly one row for filename {filename} in the clinical file, found {len(clinical_filename)}.'
    for id_image in range(ratio):
        image_aug = augment_3D_HN(image_orig, 'train', SIZE)
        filename_aug = f'{filename_no_ext}_{id_image}.npy'
        np.save(OUT_DIR / filename_aug, image_aug)
        # copy the row so each augmented image keeps its own filename
        # (mutating and re-appending the same frame would repeat the last one)
        row_aug = clinical_filename.copy()
        row_aug['filename'] = filename_aug
        clinical_augmented_rows.append(row_aug)
# %%
clinical_augmented = pd.concat(clinical_augmented_rows, ignore_index=True)
#%%
clinical_augmented.to_csv(CLINICAL_DATA_AUG_PATH, index=False)
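# %% [markdown]
# A quick sanity check (a sketch, not part of the original pipeline): the
# augmented clinical file should now be roughly class-balanced on the label.
# %%
clinical_check = pd.read_csv(CLINICAL_DATA_AUG_PATH)
print(clinical_check[LABEL_COLUMN].value_counts())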
@@ -6,29 +6,33 @@ import pandas as pd
from pathlib import Path
import sys
PATH = os.path.join(os.path.abspath(os.path.curdir), '..')
sys.path.append(PATH)
from tqdm import tqdm
import dicom_utils.dicom_utils as du
import dicom_utils.dicom_utils.visualize as viz
import gc
from config import get_project_root
from utils import sitk_to_numpy, filter_outbounds
# os.chdir(PATH)
#%%
PROJECT_ROOT = get_project_root()
DATASET_NAME = 'HN_val'
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
CLINICAL_DATA_FILENAME_CLEAN = f'clinical_{DATASET_NAME}.csv'
DCM_DIR = PROCESSED_DATA_PATH / 'dcm'
BBOX_OUT_DIR = PROCESSED_DATA_PATH / 'bbox'
BBOX_SUBDATASET_DEEP = 'bbox_64'
BBOX_SUBDATASET_RAD = 'bbox_64_rad'
OUTDIR_DEEP = BBOX_OUT_DIR / BBOX_SUBDATASET_DEEP
OUTDIR_RAD = BBOX_OUT_DIR / BBOX_SUBDATASET_RAD
os.makedirs(OUTDIR_DEEP, exist_ok=False)
os.makedirs(OUTDIR_RAD, exist_ok=False)
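# exist_ok=False: fail fast rather than silently mixing with a previous run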
CT_ranges = [-1050, 3050]
@@ -93,22 +97,35 @@ for i, row in tqdm(clinical.iterrows()):
        scan_PT_box = sitk.Resample(scan_PT, scan_CT_box)
        scan_PT_box = filter_outbounds(scan_PT_box, PT_ranges)
        segmentation_box = sitk.Resample(segmentation, scan_CT_box)
        mask_box = segmentation_box > 0.5
        start_mm = start_mm + 5  # shrink the box by a 5 mm margin
        stop_mm = stop_mm - 5  # shrink the box by a 5 mm margin
        scan_CT_box = du.extract_volume(scan_CT_box, start_mm, stop_mm)
        scan_PT_box = du.extract_volume(scan_PT_box, start_mm, stop_mm)
        mask_box = du.extract_volume(mask_box, start_mm, stop_mm)
        out_deep = np.stack(
            [sitk_to_numpy(scan_CT_box), sitk_to_numpy(scan_PT_box)], axis=0
        )
        out_rad = np.stack(
            [
                sitk_to_numpy(scan_CT_box),
                sitk_to_numpy(scan_PT_box),
                sitk_to_numpy(mask_box),
            ],
            axis=0,
        )
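        # out_deep: (2, Z, Y, X) CT+PT channels for the deep-learning model;
        # out_rad: (3, Z, Y, X) adds the binary tumour mask for radiomics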
        # save
        np.save(OUTDIR_DEEP / filename_out, out_deep)
        np.save(OUTDIR_RAD / filename_out, out_rad)
        del scan_CT_box, scan_PT_box, segmentation, mask_box, out_deep, out_rad
        gc.collect()
    except Exception as e:
@@ -118,4 +135,3 @@ for i, row in tqdm(clinical.iterrows()):
#%%
print(errors)
@@ -4,42 +4,71 @@ import pandas as pd
import numpy as np
from functools import reduce
from pathlib import Path
import shutil
import subprocess
from config import get_project_root
# %%
PROJECT_ROOT = get_project_root()
DATASETS = ['HN_val', 'HN_BZ']
BBOX_SUBDATASETS_NAMES = ['bbox_64', 'bbox_64']
BBOX_SUBDATASETS_PATHS = [
    PROJECT_ROOT / 'data' / dataset / 'processed' / 'bbox' / bbox_name
    for dataset, bbox_name in zip(DATASETS, BBOX_SUBDATASETS_NAMES)
]
MERGED_BBOX_SUBDATASET_NAME = 'bbox_64'
print('Copying from: ', '\t'.join([str(path) for path in BBOX_SUBDATASETS_PATHS]))
# %%
# find union names in datasets
DATASETS_NAME_UNION = list(
    reduce(lambda x, y: x.union(y), [set(dataset.split('_')) for dataset in DATASETS])
)
# rearrange name order (HN should be first)
DATASETS_NAME_UNION.insert(0, DATASETS_NAME_UNION.pop(DATASETS_NAME_UNION.index('HN')))
NEW_DATASET_NAME = '_'.join(DATASETS_NAME_UNION)
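# e.g. ['HN_val', 'HN_BZ'] -> union {'HN', 'val', 'BZ'} -> 'HN_val_BZ'
# (the order after 'HN' depends on set iteration order)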
NEW_DATASET_PATH = (
    PROJECT_ROOT
    / 'data'
    / NEW_DATASET_NAME
    / 'processed'
    / 'bbox'
    / MERGED_BBOX_SUBDATASET_NAME
)
os.makedirs(NEW_DATASET_PATH, exist_ok=False)
print('Copying into: ', str(NEW_DATASET_PATH))
#%% [markdown]
# # Merge and create a new clinical file
#%%
clinicals = []
n_patients = []
dataset_name = []
for dataset in DATASETS:
    clinical = pd.read_csv(
        PROJECT_ROOT / 'data' / dataset / 'processed' / f'clinical_{dataset}.csv'
    )
    n_patients.append(len(clinical.index))
    dataset_name.extend(list(np.repeat(dataset, len(clinical))))
    clinicals.append(clinical)
merged_clinical = pd.concat(clinicals, join='inner')
merged_clinical['dataset'] = dataset_name
merged_clinical.to_csv(
    PROJECT_ROOT
    / 'data'
    / NEW_DATASET_NAME
    / 'processed'
    / f'clinical_{NEW_DATASET_NAME}.csv',
    index=False,
)
#%% [markdown]
# # Link files from respective folders
#%%
for old_path in BBOX_SUBDATASETS_PATHS:
    subprocess.call(f'ln -s {old_path}/* {NEW_DATASET_PATH}', shell=True)
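# %% [markdown]
# A shell-free alternative (a sketch, not what this script runs): create the
# links with `pathlib`, which raises on name collisions instead of failing
# silently like the glob through `ln -s`.
# %%
# for old_path in BBOX_SUBDATASETS_PATHS:
#     for f in sorted(old_path.iterdir()):
#         (NEW_DATASET_PATH / f.name).symlink_to(f)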
# %%
@@ -13,10 +13,12 @@ import itertools
from tqdm import tqdm
from utils import remove_na, remove_constant_cols
# os.chdir('..')
from config import get_project_root
# %%
PROJECT_ROOT = get_project_root()
DATASET_NAME = 'HN_val'
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
@@ -29,9 +29,13 @@ import sys
import shutil
from tqdm import tqdm
from config import get_project_root
# %%
PROJECT_ROOT = get_project_root()
DATASET_NAME = 'HN_BZ'
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
PROCESSED_DCM_PATH = PROCESSED_DATA_PATH / 'dcm'
@@ -27,20 +27,20 @@ from torch.utils.tensorboard import SummaryWriter
from dataset import NumpyCSVDataset, augment_3D_HN
from networks import CiompiDO, ResNet50_3d
from split import train_test_indexes_patient_wise
from config import get_project_root
PATH = Path(os.getcwd())
print(PATH)
#%%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multigpu = True
#
PROJECT_ROOT = get_project_root()
DATASET = 'HN_val'
BBOX_SUBDATASET = 'bbox_64'
DATASET_DIR = PROJECT_ROOT / 'data' / DATASET / 'processed' / 'bbox' / BBOX_SUBDATASET
EXPERIMENT_DIR = PROJECT_ROOT / 'experiments'
PRETRAINED_MED3D_WEIGHTS = PROJECT_ROOT / 'pretrained_weights' / 'resnet_50.pth'
PRETRAINED_T_STAGE = EXPERIMENT_DIR / 'Tstage_4_noTx_CT_20191114-163418' / 'weights.pth'
# %%
### Settings
@@ -50,7 +50,7 @@ settings = {
"model": CiompiDO,
"batch_size": 16,
"lr": 1e-5,
"epochs": 300,
"epochs": 1,
"optim": torch.optim.Adam,
"K": 0.2,
"n_classes": 4, # TSTAGE
@@ -86,7 +86,7 @@ PRETRAINED = settings["pretrained"]
def new_run_log_dir(experiment_name):
    log_dir = PROJECT_ROOT / "tb-runs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    run_log_dir = log_dir / experiment_name
@@ -100,7 +100,9 @@ writer = SummaryWriter(log_dir)
# %%
# ### Data Handlers
clinical_file = (
    PROJECT_ROOT / 'data' / DATASET / 'processed' / f'clinical_{DATASET}.csv'
)
target_column = "T-stage_grouped"
# %%
np.random.seed(SEED)
@@ -23,7 +23,9 @@ conda activate radler
The `mlpy` package is required for some operations included in the DAP procedure.
The version available on PyPI is outdated and does not work on OSX platforms.
These are the steps to follow:
Let `<ANACONDA>` be your anaconda path (e.g., `/home/user/anaconda3`).
Adjust these environment variables:
@@ -37,6 +39,7 @@ and then install `mlpy` from GitLab:
pip install git+https://gitlab.fbk.eu/MPBA/mlpy.git
```
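A quick check that the install worked: `python -c "import mlpy"` should exit without errors.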
#### OCTAVE
`octave` is needed because some first-order features are extracted using functions not yet implemented in Python.
"""
This python script take as input a csv with patient information,
data from medical imaging,
plot a 2-dim plot of the raw data reduced using umap.
It also provide a tool for visualizing the actual image,
selecting a point on the scatter plot
RUN: run with bokeh server: 'bokeh serve <script path>,
go to the specified local url
LOCATION: csv a upper level folder callend 'vendor_plot',
data in upper level folder called 'bbox64'
REQUIREMENTS:
numpy
pandas (sort of)
bokeh==1.4.0
holoviews==1.12.7
umap
PARAMETER: (important ones)
modality 'PT' /'CT
"""
import os
import numpy as np
import pandas as pd
from holoviews import opts
import holoviews as hv
import holoviews.plotting.bokeh
from holoviews.streams import Selection1D
import umap
from pathlib import Path
hv.extension('bokeh')
renderer = hv.renderer('bokeh')
def scan_explore(datapath_val, datapath_bz, subject_id, slice_id=0, modality='CT'):
    print(f"Subject ID:{subject_id} Slice(x):{slice_id} Modality:{modality}")
    slice_id = int(slice_id)
    data_ext = ".npy"
    if modality == 'PT':
        mod_id = 1
    elif modality == 'CT':
        mod_id = 0
    else:
        raise ValueError(f"Invalid scan modality: {modality}")
    # extract the requested slice, falling back to the BZ folder if missing
    object_path_val = datapath_val / (subject_id + data_ext)
    object_path_bz = datapath_bz / (subject_id + data_ext)
    try:
        img_slice = np.load(object_path_val)[mod_id, slice_id, :, :]
    except FileNotFoundError:
        img_slice = np.load(object_path_bz)[mod_id, slice_id, :, :]
    return hv.Image(img_slice).opts(
        title=f"id:{subject_id} mod:{modality}", fontsize={'title': 10}
    )
def scan_wrap(datapath_val, datapath_bz, points_data, index, slice_id=0, modality='CT'):
    print("Index supplied ", index)
    # the supplied index is always a list with one element
    if index is None or index == []:
        pt_index = 0
    else:
        pt_index = index[0]
    subject_id = (points_data.data.iloc[pt_index])['Subject']
    return scan_explore(datapath_val, datapath_bz, subject_id, slice_id, modality)
def get_data_lbl(path, file_list, path_orig_df, modality):
    """
    Extract data and manufacturer label from the data folder and info file.
    IN:
        path: data folder
        file_list: list of files with extension
        path_orig_df: our dataset with file info
        modality: either PT (for PET) or CT
    OUT:
        pandas dataframe with cols:
            data: list of 3d numpy arrays
            manufacturer: scan device manufacturer
            dataset: dataset id
            subject_id: scanned subject id
    """
    if modality == "CT":
        mod_id = 0
    elif modality == "PT":
        mod_id = 1
    else:
        raise ValueError("Please enter a valid modality parameter ('CT' or 'PT')")
    df_sel_mod = path_orig_df.loc[path_orig_df["Modality"] == modality]
    data = []
    lbl = []
    dataset_id = []
    subject_id = []
    for elem in file_list:
        # subject id, removing the extension
        subj_id = elem.split(".")[0]
        ds_split = subj_id.split("-")
        if len(ds_split) == 3:
            dataset_tmp = ds_split[1]
            # save manufacturer
            vendor = df_sel_mod.loc[
                df_sel_mod["Subject ID"] == subj_id, "Manufacturer"
            ].values[0]
        elif len(ds_split) == 1:  # BZ ids come in a different, dash-free format
            dataset_tmp = "BZ_0"
            vendor = "Philips"  # Bolzano manufacturer
        # load patient data
        data_tmp = np.load(path / elem)[mod_id, :, :, :]
        # save to lists
        data.append(data_tmp)
        lbl.append(vendor)
        dataset_id.append(dataset_tmp)
        subject_id.append(subj_id)
    return pd.DataFrame(
        data={'data': data, 'manufacturer': lbl, 'dataset': dataset_id, 'subject_id': subject_id}
    )
def get_scatter(df, group, ds):
    keydims = ['x_u', 'y_u']
    valdims = ['Manufacturer', 'Dataset', 'Subject']
    df_sel = df.loc[(df['manufacturer'] == group) & (df['dataset'] == ds)]
    red_x = df_sel['red_x']
    red_y = df_sel['red_y']
    manufacturer = df_sel['manufacturer']
    dataset = df_sel['dataset']
    subject_id = df_sel['subject_id']
    return hv.Scatter(
        (red_x, red_y, manufacturer, dataset, subject_id),
        kdims=keydims, vdims=valdims,
    )
def get_scatter_full(df):
    # same as get_scatter, but without filtering by manufacturer/dataset
    keydims = ['x_u', 'y_u']
    valdims = ['Manufacturer', 'Dataset', 'Subject']
    return hv.Scatter(
        (df['red_x'], df['red_y'], df['manufacturer'], df['dataset'], df['subject_id']),
        kdims=keydims, vdims=valdims,
    )
#
# set script parameters
modality = "CT" # choose a modality for the 2d rim red plot
data_folder = Path().cwd()/'..'/'..'/'data'
info_df_path = data_folder/'HN_val'/'processed'/'path_original_data.csv'
DATAPATH = data_folder/'HN_val'/'processed'/'bbox'/'bbox_64'
DATAPATH_BZ = data_folder/'HN_BZ'/'processed'/'bbox'/'bbox_64'
file_list = os.listdir(DATAPATH)
file_list_bz = os.listdir(DATAPATH_BZ)
file_list.sort()
file_list_bz.sort()
path_orig_df = pd.read_csv(info_df_path)
print("Environment set.")
# prepare data and dimensionality reduction:
# collect dataframes from the multiple input folders and merge them vertically
df_val = get_data_lbl(DATAPATH, file_list, path_orig_df, modality)
df_bz = get_data_lbl(DATAPATH_BZ, file_list_bz, path_orig_df, modality)
df = pd.concat([df_val, df_bz], ignore_index=True)
# flatten the 3d array
df['flat_data'] = [x.flatten('C') for x in df['data']]
df.drop(columns=['data'], inplace=True) # remove data col to save memory
umap_euclid = umap.UMAP(metric="euclidean", n_components=2,
                        n_neighbors=20, min_dist=.2)
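# n_neighbors=20 biases the embedding toward more global structure;
# min_dist=0.2 keeps embedded points from collapsing into tight clumps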
reduced_data = umap_euclid.fit_transform(df['flat_data'].tolist())
df['red_x'] = reduced_data[:, 0]
df['red_y'] = reduced_data[:, 1]
df.drop(columns=['flat_data'], inplace=True) # remove flat_data col
print("Dim red completed")
# set up the layered scatterplot:
# compute the (manufacturer, dataset) combinations actually present in the
# data and use them to generate the sub-plots to be stacked
comb_vendor_ds = np.unique(df['manufacturer'] + '-' + df['dataset'])
comb_vendor_ds = [x.split('-') for x in comb_vendor_ds]
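# NOTE: this assumes neither the manufacturer nor the dataset id contains a
# '-', otherwise the split above would mis-pair the combinations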
scatter_dict = {(group, ds): get_scatter(df, group, ds)
                for group, ds in comb_vendor_ds}
# overlaid plots, one colour/legend entry per group, hoverable
overlay = hv.NdOverlay(scatter_dict, kdims=['Manufacturer', 'dataset'])
overlay.opts(opts.Scatter(alpha=0.75, size=7, width=650,
                          height=650, tools=['hover']))
# one transparent layer just for point-index information via the tap tool
scatter_title = f"Umap 2d plot HN mod: {modality}"
p_ref = get_scatter_full(df).opts(opts.Scatter(title=scatter_title,
                                               alpha=0.0, size=7,
                                               width=650, height=650,
                                               tools=['tap']))
s = overlay * p_ref
# attach a stream to the overlaid plot: the stream depends on the tap tool
# and outputs a list with the index of the selected point, taken over the
# last layer (here the complete, transparent one)
sel_stream = Selection1D(source=s, index=[0])
print("overlay plot and stream have been set.")
# define a dynamic map taking the subject id from the stream and points data,
# and the slice id and modality from the widgets; the lambda is needed so the
# stream index and the slider kdims can be passed explicitly to the plot func
scan_dmap = hv.DynamicMap(
    lambda index, slice_id, modality: scan_wrap(
        datapath_val=DATAPATH,
        datapath_bz=DATAPATH_BZ,
        points_data=p_ref,
        index=index,
        slice_id=slice_id,
        modality=modality,
    ),
    streams=[sel_stream],
    kdims=['slice_id', 'modality'],
)
# define value domains for key dimensions not in streams
scan_dmap = scan_dmap.redim.range(slice_id=(0, 64))
scan_dmap = scan_dmap.redim.values(modality=['CT', 'PT'])
# rendering
layout = s + scan_dmap # build layout
doc = renderer.server_doc(layout) # instantiate renderer
doc.title = 'HN Scan explorer' # set title