Commit 7916909e authored by Alessia Marcolini

Standardize dataset structure and associated clinical file

parent 4b221185
#%% [markdown]
# Create clinical file with (at least) the following columns:
# * filename
# * patient
# * label(s)
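#
# For illustration, one row of the resulting file might look like this (hypothetical
# values; the actual column names are defined by the constants in the next cell):
#
# | filename | patient | locoregional | T-stage_grouped | T-stage_binary |
# | --- | --- | --- | --- | --- |
# | HN-XXX-001.npy | HN-XXX-001 | 0 | 3 | 1 |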
#%%
import pandas as pd
from pathlib import Path
import os
import sys
import numpy as np
import itertools
from tqdm import tqdm
from utils import remove_na, remove_constant_cols
# os.chdir('..')
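#%% [markdown]
# `remove_na` and `remove_constant_cols` come from the local `utils` module, which is
# not shown in this commit. A minimal sketch of what they are assumed to do, inferred
# from how they are called below (hypothetical, not the actual implementation):
# ```python
# import pandas as pd
#
#
# def remove_na(df: pd.DataFrame, columns) -> pd.DataFrame:
#     """Drop rows with missing values in any of the given columns."""
#     return df.dropna(subset=columns)
#
#
# def remove_constant_cols(df: pd.DataFrame) -> pd.DataFrame:
#     """Drop columns that contain a single unique value."""
#     return df.loc[:, df.nunique(dropna=False) > 1]
# ```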
# %%
DATASET_NAME = 'HN_val'
PROJECT_DATA_PATH = Path('data') / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
CLINICAL_DATA_FILENAME_CLEAN = f'clinical_{DATASET_NAME}.csv'
FILENAME_COL = 'filename'
PATIENT_COL = 'patient'
LABEL_COL_LOCOREGIONAL = 'locoregional'
LABEL_COL_T_STAGE_GROUPED = 'T-stage_grouped'
LABEL_COL_T_STAGE_BINARY = 'T-stage_binary'
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
# Tx T-stages cluster with low T-stages in the UMAP projection
grading_dict_binary = {
'T1': '0',
'T2': '0',
'T3': '1',
'T4': '1',
'T4a': '1',
'T4b': '1',
'Tx': '0',
}
grading_dict_grouped = {
'T1': '0',
'T2': '1',
'T3': '2',
'T4': '3',
'T4a': '3',
'T4b': '3',
'Tx': '0',
}
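#%% [markdown]
# For example, a raw `T-stage` of `'T4a'` maps to `'1'` in the binary scheme and to `'3'`
# in the grouped scheme, while `'Tx'` is assigned to the low class (`'0'`) in both, since
# Tx stages cluster with low T-stages in the UMAP projection.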
if DATASET_NAME == 'HN_val':
# prepare clinical file
CLINICAL_DATA_FILENAME = 'INFOclinical_HN_Version2_30may2018.xlsx'
clinical_dict = pd.read_excel(
RAW_DATA_PATH / CLINICAL_DATA_FILENAME, sheet_name=None
) # get all sheets as a dict of DataFrames
clinical_sheets = []
for sheet_name in clinical_dict.keys():
if not sheet_name == 'Excluded':
clinical_sheet = clinical_dict[sheet_name]
clinical_sheet = remove_na(clinical_sheet, columns=['Patient #'])
# remove the authors' original notes
clinical_sheet = clinical_sheet.loc[
clinical_sheet['Patient #'].apply(lambda x: x.startswith('HN'))
]
clinical_sheets.append(clinical_sheet)
clinical = pd.concat(clinical_sheets, ignore_index=True, sort=True)
clinical = remove_constant_cols(clinical)
clinical.sort_values(by='Patient #', inplace=True)
clinical.rename(
{'Patient #': PATIENT_COL, 'Locoregional': LABEL_COL_LOCOREGIONAL},
inplace=True,
axis=1,
)
clinical[FILENAME_COL] = clinical[PATIENT_COL].apply(lambda x: x + '.npy')
clinical.set_index(PATIENT_COL, inplace=True)
# compute new label columns
clinical[LABEL_COL_T_STAGE_BINARY] = clinical['T-stage'].apply(
lambda x: grading_dict_binary[x]
)
clinical[LABEL_COL_T_STAGE_GROUPED] = clinical['T-stage'].apply(
lambda x: grading_dict_grouped[x]
)
clinical.to_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN)
################### clean the metadata file to keep only CT scans, PT scans, and one segmentation mask per patient
METADATA_FILENAME = 'tcia_original_metadata_HN_val.csv'
metadata = pd.read_csv(RAW_DATA_PATH / METADATA_FILENAME).sort_values(
by='Subject ID'
)
# discard all modalities other than CT, PT and RTSTRUCT
metadata = metadata.loc[metadata['Modality'].isin(['CT', 'PT', 'RTSTRUCT'])]
# remove rows with missing values in 'Study Description' and 'Series Description' columns
metadata = remove_na(metadata, columns=['Study Description', 'Series Description'])
# remove rows where 'TomoTherapy' is found (we are interested in scans and masks acquired before radiation therapy)
metadata = metadata.loc[
metadata['Study Description'].apply(lambda x: 'TomoTherapy' not in x)
]
metadata = metadata.loc[metadata['Manufacturer'] != 'TomoTherapy Incorporated']
# to keep the masks uniform, we drop the RTSTRUCTs NOT produced by the MIM Software manufacturer
metadata = metadata.loc[
metadata.apply(
lambda x: x['Manufacturer'] == 'MIM Software Inc.'
or x['Modality'] != 'RTSTRUCT',
axis=1,
)
]
# NB: some patients have more than one segmentation mask!! We need to choose one.
# find the patients with more than one segmentation ...
indices_multiple_rs = np.where(
metadata.loc[metadata['Modality'] == 'RTSTRUCT']
.groupby('Subject ID')
.count()['Series Description']
> 1
)[0]
patients_multiple_rs = (
metadata.loc[metadata['Modality'] == 'RTSTRUCT']
.groupby('Subject ID')
.count()['Series Description']
> 1
).index.values[indices_multiple_rs]
# ... and find the corresponding metadata
metadata_multiple_rs = metadata.loc[
metadata['Subject ID'].isin(patients_multiple_rs)
]
# For these patients, based on the metadata, we decided to keep only the segmentation named 'RTstruct_CTsim->CT(PET-CT)' and drop the others
indices_to_drop = metadata_multiple_rs.loc[
~metadata_multiple_rs.apply(
lambda x: x['Modality'] != 'RTSTRUCT'
or x['Series Description'] == 'RTstruct_CTsim->CT(PET-CT)',
axis=1,
)
].index.values
metadata.drop(indices_to_drop, axis=0, inplace=True)
# Now we have only one segmentation for each patient.
# However, some patients have multiple CT folders, and we need to choose one.
# Find patients with multiple CT folders
patients_multiple_CT = (
metadata.groupby('Subject ID').count()['Modality'] > 3
).index.values[np.where(metadata.groupby('Subject ID').count()['Modality'] > 3)[0]]
# For these patients, group by number of images (both CT and PET) and 'Study Description'.
# Most of these patients have the same number of PET and CT scans with the same 'Study Description'.
patients_study_nimages_count = (
metadata.loc[
metadata.apply(
lambda x: x['Subject ID'] in patients_multiple_CT
and x['Modality'] != 'RTSTRUCT',
axis=1,
)
]
.groupby(['Subject ID', 'Study Description', 'Number of images'])
.count()
)
patients_study_nimages_count.reset_index(inplace=True)
# Find the patients for which the number of CT scans is different from the number of PET scans
# Most of these patients have only two values of 'Study Description'.
# We pick the patients without a matching number of PET and CT scans (denoted as ambiguous).
patients_nimages_study_uneven_boolean = (
patients_study_nimages_count[['Subject ID', 'Number of images']]
.loc[patients_study_nimages_count['Series Description'] == 1]
.groupby('Subject ID')
.count()['Number of images']
!= 1
)
patients_nimages_study_uneven_boolean = (
patients_nimages_study_uneven_boolean.reset_index()
)
patients_ambiguous = patients_nimages_study_uneven_boolean.iloc[
np.where(patients_nimages_study_uneven_boolean['Number of images'])[0]
]['Subject ID'].values
# Find the combination Subject ID - Number of images to be dropped (because the CT and PET numbers don't match).
patients_nimages_uneven_CT = patients_study_nimages_count[
['Subject ID', 'Number of images']
].loc[patients_study_nimages_count['Series Description'] == 1]
patients_nimages_uneven_CT.reset_index(inplace=True, drop=True)
# Find the rows in `patients_nimages_uneven_CT` that correspond to ambiguous patients, to be dropped from metadata later.
rows_to_drop = np.where(
patients_nimages_uneven_CT['Subject ID'].isin(patients_ambiguous)
)[0]
patients_nimages_uneven_CT.drop(rows_to_drop, inplace=True)
# Drop the rows whose (Subject ID, Number of images) pair is marked for removal
metadata = metadata.loc[
metadata.apply(
lambda x: (x['Subject ID'], x['Number of images'])
not in patients_nimages_uneven_CT.itertuples(index=False, name=None),
axis=1,
)
]
# Notice that the ambiguous patients are still here.
metadata_ambiguous_patients = metadata.loc[
metadata['Subject ID'].isin(patients_ambiguous)
& ((metadata['Modality'] == 'CT') | (metadata['Modality'] == 'PT'))
]
# As a rule for picking the CT scan folder, we keep the CT whose 'Study Description' matches the PT 'Study Description'.
ambiguous_patient_count_study_boolean = (
metadata_ambiguous_patients.groupby(
['Subject ID', 'Study Description']
).count()['Modality']
== 1
)
ambiguous_patient_count_study_boolean = (
ambiguous_patient_count_study_boolean.reset_index()
)
# Find the combination Subject ID - Study Description to be dropped (because the CT and PET 'Study Description' values don't match).
ambiguous_patients_unmatch_study = ambiguous_patient_count_study_boolean.iloc[
np.where(ambiguous_patient_count_study_boolean['Modality'])[0]
][['Subject ID', 'Study Description']]
# Drop the rows whose (Subject ID, Study Description) pair is marked for removal
metadata = metadata.loc[
metadata.apply(
lambda x: (x['Subject ID'], x['Study Description'])
not in ambiguous_patients_unmatch_study.itertuples(index=False, name=None),
axis=1,
)
]
# reconstruct path to raw data
metadata['Study Date'] = pd.to_datetime(metadata['Study Date'], yearfirst=True)
metadata['Study Date'] = metadata['Study Date'].apply(
lambda ts: ts.strftime("%m-%d-%Y")
)
# We need to sanitize the 'Study Description' column, as some characters were removed when the folder names were created.
# First, we collect all the characters that appear in the 'Study Description' column
all_chars_study = set(list(''.join(metadata['Study Description'].values)))
# Second, we collect all the valid characters (i.e. those that appear in the folder names)
valid_chars_study = []
for patient in os.listdir(RAW_DATA_PATH):
if not patient.endswith('xlsx') and not patient.endswith('csv'):
for folder in os.listdir(RAW_DATA_PATH / patient):
valid_chars_study += list(folder)
valid_chars_study = set(valid_chars_study)
chars_to_remove_study = all_chars_study - valid_chars_study
# Same sanitization needs to be done for the 'Series Description' column
all_chars_series = set(list(''.join(metadata['Series Description'].values)))
valid_chars_series = []
for patient in os.listdir(RAW_DATA_PATH):
if not patient.endswith('xlsx') and not patient.endswith('csv'):
for folder in os.listdir(RAW_DATA_PATH / patient):
for series in os.listdir(RAW_DATA_PATH / patient / folder):
valid_chars_series += list(series)
valid_chars_series = set(valid_chars_series)
chars_to_remove_series = all_chars_series - valid_chars_series
# For each row in the metadata we retrieve the data path and save it as a new column named 'dicom_folder'
dicom_folders = []
for i, row in tqdm(metadata.iterrows()):
patient = row['Subject ID']
study_uid = row['Study UID'][-5:]
study_description = row['Study Description']
study_date = row['Study Date']
series_description = row['Series Description']
study_description = ''.join(
[c for c in list(study_description) if c not in chars_to_remove_study]
)
series_description = ''.join(
[c for c in list(series_description) if c not in chars_to_remove_series]
)
# print(study_description)
# break
path_to_study = Path(patient) / f'{study_date}-{study_description}-{study_uid}'
# print(path_to_study)
folders = [
f
for f in os.listdir(RAW_DATA_PATH / path_to_study)
if series_description in f
]
if len(folders) == 1:
folder = folders[0]
# print(folder)
elif len(folders) == 0:
print(f'Empty folder {path_to_study}', file=sys.stderr)
else:
print(
f'Multiple matching folders for {series_description} in {path_to_study}',
file=sys.stderr,
)
# print('\t', folders)
dicom_folder = str(path_to_study / folder)
dicom_folders.append(dicom_folder)
# TODO: finish reconstructing the path
metadata['dicom_folder'] = dicom_folders
metadata.to_csv(PROCESSED_DATA_PATH / 'path_original_data.csv', index=False)
elif DATASET_NAME == 'HN_BZ':
CLINICAL_DATA_FILENAME = 'pz_HN_BZ.csv'
clinical = pd.read_csv(RAW_DATA_PATH / CLINICAL_DATA_FILENAME, sep=';')
clinical_no_na = remove_na(clinical, columns=[LABEL_COL_LOCOREGIONAL])
clinical_no_na[LABEL_COL_LOCOREGIONAL] = clinical_no_na[
LABEL_COL_LOCOREGIONAL
].astype(np.uint8)
# clinical_no_na.sort_values(by=PATIENT_COL, inplace=True)
clinical_no_na.to_csv(
PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN, index=False
)
# %%
#%% [markdown]
# Create dataset with the following structure:
# ```bash
# ├── data
# │   ├── <dataset name>
# │   │   ├── processed
# │   │   │   ├── dcm
# │   │   │   │   ├── <patient ID>
# │   │   │   │   │   ├── CT
# │   │   │   │   │   ├── PT
# │   │   │   │   │   └── RTSTRUCT
# │   │   │   │   ├── ...
# │   │   │   │   └── <patient ID>
# │   │   │   │       ├── CT
# │   │   │   │       ├── PT
# │   │   │   │       └── RTSTRUCT
# │   │   │   ├── bbox
# │   │   │   │   └── <subdatasets>
# │   │   │   └── <processed clinical file>
# │   │   └── raw
# │   │       ├── <original data>
# │   │       └── <original clinical / metadata files>
# ```
# %%
import pandas as pd
from pathlib import Path
import os
import sys
import shutil
from tqdm import tqdm
# %%
DATASET_NAME = 'HN_val'
PROJECT_DATA_PATH = Path('data') / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
PROCESSED_DCM_PATH = PROCESSED_DATA_PATH / 'dcm'
os.makedirs(PROCESSED_DCM_PATH, exist_ok=False)
if DATASET_NAME == 'HN_val':
# TODO: read path from path_original_data.csv and not from summary.csv
DATASET_DESCRIPTION_FILE = 'path_original_data.csv'
dataset_description = pd.read_csv(PROCESSED_DATA_PATH / DATASET_DESCRIPTION_FILE)
for i, row in tqdm(dataset_description.iterrows()):
patient = row['Subject ID']
old_files_dir = RAW_DATA_PATH / row['dicom_folder']
modality = row['Modality']
n_slices = row['Number of images']
new_files_dir = PROCESSED_DCM_PATH / patient / modality
# os.makedirs(new_files_dir, exist_ok=True)
files = os.listdir(old_files_dir)
if len(files) != int(n_slices):
print(
f'The number of files in folder {old_files_dir} ({len(files)}) is different from the one declared in the metadata file ({int(n_slices)}).',
file=sys.stderr,
)
shutil.copytree(old_files_dir, new_files_dir)
# %%