Commit 668dee72 authored by Alessia Marcolini
Browse files

Add ROI modality column to clinical file

parent 56e662df
......@@ -54,57 +54,6 @@ grading_dict_grouped = {
}
if DATASET_NAME == 'HN_val':
# Prepare the clinical file: load every sheet of the raw Excel workbook, keep
# the patient rows, derive the label columns, attach the ROI names and save.
# NOTE(review): indentation is not visible in this view; the statements from the
# `for` line down to `clinical_sheets.append(...)` presumably form the loop body.
CLINICAL_DATA_FILENAME = 'INFOclinical_HN_Version2_30may2018.xlsx'
clinical_dict = pd.read_excel(
RAW_DATA_PATH / CLINICAL_DATA_FILENAME, sheet_name=None
) # get all sheets as a dict of DataFrames
# Collect the cleaned sheets; the sheet named 'Excluded' is skipped.
clinical_sheets = []
for sheet_name in clinical_dict.keys():
if not sheet_name == 'Excluded':
clinical_sheet = clinical_dict[sheet_name]
# remove_na is a project helper; presumably it drops rows with a missing
# 'Patient #' value -- confirm against its definition.
clinical_sheet = remove_na(clinical_sheet, columns=['Patient #'])
# Remove the authors' original notes: keep only the rows whose 'Patient #'
# value starts with 'HN'.
clinical_sheet = clinical_sheet.loc[
clinical_sheet['Patient #'].apply(lambda x: x.startswith('HN'))
]
clinical_sheets.append(clinical_sheet)
# Stack all kept sheets into one table with a fresh integer index.
clinical = pd.concat(clinical_sheets, ignore_index=True, sort=True)
# remove_constant_cols is a project helper; presumably it drops columns that
# hold a single value -- confirm against its definition.
clinical = remove_constant_cols(clinical)
clinical.sort_values(by='Patient #', inplace=True)
# Rename the raw column names to the project-wide column constants.
clinical.rename(
{'Patient #': PATIENT_COL, 'Locoregional': LABEL_COL_LOCOREGIONAL},
inplace=True,
axis=1,
)
# The file name of each patient is the patient identifier plus '.npy'.
clinical[FILENAME_COL] = clinical[PATIENT_COL].apply(lambda x: x + '.npy')
clinical.set_index(PATIENT_COL, inplace=True)
# Compute the new label columns by looking up each 'T-stage' value in the
# grading dictionaries; a value missing from a dictionary raises KeyError.
clinical[LABEL_COL_T_STAGE_BINARY] = clinical['T-stage'].apply(
lambda x: grading_dict_binary[x]
)
clinical[LABEL_COL_T_STAGE_GROUPED] = clinical['T-stage'].apply(
lambda x: grading_dict_grouped[x]
)
# Remember the row count so the merge below can be checked.
old_clinical_len = len(clinical)
# Get ROI name for each segmentation mask
ROI_INFO_FILENAME = 'INFO_GTVcontours_HN.csv'
roi_info = pd.read_csv(RAW_DATA_PATH / ROI_INFO_FILENAME)
clinical = clinical.merge(roi_info, on='patient')
# The merge must neither drop nor duplicate rows.
assert old_clinical_len == len(clinical)
# Save the cleaned clinical table.
clinical.to_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN)
################### clean metadata file to keep only CT scans, PT scans and 1 segmentation mask
METADATA_FILENAME = 'tcia_original_metadata_HN_val.csv'
......@@ -345,6 +294,83 @@ if DATASET_NAME == 'HN_val':
metadata.to_csv(PROCESSED_DATA_PATH / 'path_original_data.csv', index=False)
##############################
# Prepare the clinical file: load every sheet of the raw Excel workbook, keep
# the patient rows, derive the label columns, attach the ROI name and the ROI
# modality of each patient, and save the result as CSV.
CLINICAL_DATA_FILENAME = 'INFOclinical_HN_Version2_30may2018.xlsx'
clinical_dict = pd.read_excel(
    RAW_DATA_PATH / CLINICAL_DATA_FILENAME, sheet_name=None
)  # sheet_name=None returns a dict mapping sheet name -> DataFrame

clinical_sheets = []
for sheet_name, clinical_sheet in clinical_dict.items():
    if sheet_name == 'Excluded':
        continue
    clinical_sheet = remove_na(clinical_sheet, columns=['Patient #'])
    # Remove the authors' original notes: keep only rows whose 'Patient #' is a
    # string starting with 'HN'. The isinstance guard stops a stray non-string
    # cell from aborting the whole run with an AttributeError.
    is_patient_row = clinical_sheet['Patient #'].apply(
        lambda x: isinstance(x, str) and x.startswith('HN')
    )
    clinical_sheets.append(clinical_sheet.loc[is_patient_row])

clinical = pd.concat(clinical_sheets, ignore_index=True, sort=True)
clinical = remove_constant_cols(clinical)
clinical.sort_values(by='Patient #', inplace=True)
clinical.rename(
    {'Patient #': PATIENT_COL, 'Locoregional': LABEL_COL_LOCOREGIONAL},
    inplace=True,
    axis=1,
)
clinical[FILENAME_COL] = clinical[PATIENT_COL].apply(lambda x: x + '.npy')
clinical.set_index(PATIENT_COL, inplace=True)

# Compute the new label columns. Plain dict indexing is kept on purpose so an
# unknown T-stage raises KeyError instead of silently producing NaN.
clinical[LABEL_COL_T_STAGE_BINARY] = clinical['T-stage'].apply(
    lambda x: grading_dict_binary[x]
)
clinical[LABEL_COL_T_STAGE_GROUPED] = clinical['T-stage'].apply(
    lambda x: grading_dict_grouped[x]
)

# Reference row count: neither merge below may drop or duplicate patients.
old_clinical_len = len(clinical)

# Get ROI name for each segmentation mask
ROI_INFO_FILENAME = 'INFO_GTVcontours_HN.csv'
roi_info = pd.read_csv(RAW_DATA_PATH / ROI_INFO_FILENAME)
roi_info.rename({'roi_name': 'ROI_name'}, axis=1, inplace=True)
clinical = clinical.merge(roi_info, on='patient')
# Explicit raise instead of `assert`: the check must survive `python -O`.
if len(clinical) != old_clinical_len:
    raise ValueError(
        f'Clinical file and {ROI_INFO_FILENAME} differ in patients.'
    )

# Get ROI modality from the series description of the RTSTRUCT rows.
# `.copy()` makes this an independent frame, so adding a column below does not
# write into a slice of `metadata` (SettingWithCopyWarning).
series_description = metadata.loc[
    metadata['Modality'] == 'RTSTRUCT', ['Subject ID', 'Series Description']
].copy()
modality_dict = {
    'RTstruct_CTsim->CT(PET-CT)': 'CT',
    'RTstruct_CTsim->PET(PET-CT)': 'PT',
}
# Fail with a message naming the offending values instead of a bare KeyError.
unknown_descriptions = set(series_description['Series Description']) - set(
    modality_dict
)
if unknown_descriptions:
    raise KeyError(
        'Unexpected RTSTRUCT series description(s): '
        f'{sorted(map(str, unknown_descriptions))}'
    )
series_description['ROI_modality'] = series_description['Series Description'].map(
    modality_dict
)
series_description = series_description[['Subject ID', 'ROI_modality']]
# NOTE: merging with left_on/right_on keeps both key columns, so the saved
# table also carries a 'Subject ID' column next to 'patient'.
clinical = clinical.merge(
    series_description, left_on='patient', right_on='Subject ID'
)
if len(clinical) != old_clinical_len:
    raise ValueError('Clinical file and metadata file differ in patients.')

clinical.to_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN, index=False)
elif DATASET_NAME == 'HN_BZ':
CLINICAL_DATA_FILENAME = 'pz_HN_BZ.csv'
......@@ -357,6 +383,8 @@ elif DATASET_NAME == 'HN_BZ':
# Every row of `clinical` gets the same ROI name and ROI modality.
# NOTE(review): the two columns are set on `clinical`, but the frame written
# below is `clinical_no_na`, whose definition is outside this view -- confirm
# that it actually picks up the new columns.
clinical['ROI_name'] = 'lesione'
clinical['ROI_modality'] = 'PT'
# Save the cleaned clinical table without the index column.
clinical_no_na.to_csv(
PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN, index=False
)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment