MPBA Radiomics / RADLER

Commit 7916909e
Authored Mar 05, 2020 by Alessia Marcolini

Standardize dataset structure and associated clinical file

parent 4b221185
Changes: 2
preprocessing/prepare_clinical.py  (new file, 0 → 100644)
#%% [markdown]
# Create clinical file with (at least) the following columns:
# * filename
# * patient
# * label(s)

#%%
import pandas as pd
from pathlib import Path
import os
import sys
import numpy as np
import itertools
from tqdm import tqdm

from utils import remove_na, remove_constant_cols

# os.chdir('..')
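
# The two helpers imported from the project-local `utils` module are not part of this commit.
# A minimal sketch of what they are assumed to do, inferred only from how they are called below
# (the actual implementations may differ):
#
#   def remove_na(df, columns):
#       """Drop rows that have missing values in the given columns."""
#       return df.dropna(subset=columns)
#
#   def remove_constant_cols(df):
#       """Drop columns that hold a single unique value."""
#       return df.loc[:, df.nunique(dropna=False) > 1]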
# %%
DATASET_NAME = 'HN_val'

PROJECT_DATA_PATH = Path('data') / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
CLINICAL_DATA_FILENAME_CLEAN = f'clinical_{DATASET_NAME}.csv'

FILENAME_COL = 'filename'
PATIENT_COL = 'patient'
LABEL_COL_LOCOREGIONAL = 'locoregional'
LABEL_COL_T_STAGE_GROUPED = 'T-stage_grouped'
LABEL_COL_T_STAGE_BINARY = 'T-stage_binary'

os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
# Tx T-stages cluster with low T-stages in the UMAP projection
grading_dict_binary = {
    'T1': '0',
    'T2': '0',
    'T3': '1',
    'T4': '1',
    'T4a': '1',
    'T4b': '1',
    'Tx': '0',
}

grading_dict_grouped = {
    'T1': '0',
    'T2': '1',
    'T3': '2',
    'T4': '3',
    'T4a': '3',
    'T4b': '3',
    'Tx': '0',
}
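
# In other words, grading_dict_binary maps every T3/T4 stage (including the 'a'/'b' sub-stages)
# to '1' and the remaining stages to '0', while grading_dict_grouped keeps four ordinal levels;
# in both mappings 'Tx' is assigned to the lowest group, following the UMAP observation above.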
if DATASET_NAME == 'HN_val':
    # prepare clinical file
    CLINICAL_DATA_FILENAME = 'INFOclinical_HN_Version2_30may2018.xlsx'

    # get all sheets as a dict of DataFrames
    clinical_dict = pd.read_excel(RAW_DATA_PATH / CLINICAL_DATA_FILENAME, sheet_name=None)

    clinical_sheets = []
    for sheet_name in clinical_dict.keys():
        if not sheet_name == 'Excluded':
            clinical_sheet = clinical_dict[sheet_name]
            clinical_sheet = remove_na(clinical_sheet, columns=['Patient #'])

            # remove the authors' original notes
            clinical_sheet = clinical_sheet.loc[
                clinical_sheet['Patient #'].apply(lambda x: x.startswith('HN'))
            ]
            clinical_sheets.append(clinical_sheet)

    clinical = pd.concat(clinical_sheets, ignore_index=True, sort=True)
    clinical = remove_constant_cols(clinical)
    clinical.sort_values(by='Patient #', inplace=True)
    clinical.rename(
        {'Patient #': PATIENT_COL, 'Locoregional': LABEL_COL_LOCOREGIONAL},
        inplace=True,
        axis=1,
    )
    clinical[FILENAME_COL] = clinical[PATIENT_COL].apply(lambda x: x + '.npy')
    clinical.set_index(PATIENT_COL, inplace=True)

    # compute new label columns
    clinical[LABEL_COL_T_STAGE_BINARY] = clinical['T-stage'].apply(lambda x: grading_dict_binary[x])
    clinical[LABEL_COL_T_STAGE_GROUPED] = clinical['T-stage'].apply(lambda x: grading_dict_grouped[x])

    clinical.to_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN)
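
    # Sanity check (not part of the original script): after the steps above, the processed clinical
    # table should expose the filename column and the two derived T-stage label columns created here.
    # This assertion is only an illustration of the expected schema.
    expected_cols = {FILENAME_COL, LABEL_COL_T_STAGE_BINARY, LABEL_COL_T_STAGE_GROUPED}
    assert expected_cols.issubset(clinical.columns), f'missing columns: {expected_cols - set(clinical.columns)}'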
    ################### clean metadata file to keep only CT scans, PT scans and 1 segmentation mask
    METADATA_FILENAME = 'tcia_original_metadata_HN_val.csv'
    metadata = pd.read_csv(RAW_DATA_PATH / METADATA_FILENAME).sort_values(by='Subject ID')

    # discard all modalities other than CT, PT and RTSTRUCT
    metadata = metadata.loc[metadata['Modality'].isin(['CT', 'PT', 'RTSTRUCT'])]

    # remove rows with missing values in the 'Study Description' and 'Series Description' columns
    metadata = remove_na(metadata, columns=['Study Description', 'Series Description'])

    # remove rows where 'TomoTherapy' is found (we are interested in scans and masks acquired before radiation therapy)
    metadata = metadata.loc[metadata['Study Description'].apply(lambda x: 'TomoTherapy' not in x)]
    metadata = metadata.loc[metadata['Manufacturer'] != 'TomoTherapy Incorporated']

    # to make the masks uniform, we remove the ones NOT produced by the MIM Software manufacturer
    metadata = metadata.loc[
        metadata.apply(
            lambda x: x['Manufacturer'] == 'MIM Software Inc.' or x['Modality'] != 'RTSTRUCT',
            axis=1,
        )
    ]
    # NB: some patients have more than one segmentation mask! We need to choose one.
    # find the patients with more than one segmentation ...
    indices_multiple_rs = np.where(
        metadata.loc[metadata['Modality'] == 'RTSTRUCT']
        .groupby('Subject ID')
        .count()['Series Description']
        > 1
    )[0]
    patients_multiple_rs = (
        metadata.loc[metadata['Modality'] == 'RTSTRUCT']
        .groupby('Subject ID')
        .count()['Series Description']
        > 1
    ).index.values[indices_multiple_rs]
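
    # Illustration only (not part of the original pipeline): the same set of patients can be
    # obtained with a more direct pandas idiom; kept here purely to clarify the selection above.
    _rs_counts = (
        metadata.loc[metadata['Modality'] == 'RTSTRUCT']
        .groupby('Subject ID')['Series Description']
        .count()
    )
    _patients_multiple_rs_check = _rs_counts[_rs_counts > 1].index.values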
    # ... and find the corresponding metadata
    metadata_multiple_rs = metadata.loc[metadata['Subject ID'].isin(patients_multiple_rs)]

    # For these patients, based on the metadata, we decided to drop every segmentation other than 'RTstruct_CTsim->CT(PET-CT)'
    indices_to_drop = metadata_multiple_rs.loc[
        ~metadata_multiple_rs.apply(
            lambda x: x['Modality'] != 'RTSTRUCT'
            or x['Series Description'] == 'RTstruct_CTsim->CT(PET-CT)',
            axis=1,
        )
    ].index.values
    metadata.drop(indices_to_drop, axis=0, inplace=True)
    # Now we have only one segmentation for each patient.
    # However, some patients have multiple CT folders, and we need to choose one.

    # Find patients with multiple CT folders
    patients_multiple_CT = (metadata.groupby('Subject ID').count()['Modality'] > 3).index.values[
        np.where(metadata.groupby('Subject ID').count()['Modality'] > 3)[0]
    ]

    # For these patients, group by number of images (both CT and PET) and 'Study Description'.
    # Most of these patients have the same number of PET and CT scans with the same 'Study Description'.
    patients_study_nimages_count = (
        metadata.loc[
            metadata.apply(
                lambda x: x['Subject ID'] in patients_multiple_CT and x['Modality'] != 'RTSTRUCT',
                axis=1,
            )
        ]
        .groupby(['Subject ID', 'Study Description', 'Number of images'])
        .count()
    )
    patients_study_nimages_count.reset_index(inplace=True)
    # Find the patients for which the number of CT scans is different from the number of PET scans.
    # Most of these patients have only two values of 'Study Description'.
    # We pick the patients without a matching number of PET and CT scans (denoted as ambiguous).
    patients_nimages_study_uneven_boolean = (
        patients_study_nimages_count[['Subject ID', 'Number of images']]
        .loc[patients_study_nimages_count['Series Description'] == 1]
        .groupby('Subject ID')
        .count()['Number of images']
        != 1
    )
    patients_nimages_study_uneven_boolean = patients_nimages_study_uneven_boolean.reset_index()
    patients_ambiguous = patients_nimages_study_uneven_boolean.iloc[
        np.where(patients_nimages_study_uneven_boolean['Number of images'])[0]
    ]['Subject ID'].values

    # Find the combination Subject ID - Number of images to be dropped (because the CT and PET numbers don't match).
    patients_nimages_uneven_CT = patients_study_nimages_count[
        ['Subject ID', 'Number of images']
    ].loc[patients_study_nimages_count['Series Description'] == 1]
    patients_nimages_uneven_CT.reset_index(inplace=True, drop=True)

    # Find the rows in `patients_nimages_uneven_CT` that correspond to ambiguous patients, to be dropped from metadata later.
    rows_to_drop = np.where(patients_nimages_uneven_CT['Subject ID'].isin(patients_ambiguous))[0]
    patients_nimages_uneven_CT.drop(rows_to_drop, inplace=True)
    # Filter out rows whose patient and number of images are marked to be dropped
    metadata = metadata.loc[
        metadata.apply(
            lambda x: (x['Subject ID'], x['Number of images'])
            not in patients_nimages_uneven_CT.itertuples(index=False, name=None),
            axis=1,
        )
    ]

    # Notice that the ambiguous patients are still here.
    metadata_ambiguous_patients = metadata.loc[
        metadata['Subject ID'].isin(patients_ambiguous)
        & ((metadata['Modality'] == 'CT') | (metadata['Modality'] == 'PT'))
    ]

    # As a rule to pick the CT scan folder, we decided to keep the CT whose 'Study Description' matches the PT 'Study Description'.
    ambiguous_patient_count_study_boolean = (
        metadata_ambiguous_patients.groupby(['Subject ID', 'Study Description']).count()['Modality'] == 1
    )
    ambiguous_patient_count_study_boolean = ambiguous_patient_count_study_boolean.reset_index()

    # Find the combination Subject ID - Study Description to be dropped (because the CT's and PET's Study Description don't match).
    ambiguous_patients_unmatch_study = ambiguous_patient_count_study_boolean.iloc[
        np.where(ambiguous_patient_count_study_boolean['Modality'])[0]
    ][['Subject ID', 'Study Description']]

    # Filter out rows whose patient and Study Description are marked to be dropped, then save
    metadata = metadata.loc[
        metadata.apply(
            lambda x: (x['Subject ID'], x['Study Description'])
            not in ambiguous_patients_unmatch_study.itertuples(index=False, name=None),
            axis=1,
        )
    ]
    # reconstruct path to raw data
    metadata['Study Date'] = pd.to_datetime(metadata['Study Date'], yearfirst=True)
    metadata['Study Date'] = metadata['Study Date'].apply(lambda ts: ts.strftime("%m-%d-%Y"))

    # We need to sanitize the 'Study Description' column because some characters were removed from the folder names.
    # First, compute all the possible characters in the 'Study Description' column
    all_chars_study = set(list(''.join(metadata['Study Description'].values)))

    # Secondly, find all the valid characters (i.e. those that appear in the folder names)
    valid_chars_study = []
    for patient in os.listdir(RAW_DATA_PATH):
        if not patient.endswith('xlsx') and not patient.endswith('csv'):
            for folder in os.listdir(RAW_DATA_PATH / patient):
                valid_chars_study += list(folder)
    valid_chars_study = set(valid_chars_study)

    chars_to_remove_study = all_chars_study - valid_chars_study
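
    # Illustration only (values are hypothetical, not taken from the actual dataset): if a
    # 'Study Description' reads 'CT/PET^HEAD' while the corresponding folder on disk is named
    # 'CTPETHEAD', the set difference above yields chars_to_remove_study == {'/', '^'}.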
    # The same sanitization needs to be done for the 'Series Description' column
    all_chars_series = set(list(''.join(metadata['Series Description'].values)))

    valid_chars_series = []
    for patient in os.listdir(RAW_DATA_PATH):
        if not patient.endswith('xlsx') and not patient.endswith('csv'):
            for folder in os.listdir(RAW_DATA_PATH / patient):
                for series in os.listdir(RAW_DATA_PATH / patient / folder):
                    valid_chars_series += list(series)
    valid_chars_series = set(valid_chars_series)

    chars_to_remove_series = all_chars_series - valid_chars_series
    # For each row in the metadata, retrieve the data path and save it in a new column named 'dicom_folder'
    dicom_folders = []
    for i, row in tqdm(metadata.iterrows()):
        patient = row['Subject ID']
        study_uid = row['Study UID'][-5:]
        study_description = row['Study Description']
        study_date = row['Study Date']
        series_description = row['Series Description']

        study_description = ''.join(
            [c for c in list(study_description) if c not in chars_to_remove_study]
        )
        series_description = ''.join(
            [c for c in list(series_description) if c not in chars_to_remove_series]
        )
        # print(study_description)
        # break

        path_to_study = Path(patient) / f'{study_date}-{study_description}-{study_uid}'
        # print(path_to_study)

        folders = [f for f in os.listdir(RAW_DATA_PATH / path_to_study) if series_description in f]
        if len(folders) == 1:
            folder = folders[0]
            # print(folder)
        elif len(folders) == 0:
            print(f'Empty folder {path_to_study}', file=sys.stderr)
        else:
            print(
                f'Multiple matching folders for {series_description} in {path_to_study}',
                file=sys.stderr,
            )
            # print('\t', folders)

        dicom_folder = str(path_to_study / folder)
        dicom_folders.append(dicom_folder)

    # TODO: finish reconstructing the path
    metadata['dicom_folder'] = dicom_folders
    metadata.to_csv(PROCESSED_DATA_PATH / 'path_original_data.csv', index=False)
elif DATASET_NAME == 'HN_BZ':
    CLINICAL_DATA_FILENAME = 'pz_HN_BZ.csv'
    clinical = pd.read_csv(RAW_DATA_PATH / CLINICAL_DATA_FILENAME, sep=';')

    clinical_no_na = remove_na(clinical, columns=[LABEL_COL_LOCOREGIONAL])
    clinical_no_na[LABEL_COL_LOCOREGIONAL] = clinical_no_na[LABEL_COL_LOCOREGIONAL].astype(np.uint8)
    # clinical_no_na.sort_values(by=PATIENT_COL, inplace=True)
    clinical_no_na.to_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN, index=False)

# %%

# %%
preprocessing/prepare_dataset.py  (new file, 0 → 100644)
#%% [markdown]
# Create dataset with the following structure:
# ```bash
# ├── data
# │   ├── <dataset name>
# │   │   ├── processed
# │   │   │   ├── dcm
# │   │   │   │   ├── <patient ID>
# │   │   │   │   │   ├── CT
# │   │   │   │   │   ├── PT
# │   │   │   │   │   └── RTSTRUCT
# │   │   │   │   ├── ...
# │   │   │   │   └── <patient ID>
# │   │   │   │       ├── CT
# │   │   │   │       ├── PT
# │   │   │   │       └── RTSTRUCT
# │   │   │   ├── bbox
# │   │   │   │   └── <subdatasets>
# │   │   │   └── <processed clinical file>
# │   │   └── raw
# │   │       ├── <original data>
# │   │       └── <original clinical / metadata files>
# ```

# %%
import pandas as pd
from pathlib import Path
import os
import sys
import shutil
from tqdm import tqdm
# %%
DATASET_NAME = 'HN_val'

PROJECT_DATA_PATH = Path('data') / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
PROCESSED_DCM_PATH = PROCESSED_DATA_PATH / 'dcm'

os.makedirs(PROCESSED_DCM_PATH, exist_ok=False)
if DATASET_NAME == 'HN_val':
    # TODO: read path from path_original_data.csv and not from summary.csv
    DATASET_DESCRIPTION_FILE = 'path_original_data.csv'
    dataset_description = pd.read_csv(PROCESSED_DATA_PATH / DATASET_DESCRIPTION_FILE)

    for i, row in tqdm(dataset_description.iterrows()):
        patient = row['Subject ID']
        old_files_dir = RAW_DATA_PATH / row['dicom_folder']
        modality = row['Modality']
        n_slices = row['Number of images']

        new_files_dir = PROCESSED_DCM_PATH / patient / modality
        # os.makedirs(new_files_dir, exist_ok=True)

        files = os.listdir(old_files_dir)
        if len(files) != int(n_slices):
            print(
                f'The number of files in folder {old_files_dir} ({len(files)}) is different from the one declared in the metadata file ({int(n_slices)}).',
                file=sys.stderr,
            )

        shutil.copytree(old_files_dir, new_files_dir)
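
    # After the loop, each patient's slices live under data/<dataset name>/processed/dcm/<patient ID>/<modality>/,
    # matching the directory tree sketched in the header cell. shutil.copytree creates new_files_dir
    # (including intermediate directories) itself, which is presumably why the explicit os.makedirs
    # call above is left commented out.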
# %%