Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
MPBA Radiomics
RADLER
Commits
668dee72
Commit
668dee72
authored
Mar 06, 2020
by
Alessia Marcolini
Browse files
Add ROI modality column to clinical file
parent
56e662df
Changes
1
Show whitespace changes
Inline
Side-by-side
preprocessing/prepare_clinical.py
View file @
668dee72
...
...
@@ -54,57 +54,6 @@ grading_dict_grouped = {
}
if DATASET_NAME == 'HN_val':
    # ---- Clinical file -------------------------------------------------
    # sheet_name=None makes read_excel return every sheet of the workbook
    # as a dict of DataFrames keyed by sheet name.
    CLINICAL_DATA_FILENAME = 'INFOclinical_HN_Version2_30may2018.xlsx'
    clinical_dict = pd.read_excel(
        RAW_DATA_PATH / CLINICAL_DATA_FILENAME,
        sheet_name=None,
    )

    clinical_sheets = []
    for sheet_name in clinical_dict:
        # The 'Excluded' sheet is not part of the clinical table.
        if sheet_name == 'Excluded':
            continue
        clinical_sheet = remove_na(clinical_dict[sheet_name], columns=['Patient #'])
        # Drop the authors' original notes: keep only the rows whose
        # 'Patient #' value starts with 'HN'.
        clinical_sheet = clinical_sheet.loc[
            clinical_sheet['Patient #'].apply(
                lambda patient_id: patient_id.startswith('HN')
            )
        ]
        clinical_sheets.append(clinical_sheet)

    # Stack the kept sheets into one table with a fresh integer index and
    # alphabetically sorted columns, then order the rows by patient id.
    clinical = pd.concat(clinical_sheets, ignore_index=True, sort=True)
    clinical = remove_constant_cols(clinical)
    clinical = clinical.sort_values(by='Patient #')
    clinical = clinical.rename(
        columns={
            'Patient #': PATIENT_COL,
            'Locoregional': LABEL_COL_LOCOREGIONAL,
        }
    )

    # One '<patient id>.npy' file name per patient.
    clinical[FILENAME_COL] = clinical[PATIENT_COL].apply(
        lambda patient_id: patient_id + '.npy'
    )
    clinical = clinical.set_index(PATIENT_COL)

    # Derived label columns: look each T-stage up in the two grading
    # dictionaries (a T-stage missing from a dictionary raises KeyError).
    clinical[LABEL_COL_T_STAGE_BINARY] = clinical['T-stage'].apply(
        lambda stage: grading_dict_binary[stage]
    )
    clinical[LABEL_COL_T_STAGE_GROUPED] = clinical['T-stage'].apply(
        lambda stage: grading_dict_grouped[stage]
    )

    # Row count before merging, used to check that the merge neither drops
    # nor duplicates rows.
    old_clinical_len = len(clinical)

    # ROI name for each segmentation mask.
    ROI_INFO_FILENAME = 'INFO_GTVcontours_HN.csv'
    roi_info = pd.read_csv(RAW_DATA_PATH / ROI_INFO_FILENAME)
    clinical = pd.merge(clinical, roi_info, on='patient')
    assert len(clinical) == old_clinical_len

    clinical.to_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN)

    ################### clean metadata file to keep only CT scans, PT scans and 1 segmentation mask
    METADATA_FILENAME = 'tcia_original_metadata_HN_val.csv'
...
...
@@ -345,6 +294,83 @@ if DATASET_NAME == 'HN_val':
# Persist the cleaned metadata table (no index column in the CSV).
metadata.to_csv(PROCESSED_DATA_PATH / 'path_original_data.csv', index=False)

##############################
# Prepare the clinical file.
# sheet_name=None makes read_excel return every sheet of the workbook as a
# dict of DataFrames keyed by sheet name.
CLINICAL_DATA_FILENAME = 'INFOclinical_HN_Version2_30may2018.xlsx'
clinical_dict = pd.read_excel(
    RAW_DATA_PATH / CLINICAL_DATA_FILENAME,
    sheet_name=None,
)

clinical_sheets = []
for sheet_name in clinical_dict:
    # The 'Excluded' sheet is not part of the clinical table.
    if sheet_name == 'Excluded':
        continue
    clinical_sheet = remove_na(clinical_dict[sheet_name], columns=['Patient #'])
    # Drop the authors' original notes: keep only the rows whose
    # 'Patient #' value starts with 'HN'.
    clinical_sheet = clinical_sheet.loc[
        clinical_sheet['Patient #'].apply(
            lambda patient_id: patient_id.startswith('HN')
        )
    ]
    clinical_sheets.append(clinical_sheet)

# Stack the kept sheets into one table with a fresh integer index and
# alphabetically sorted columns, then order the rows by patient id.
clinical = pd.concat(clinical_sheets, ignore_index=True, sort=True)
clinical = remove_constant_cols(clinical)
clinical = clinical.sort_values(by='Patient #')
clinical = clinical.rename(
    columns={
        'Patient #': PATIENT_COL,
        'Locoregional': LABEL_COL_LOCOREGIONAL,
    }
)

# One '<patient id>.npy' file name per patient.
clinical[FILENAME_COL] = clinical[PATIENT_COL].apply(
    lambda patient_id: patient_id + '.npy'
)
clinical = clinical.set_index(PATIENT_COL)

# Derived label columns: look each T-stage up in the two grading
# dictionaries (a T-stage missing from a dictionary raises KeyError).
clinical[LABEL_COL_T_STAGE_BINARY] = clinical['T-stage'].apply(
    lambda stage: grading_dict_binary[stage]
)
clinical[LABEL_COL_T_STAGE_GROUPED] = clinical['T-stage'].apply(
    lambda stage: grading_dict_grouped[stage]
)

# Row count before any merge; the merges that follow are checked against it.
old_clinical_len = len(clinical)
# Attach the ROI name of each segmentation mask to the clinical table.
ROI_INFO_FILENAME = 'INFO_GTVcontours_HN.csv'
# Load the contour info and normalise the column name to 'ROI_name'.
roi_info = pd.read_csv(RAW_DATA_PATH / ROI_INFO_FILENAME)
roi_info = roi_info.rename(columns={'roi_name': 'ROI_name'})

clinical = pd.merge(clinical, roi_info, on='patient')
# An inner merge must keep exactly the rows the clinical table had before.
assert (
    len(clinical) == old_clinical_len
), f'Clinical file and {ROI_INFO_FILENAME} differ in patients.'
# Get ROI modality.
# Take the subject id and series description of every RTSTRUCT row of the
# metadata table. A single .loc selection followed by .copy() yields an
# independent frame, so adding a column below does not write into a slice of
# `metadata` (the chained-indexing form triggers SettingWithCopyWarning).
series_description = metadata.loc[
    metadata['Modality'] == 'RTSTRUCT',
    ['Subject ID', 'Series Description'],
].copy()

# Series description of the RTSTRUCT -> modality the ROI refers to.
modality_dict = {
    'RTstruct_CTsim->CT(PET-CT)': 'CT',
    'RTstruct_CTsim->PET(PET-CT)': 'PT',
}

# Fail with the full list of unmapped descriptions instead of a bare
# KeyError raised from inside the per-row lookup.
unknown_descriptions = set(series_description['Series Description']) - set(
    modality_dict
)
if unknown_descriptions:
    raise KeyError(
        'Unknown RTSTRUCT series description(s): '
        f'{sorted(map(str, unknown_descriptions))}'
    )

series_description['ROI_modality'] = series_description[
    'Series Description'
].apply(lambda description: modality_dict[description])
series_description = series_description[['Subject ID', 'ROI_modality']]

# NOTE(review): merging with left_on/right_on keeps both key columns, so
# 'Subject ID' ends up in the written CSV next to 'patient' - confirm that
# this is wanted.
clinical = clinical.merge(
    series_description, left_on='patient', right_on='Subject ID'
)
# An inner merge must keep exactly the rows the clinical table had before.
assert old_clinical_len == len(
    clinical
), 'Clinical file and metadata file differ in patients.'

clinical.to_csv(PROCESSED_DATA_PATH / CLINICAL_DATA_FILENAME_CLEAN, index=False)
elif
DATASET_NAME
==
'HN_BZ'
:
CLINICAL_DATA_FILENAME
=
'pz_HN_BZ.csv'
...
...
@@ -357,6 +383,8 @@ elif DATASET_NAME == 'HN_BZ':
# HN_BZ branch: set a constant ROI name ('lesione') and a constant ROI
# modality ('PT') on every row of `clinical`, then write `clinical_no_na`
# to the cleaned clinical CSV without an index column.
# NOTE(review): the two columns are assigned on `clinical`, but the frame
# written to disk is `clinical_no_na`. How `clinical_no_na` is derived is not
# visible here - confirm it shares these columns with `clinical`; otherwise
# 'ROI_name' / 'ROI_modality' will be missing from the output file.
clinical
[
'ROI_name'
]
=
'lesione'
clinical
[
'ROI_modality'
]
=
'PT'
clinical_no_na
.
to_csv
(
PROCESSED_DATA_PATH
/
CLINICAL_DATA_FILENAME_CLEAN
,
index
=
False
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment