Commit dd49a005 authored by Alessia Marcolini

Rename folder according to execution order

parent 2c7a1343
#%%
import os
import pandas as pd
import numpy as np
from functools import reduce
from pathlib import Path
import shutil
# %%
DATASETS = ['HN_val', 'HN_BZ']
# %%
# find union names in datasets
DATASETS_NAME_UNION = list(
    reduce(
        lambda x, y: x.union(y),
        [set(dataset.split('_')) for dataset in DATASETS],
    )
)
# rearrange name order (HN should be first)
DATASETS_NAME_UNION.insert(0, DATASETS_NAME_UNION.pop(DATASETS_NAME_UNION.index('HN')))
NEW_DATASET_NAME = '_'.join(DATASETS_NAME_UNION)
os.makedirs(f'data/{NEW_DATASET_NAME}/processed/bbox/', exist_ok=True)
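# %%
# Illustrative check (not in the original script): with DATASETS = ['HN_val', 'HN_BZ'],
# the name parts union to {'HN', 'val', 'BZ'}; after the reorder above, 'HN' is
# guaranteed to lead, but the remaining parts follow arbitrary set order.
print(NEW_DATASET_NAME)  # e.g. 'HN_val_BZ' or 'HN_BZ_val'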
#%% [markdown]
# Merge and create a new file clinical
#%%
clinicals = []
n_patients = []
dataset_name = []

for dataset in DATASETS:
    clinical = pd.read_csv(f'data/{dataset}/processed/clinical_{dataset}.csv')
    n_patients.append(len(clinical.index))
    dataset_name.append(list(np.repeat(dataset, len(clinical))))
    clinicals.append(clinical)

# keep only the columns shared by all datasets
merged_clinical = pd.concat(clinicals, join='inner')
merged_clinical['dataset'] = [item for sublist in dataset_name for item in sublist]
merged_clinical.to_csv(
    f'data/{NEW_DATASET_NAME}/processed/clinical_{NEW_DATASET_NAME}.csv', index=False
)
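# %%
# Note on join='inner' above (illustrative sketch, toy column names):
#   a = pd.DataFrame({'patient': ['p1'], 'age': [60]})
#   b = pd.DataFrame({'patient': ['p2'], 'sex': ['M']})
#   pd.concat([a, b], join='inner')   # keeps only the shared 'patient' column
# so the merged clinical file contains just the columns common to every dataset.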
# %%
@@ -45,9 +45,7 @@ if DATASET_NAME == 'HN_val':
 dataset_description = pd.read_csv(PROCESSED_DATA_PATH / DATASET_DESCRIPTION_FILE)
-available_patients = patients = [
-    f for f in os.listdir(DCM_DIR) if os.path.isdir(DCM_DIR / f)
-]
+available_patients = [f for f in os.listdir(DCM_DIR) if os.path.isdir(DCM_DIR / f)]
 for i, row in tqdm(dataset_description.iterrows()):
     patient = row['Subject ID']
...
import pandas as pd
import SimpleITK as sitk
def remove_na(dataframe, columns):
    """Drop rows with missing values in any of the given columns."""
    dataframe.dropna(subset=columns, inplace=True)
    return dataframe


def remove_constant_cols(dataframe):
    """Drop columns that hold a single unique value."""
    return dataframe.loc[:, dataframe.apply(pd.Series.nunique) != 1]


def sitk_to_numpy(image):
    return sitk.GetArrayFromImage(image)


def filter_outbounds(image, range_pixel):
    """Clamp intensities to [range_pixel[0], range_pixel[1]]; the sentinel
    bounds -2000/5000 assume raw values already lie within that range."""
    # anything above the upper bound -> upper bound
    image = sitk.Threshold(
        image, lower=-2000, upper=range_pixel[1], outsideValue=range_pixel[1]
    )
    # anything below the lower bound -> lower bound
    image = sitk.Threshold(
        image, lower=range_pixel[0], upper=5000, outsideValue=range_pixel[0]
    )
    return image
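

if __name__ == '__main__':
    # Minimal self-test sketch (not part of the original module): clamp a tiny
    # synthetic image to the hypothetical window [-1000, 400]; the test values
    # stay inside the function's assumed raw range of [-2000, 5000].
    import numpy as np

    demo = sitk.GetImageFromArray(np.array([[[-1500.0, 0.0, 3000.0]]]))
    demo = filter_outbounds(demo, range_pixel=(-1000, 400))
    print(sitk_to_numpy(demo))  # expected: [[[-1000. 0. 400.]]]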
#%%
import os
from pathlib import Path

import numpy as np
import pandas as pd
import SimpleITK as sitk
from radiomics import featureextractor
from tqdm import tqdm

from multiproc import ListMultiprocessing
# %%
DATASET = Path('HN_val')
BBOX = 'bbox_64'
VOXEL_SPACING = (1.0, 1.0, 1.0)
SCAN_NAMES = ['CT', 'PT']  # CT must come first: shape features are computed on modalities[0]
N_JOBS = 32
N_BINS = [8, 16, 32, 64]
PIXEL_SPACING = [1.0, 2.0, 3.0, 4.0, 5.0]
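# NB: the texture grid below amounts to len(PIXEL_SPACING) * len(N_BINS)
# * 2 bin modes * len(SCAN_NAMES) = 5 * 4 * 2 * 2 = 80 texture extractions
# per patient, hence the N_JOBS worker processes further down.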
#%%
DATADIR = Path('data') / DATASET / 'processed' / 'bbox' / BBOX
OUTDIR = Path('data') / DATASET / 'processed'
OUTFILE = f'radiomics_features_{BBOX}_prova.csv' # output file name
os.chdir('..')
clinical = pd.read_csv(Path('data') / DATASET / 'processed' / f'clinical_{DATASET}.csv')
#%%
params_shape = 'shape.yaml' # param file to use to create the extractor
extractor_shape = featureextractor.RadiomicsFeaturesExtractor(params_shape)
params_intensity = 'intensity.yaml' # param file to use to create the extractor
extractor_intensity = featureextractor.RadiomicsFeaturesExtractor(params_intensity)
params_texture = 'texture.yaml' # param file to use to create the extractor
extractor_texture = featureextractor.RadiomicsFeaturesExtractor(params_texture)
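#%%
# Optional sanity check (sketch, not in the original script): a pyradiomics
# extractor exposes its parsed configuration, so one can confirm each YAML
# enables only the intended feature class.
print(extractor_shape.enabledFeatures)      # expect only the 'shape' class
print(extractor_intensity.enabledFeatures)  # expect only 'firstorder'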
filenames = [f for f in os.listdir(DATADIR) if f.endswith('.npy')][:3]  # NB: capped at the first 3 files (cf. the '_prova' output name)
exclude_list = []
# Previously excluded for bad SUV conversion:
# exclude_list = [
#     'HN-CHUM-011', 'HN-HMR-008', 'HN-CHUS-043',
#     'HN-HMR-002', 'HN-CHUM-010', 'HN-HGJ-044',
# ]
# Also flagged: 'HN-CHUM-020', 'HN-CHUS-093', 'HN-HMR-001', 'HN-HMR-007', 'HN-HMR-017'
#%%
def process_file(filename):
    broken = False
    data = np.load(f'{DATADIR}/{filename}')
    file_no_ext = filename.split('.')[0]
    if file_no_ext not in exclude_list:
        n_modalities = data.shape[0]
        modalities = []
        for i in range(n_modalities):
            curr_modality = sitk.GetImageFromArray(data[i, :, :, :].astype(np.float32))
            curr_modality.SetSpacing(VOXEL_SPACING)
            modalities.append(curr_modality)
        has_label = np.max(data[-1, :, :, :]) > 0
        if has_label:
            segmentation = sitk.GetImageFromArray(data[-1, :, :, :].astype(np.uint8))
            segmentation.SetSpacing(VOXEL_SPACING)
            feature = {}
            # shape features (computed once, on the first modality: the CT)
            result = extractor_shape.execute(modalities[0], segmentation)
            for key, value in result.items():
                if not key.startswith("general_"):
                    feature[key] = value
            for modality, name in zip(modalities, SCAN_NAMES):
                # intensity features
                result = extractor_intensity.execute(modality, segmentation)
                for key, value in result.items():
                    if not key.startswith("general_"):
                        feature[f'{key}_{name}'] = value
                # texture features over the spacing x binning grid
                for P in PIXEL_SPACING:
                    if not broken:
                        extractor_texture.settings['resampledPixelSpacing'] = [P, P, P]
                        for NB in N_BINS:
                            if not broken:
                                extractor_texture.settings['binCount'] = NB
                                # uniform binning
                                extractor_texture.settings['binMode'] = 'uniform'
                                try:
                                    result = extractor_texture.execute(modality, segmentation)
                                    for key, value in result.items():
                                        if not key.startswith("general_"):
                                            feature[f'{key}_{P}_{NB}_uniform_{name}'] = value
                                    # equalized binning
                                    extractor_texture.settings['binMode'] = 'equal'
                                    result = extractor_texture.execute(modality, segmentation)
                                    for key, value in result.items():
                                        if not key.startswith("general_"):
                                            feature[f'{key}_{P}_{NB}_equalized_{name}'] = value
                                except Exception:  # bare except narrowed
                                    broken = True
            if not broken:
                feature['filename'] = filename
                patient = clinical.loc[clinical['filename'] == filename]['patient'].values[0]
                feature['patient'] = patient
                return feature
    return {}  # excluded, unlabeled, or failed files yield an empty record
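#%%
# Optional serial smoke test (sketch, not part of the original flow): running a
# single file outside the pool surfaces any traceback directly before
# committing to N_JOBS workers.
if filenames:
    _probe = process_file(filenames[0])
    print(f'{len(_probe)} features extracted from {filenames[0]}')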
#%% MULTIPROCESS
multiproc = ListMultiprocessing(process_file, N_JOBS)
multiproc.execute(filenames)
result = multiproc.get_result()
print('done')
#%% Save features in a Pandas df
features = pd.DataFrame(result)
features.set_index('filename', inplace=True)
print('Number of files: ', features.shape[0])
print('Number of features: ', features.shape[1])
# %%
features.to_csv(OUTDIR / OUTFILE)
#
# Damiana
#
# firstorder-only parameter file (presumably intensity.yaml, cf. params_intensity above)
setting:
  label: 1
  interpolator: 'sitkBSpline' # This is an enumerated value, here None is not allowed
  weightingNorm: 'euclidean'
  normalize: False
# Image types to use: "Original" for unfiltered image, for possible filters, see documentation.
imageType:
  Original: {}
# Feature classes from which features must be calculated. If a feature class is not mentioned, no features are calculated
# for that class. Otherwise, the specified features are calculated, or, if none are specified, all are calculated (excluding redundant/deprecated features).
featureClass:
  firstorder:
from multiprocessing import Manager, Pool, cpu_count
from contextlib import closing
import sys


class ListMultiprocessing(object):
    """Map a function over a list, spreading chunks across worker processes."""

    def __init__(self, function, n_cpu=None, verbose=True):
        super().__init__()
        manager = Manager()
        self.function = function
        self.n_cpu = cpu_count() if n_cpu is None else n_cpu
        self.result = manager.list()
        self.count = manager.Value('i', 0)
        # single shared lock: creating a fresh Lock() inside each worker
        # (as before) would synchronize nothing
        self.lock = manager.Lock()
        self.verbose = verbose

    def __segment__(self):
        # split the input into n_cpu chunks of (almost) equal size
        avg = len(self.array) / self.n_cpu
        last = 0.0
        while last < len(self.array):
            yield self.array[int(last):int(last + avg)]
            last += avg

    def __compute__(self, samples):
        for sample in samples:
            result = self.function(sample)
            with self.lock:
                self.count.value += 1
                self.result.append(result)
                if self.verbose:
                    self.__progress__(f'Processing sample: {sample}')

    def execute(self, array):
        self.array = array
        self.total = len(array)
        with closing(Pool(self.n_cpu, maxtasksperchild=1)) as pool:
            tasks = [
                pool.apply_async(self.__compute__, args=(part,))
                for part in self.__segment__()
            ]
            [task.get() for task in tasks]
        print('\nDone')

    def get_result(self):
        return list(self.result)

    def __progress__(self, status):
        # simple shared-state progress bar
        bar_len = 40
        filled_len = int(round(bar_len * self.count.value / float(self.total)))
        percents = round(100.0 * self.count.value / float(self.total), 1)
        bar = '█' * filled_len + '░' * (bar_len - filled_len)
        sys.stdout.write(f'\r|{bar}| {percents}% - {status} ')
        sys.stdout.flush()
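

def _square(x):  # module-level so worker processes can pickle it
    return x * x


if __name__ == '__main__':
    # Minimal usage sketch (hypothetical, not from this repo). Results arrive
    # in completion order, not input order, so callers should embed an
    # identifier in each result, as process_file does with its 'filename' key.
    mp = ListMultiprocessing(_square, n_cpu=2)
    mp.execute(list(range(10)))
    print(sorted(mp.get_result()))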
#
# Damiana
#
# shape-only parameter file (presumably shape.yaml, cf. params_shape above)
setting:
  label: 1
  interpolator: 'sitkBSpline' # This is an enumerated value, here None is not allowed
  weightingNorm: 'euclidean'
  normalize: False
# Image types to use: "Original" for unfiltered image, for possible filters, see documentation.
imageType:
  Original: {}
# Feature classes from which features must be calculated. If a feature class is not mentioned, no features are calculated
# for that class. Otherwise, the specified features are calculated, or, if none are specified, all are calculated (excluding redundant/deprecated features).
featureClass:
  shape:
#
# Damiana
#
# texture parameter file (presumably texture.yaml, cf. params_texture above)
setting:
  label: 1
  interpolator: 'sitkBSpline' # This is an enumerated value, here None is not allowed
  weightingNorm: 'euclidean'
  normalize: False
# Image types to use: "Original" for unfiltered image, for possible filters, see documentation.
imageType:
  Original: {}
# Feature classes from which features must be calculated. If a feature class is not mentioned, no features are calculated
# for that class. Otherwise, the specified features are calculated, or, if none are specified, all are calculated (excluding redundant/deprecated features).
featureClass:
  glcm:
    - 'JointEnergy'
    - 'Contrast'
    - 'Correlation'
    - 'Id'
    - 'SumSquares'
    - 'JointAverage'
    - 'DifferenceVariance'
    - 'Autocorrelation'
  glrlm:
    - 'ShortRunEmphasis'
    - 'LongRunEmphasis'
    - 'GrayLevelNonUniformity'
    - 'RunLengthNonUniformity'
    - 'RunPercentage'
    - 'LowGrayLevelRunEmphasis'
    - 'HighGrayLevelRunEmphasis'
    - 'ShortRunLowGrayLevelEmphasis'
    - 'ShortRunHighGrayLevelEmphasis'
    - 'LongRunLowGrayLevelEmphasis'
    - 'LongRunHighGrayLevelEmphasis'
    - 'GrayLevelVariance'
    - 'RunVariance'
  glszm:
    - 'SmallAreaEmphasis'
    - 'LargeAreaEmphasis'
    - 'GrayLevelNonUniformity'
    - 'SizeZoneNonUniformity'
    - 'ZonePercentage'
    - 'ZoneEntropy'
    - 'LowGrayLevelZoneEmphasis'
    - 'HighGrayLevelZoneEmphasis'
    - 'SmallAreaLowGrayLevelEmphasis'
    - 'SmallAreaHighGrayLevelEmphasis'
    - 'LargeAreaLowGrayLevelEmphasis'
    - 'LargeAreaHighGrayLevelEmphasis'
    - 'GrayLevelVariance'
    - 'ZoneVariance'
  ngtdm:
    - 'Coarseness'
    - 'Contrast'
    - 'Busyness'
    - 'Complexity'
    - 'Strength'