Commit 78146831 authored by Alessia Marcolini
Browse files

Better management of project root directory

parent fb9fc0cb
......@@ -6,19 +6,20 @@ import pandas as pd
from pathlib import Path
import sys
PATH = os.path.join(os.path.abspath(os.path.curdir), '..')
sys.path.append(PATH)
from tqdm import tqdm
import dicom_utils.dicom_utils as du
import dicom_utils.dicom_utils.visualize as viz
import gc
from utils import sitk_to_numpy, filter_outbounds
from config import get_project_root
# os.chdir(PATH)
#%%
PROJECT_ROOT = get_project_root()
DATASET_NAME = 'HN_BZ'
PROJECT_DATA_PATH = Path('data') / DATASET_NAME
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
CLINICAL_DATA_FILENAME_CLEAN = f'clinical_{DATASET_NAME}.csv'
......@@ -118,4 +119,3 @@ for i, row in tqdm(clinical.iterrows()):
#%%
print(errors)
......@@ -6,40 +6,47 @@ from functools import reduce
from pathlib import Path
import shutil
from config import get_project_root
# %%
PROJECT_ROOT = get_project_root()
DATASETS = ['HN_val', 'HN_BZ']
# %%
# find union names in datasets
DATASETS_NAME_UNION = list(
reduce(
lambda x, y: x.union(y),
[set(DATASETS[i].split('_')) for i in range(len(DATASETS))],
)
reduce(lambda x, y: x.union(y), [set(dataset.split('_')) for dataset in DATASETS],)
)
# rearrange name order (HN should be first)
DATASETS_NAME_UNION.insert(0, DATASETS_NAME_UNION.pop(DATASETS_NAME_UNION.index('HN')))
NEW_DATASET_NAME = '_'.join(DATASETS_NAME_UNION)
os.makedirs(f'data/{NEW_DATASET_NAME}/processed/bbox/', exist_ok=True)
os.makedirs(
PROJECT_ROOT / 'data' / NEW_DATASET_NAME / 'processed' / 'bbox', exist_ok=False
)
#%% [markdown]
# Merge and create a new file clinical
#%%
clinicals = []
n_patients = []
dataset_name = []
for dataset in DATASETS:
clinical = pd.read_csv(f'data/{dataset}/processed/clinical_{dataset}.csv')
n_patients.append(len(clinical.index))
dataset_name.append(list(np.repeat(dataset, len(clinical))))
clinical = pd.read_csv(
PROJECT_ROOT / 'data' / dataset / 'processed' / f'clinical_{dataset}.csv'
)
dataset_name.extend(list(np.repeat(dataset, len(clinical))))
clinicals.append(clinical)
merged_clinical = pd.concat([i for i in clinicals], join='inner')
merged_clinical['dataset'] = [item for sublist in dataset_name for item in sublist]
merged_clinical['dataset'] = dataset_name
merged_clinical.to_csv(
f'data/{NEW_DATASET_NAME}/processed/clinical_{NEW_DATASET_NAME}.csv', index=False
PROJECT_ROOT
/ 'data'
/ NEW_DATASET_NAME
/ 'processed'
/ f'clinical_{NEW_DATASET_NAME}.csv',
index=False,
)
# %%
......@@ -13,10 +13,12 @@ import itertools
from tqdm import tqdm
from utils import remove_na, remove_constant_cols
# os.chdir('..')
from config import get_project_root
# %%
PROJECT_ROOT = get_project_root()
DATASET_NAME = 'HN_val'
PROJECT_DATA_PATH = Path('data') / DATASET_NAME
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
......
......@@ -29,9 +29,13 @@ import sys
import shutil
from tqdm import tqdm
from config import get_project_root
# %%
PROJECT_ROOT = get_project_root()
DATASET_NAME = 'HN_BZ'
PROJECT_DATA_PATH = Path('data') / DATASET_NAME
PROJECT_DATA_PATH = PROJECT_ROOT / 'data' / DATASET_NAME
RAW_DATA_PATH = PROJECT_DATA_PATH / 'raw'
PROCESSED_DATA_PATH = PROJECT_DATA_PATH / 'processed'
PROCESSED_DCM_PATH = PROCESSED_DATA_PATH / 'dcm'
......
from pathlib import Path
def get_project_root() -> Path:
    '''
    Return the absolute path of the project root folder.

    The project root is taken to be the directory that contains this file.

    Returns:
        Path: absolute, normalised path of the directory holding this module.
    '''
    # `__file__` is not guaranteed to be absolute or normalised: depending on
    # how the module was located it can be relative or contain `..` segments.
    # Resolving first makes the result match the documented contract and keeps
    # it stable even if the working directory changes later.
    return Path(__file__).resolve().parent
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment