#%%
# Merge several processed bbox datasets into one new dataset:
#   1. derive the merged dataset name from the source dataset names,
#   2. concatenate the per-dataset clinical CSV files into a single CSV,
#   3. symlink every bbox file of every source dataset into the new folder.
import os
import subprocess
from functools import reduce
from pathlib import Path

import numpy as np
import pandas as pd

from config import get_project_root

# %%
PROJECT_ROOT = get_project_root()
DATASETS = ['HN_val', 'HN_BZ']
# One bbox sub-folder name per entry in DATASETS (paired positionally by zip).
BBOX_SUBDATASETS_NAMES = ['bbox_64', 'bbox_64']
BBOX_SUBDATASETS_PATHS = [
    PROJECT_ROOT / 'data' / dataset / 'processed' / 'bbox' / bbox_name
    for dataset, bbox_name in zip(DATASETS, BBOX_SUBDATASETS_NAMES)
]
MERGED_BBOX_SUBDATASET_NAME = 'bbox_64'
print('Copying from: ', '\t'.join(str(path) for path in BBOX_SUBDATASETS_PATHS))

# %%
# Collect the unique '_'-separated name tokens of all datasets.
# dict.fromkeys de-duplicates while keeping first-seen order; a set would make
# the token order (and therefore the output folder name) depend on per-process
# string hash randomisation.
DATASETS_NAME_UNION = list(
    dict.fromkeys(token for dataset in DATASETS for token in dataset.split('_'))
)
# rearrange name order (HN should be first); raises ValueError if 'HN' is absent
DATASETS_NAME_UNION.insert(0, DATASETS_NAME_UNION.pop(DATASETS_NAME_UNION.index('HN')))
NEW_DATASET_NAME = '_'.join(DATASETS_NAME_UNION)
NEW_DATASET_PATH = (
    PROJECT_ROOT
    / 'data'
    / NEW_DATASET_NAME
    / 'processed'
    / 'bbox'
    / MERGED_BBOX_SUBDATASET_NAME
)

# Fail before creating anything if a source folder is missing; otherwise the
# exist_ok=False below would block a re-run after the problem is fixed.
missing_sources = [
    str(path) for path in BBOX_SUBDATASETS_PATHS if not Path(path).is_dir()
]
if missing_sources:
    raise FileNotFoundError(
        'Source bbox folder(s) not found: ' + ', '.join(missing_sources)
    )

# exist_ok=False: refuse to merge into an already existing dataset folder.
os.makedirs(NEW_DATASET_PATH, exist_ok=False)
print('Copying into: ', str(NEW_DATASET_PATH))

#%% [markdown]
# # Merge and create a new clinical file

#%%
clinicals = []
dataset_name = []  # one source-dataset label per concatenated row
for dataset in DATASETS:
    clinical = pd.read_csv(
        PROJECT_ROOT / 'data' / dataset / 'processed' / f'clinical_{dataset}.csv'
    )
    dataset_name.extend([dataset] * len(clinical))
    clinicals.append(clinical)

# join='inner' keeps only the columns present in every clinical file.
merged_clinical = pd.concat(clinicals, join='inner')
# Rows are concatenated in DATASETS order, so the labels line up positionally.
merged_clinical['dataset'] = dataset_name
merged_clinical.to_csv(
    PROJECT_ROOT
    / 'data'
    / NEW_DATASET_NAME
    / 'processed'
    / f'clinical_{NEW_DATASET_NAME}.csv',
    index=False,
)

#%% [markdown]
# # Link files from respective folders

#%%
for old_path in BBOX_SUBDATASETS_PATHS:
    for source in sorted(Path(old_path).iterdir()):
        # Skip dotfiles: the shell glob `*` used previously never matched them.
        if source.name.startswith('.'):
            continue
        link_path = Path(NEW_DATASET_PATH) / source.name
        try:
            os.symlink(source, link_path)
        except FileExistsError:
            # Same entry name in more than one source dataset: the first link
            # wins; report the clash and carry on with the remaining files.
            print(f'Skipping {source}: {link_path} already exists')

# %%