merge_datasets.py 1.87 KB
Newer Older
1
2
3
4
5
6
#%%
import os
import pandas as pd
import numpy as np
from functools import reduce
from pathlib import Path
Alessia Marcolini's avatar
Alessia Marcolini committed
7
import subprocess
8

9
from config import get_project_root
10
11

# %%
12
13
PROJECT_ROOT = get_project_root()

# Datasets to merge and, position by position, the bbox sub-dataset
# taken from each of them.
DATASETS = ['HN_val', 'HN_BZ']
BBOX_SUBDATASETS_NAMES = ['bbox_64', 'bbox_64']

# One source folder per (dataset, bbox sub-dataset) pair.
BBOX_SUBDATASETS_PATHS = [
    PROJECT_ROOT / 'data' / source_dataset / 'processed' / 'bbox' / source_bbox
    for source_dataset, source_bbox in zip(DATASETS, BBOX_SUBDATASETS_NAMES)
]

# Name of the bbox sub-dataset folder created for the merged dataset.
MERGED_BBOX_SUBDATASET_NAME = 'bbox_64'

print('Copying from: ', '\t'.join(map(str, BBOX_SUBDATASETS_PATHS)))
23
24
25
# %%
# Find the distinct name tokens across all datasets, in first-seen order.
# dict.fromkeys de-duplicates while keeping insertion order; a set would
# iterate in a hash-dependent order that can change between interpreter
# runs, which would make NEW_DATASET_NAME (and every path derived from it)
# non-deterministic.
DATASETS_NAME_UNION = list(
    dict.fromkeys(token for dataset in DATASETS for token in dataset.split('_'))
)

# rearrange name order (HN should be first)
# list.index raises ValueError if no dataset name contains the 'HN' token.
DATASETS_NAME_UNION.insert(0, DATASETS_NAME_UNION.pop(DATASETS_NAME_UNION.index('HN')))
NEW_DATASET_NAME = '_'.join(DATASETS_NAME_UNION)
Alessia Marcolini's avatar
Alessia Marcolini committed
32
33
34
35
36
37
38
39

# Destination folder for the merged bbox sub-dataset.
_merged_bbox_root = PROJECT_ROOT / 'data' / NEW_DATASET_NAME / 'processed' / 'bbox'
NEW_DATASET_PATH = _merged_bbox_root / MERGED_BBOX_SUBDATASET_NAME

# exist_ok=False makes this raise FileExistsError when the destination
# folder is already present, so the script stops here in that case.
os.makedirs(NEW_DATASET_PATH, exist_ok=False)

print('Copying into: ', NEW_DATASET_PATH)
44
#%% [markdown]
Alessia Marcolini's avatar
Alessia Marcolini committed
45
# # Merge and create a new clinical file
46
47
48
49
50
#%%
# Load each dataset's clinical table and record, row by row, which dataset
# the row came from.
clinicals = []
dataset_name = []

for dataset in DATASETS:
    clinical_csv = (
        PROJECT_ROOT / 'data' / dataset / 'processed' / f'clinical_{dataset}.csv'
    )
    clinical = pd.read_csv(clinical_csv)
    clinicals.append(clinical)
    # One label per row of this table, in the same order the rows are
    # concatenated below.
    dataset_name += list(np.repeat(dataset, len(clinical)))

# join='inner' keeps only the columns present in every clinical table.
merged_clinical = pd.concat(clinicals, join='inner')
merged_clinical['dataset'] = dataset_name

# Write the merged table under the new dataset's 'processed' folder.
merged_clinical_csv = (
    PROJECT_ROOT
    / 'data'
    / NEW_DATASET_NAME
    / 'processed'
    / f'clinical_{NEW_DATASET_NAME}.csv'
)
merged_clinical.to_csv(merged_clinical_csv, index=False)
Alessia Marcolini's avatar
Alessia Marcolini committed
69
70
71
72
73
#%% [markdown]
# # Link files from respective folders
# Symlink every entry of each source bbox folder into the merged folder.
# The links are created directly instead of through a shell command line, so
# paths containing spaces or shell metacharacters are handled correctly, an
# empty source folder does not produce a dangling link literally named '*',
# and any failure raises instead of being silently ignored.
for old_path in BBOX_SUBDATASETS_PATHS:
    # sorted() only makes the creation order reproducible.
    for entry in sorted(Path(old_path).iterdir()):
        # A shell '*' glob does not match hidden entries; skip them as well.
        if entry.name.startswith('.'):
            continue
        # os.symlink raises FileExistsError if two source folders contain an
        # entry with the same name, rather than dropping one of them.
        os.symlink(entry, Path(NEW_DATASET_PATH) / entry.name)

74
# %%