# synth_data_generator.py
import argparse
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pathlib import Path
import configparser

parser = argparse.ArgumentParser()
parser.add_argument('--datafolder', type=str, default='data', help='Data folder (default: %(default)s)')
parser.add_argument('--dataset', type=str, default='synthetic', help='Dataset name (default: %(default)s)')
parser.add_argument('--target', type=str, default='ST', help='Synthetic endpoint name (default: %(default)s)')
parser.add_argument('--n_samples', type=int, default=380, help='Number of generated samples (default: %(default)s)')
parser.add_argument('--weight', type=float, default=0.3, help='Proportion of the minority class (default: %(default)s)')
parser.add_argument('--n_splits_end', type=int, default=10, help='Index at which split generation stops, exclusive (default: %(default)s)')
parser.add_argument('--n_splits_start', type=int, default=0, help='Index at which split generation starts (default: %(default)s)')
parser.add_argument('--split_offset', type=int, default=50, help='Offset added to split indices when naming split directories (default: %(default)s)')
parser.add_argument('--base_seed', type=int, default=42, help='Base seed for train_test_split (default: %(default)s)')

args = parser.parse_args()
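
# Example invocation (paths are hypothetical; the values shown are this script's defaults):
#   python synth_data_generator.py --datafolder data --dataset synthetic --target ST \
#       --n_samples 380 --weight 0.3 --n_splits_start 0 --n_splits_end 10 --split_offset 50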

### synthetic data generation parameters
NS = args.n_samples # number of samples
W = args.weight # proportion of the minority class
### random splits parameters
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
SPLIT_OFFSET = args.split_offset # for compatibility with latest naming convention
base_seed = args.base_seed # base seed for train_test_split
### file I/O parameters
DATAFOLDER = args.datafolder
DATASET = args.dataset
TARGET = args.target # "ST" = synthetic target :)

X1, y1 = make_classification(n_samples=NS, n_features=100, n_informative=10, n_classes=2, 
                             weights=[W],
                             shuffle=False, # don't shuffle samples or features
                             flip_y=0, # don't randomly assign a fraction of samples to a class
                             class_sep=1.0, # default class separation
                             random_state=1)

X2, y2 = make_classification(n_samples=NS, n_features=50, n_informative=5, n_classes=2,
                             weights=[W],
                             shuffle=False, # don't shuffle samples or features
                             flip_y=0, # don't randomly assign a fraction of samples to a class
                             class_sep=1.2, # slightly easier task
                             random_state=2)

X3, y3 = make_classification(n_samples=NS, n_features=250, n_informative=25, n_classes=2,
                             weights=[W],
                             scale=10,
                             shuffle=False, # don't shuffle samples or features
                             flip_y=0, # don't randomly assign a fraction of samples to a class
                             class_sep=0.8, # slightly harder task
                             random_state=3)

# sanity check: with shuffle=False and identical weights and n_samples, all three
# layers share the same label ordering, so the label vectors must coincide
assert np.all(y1 == y2)
assert np.all(y2 == y3)

# generate TR/TS/TS2 splits
X_list = [X1, X2, X3]
y = pd.DataFrame(y1)
Path(f"{DATAFOLDER}/{DATASET}/{TARGET}/").mkdir(parents=True, exist_ok=True)
y.to_csv(f"{DATAFOLDER}/{DATASET}/{TARGET}/labels_{TARGET}.txt", sep='\t', header=False, index=False)

samples = [f'S{sample_id}' for sample_id in range(y.shape[0])]

for i, X in enumerate(X_list):
    df = pd.DataFrame(X,
                      columns=[f"LAYER{i}:f{feat_id}" for feat_id in range(X.shape[1])],
                      index=samples)
    
    total_dataset_len = df.shape[0]
    ts2_dataset_len = round(0.2 * total_dataset_len)  # TS2 target size: 20% of the full dataset
    
    # generate train/test splits
    for split_old in range(N_SPLITS_START, N_SPLITS_END):
        split_id = split_old + SPLIT_OFFSET
        path_new = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_id}'
        Path(path_new).mkdir(parents=True, exist_ok=True)
        
        df_tr, df_ts, y_tr, y_ts = train_test_split(df, y, test_size=0.3, random_state=(base_seed+split_old), stratify=y)

        # save TS
        df_ts.to_csv(f'{path_new}/layer{i}_ts.txt', sep='\t', index_label='Sample')
        y_ts.to_csv(f'{path_new}/labels_{TARGET}_ts.txt', sep='\t', index=False, header=False)
        
        # generate TR_new and TS2 by resplitting TR
        ts2_ratio = round(ts2_dataset_len / y_tr.shape[0], 2)  # fraction of TR that yields a TS2 of ~20% of the full dataset
        df_tr_new, df_ts2, y_tr_new, y_ts2 = train_test_split(df_tr, y_tr, test_size=ts2_ratio, random_state=0, stratify=y_tr)
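        # worked example with the defaults (NS=380): TS = ceil(0.3*380) = 114, TR = 266,
        # ts2_dataset_len = round(0.2*380) = 76, ts2_ratio = round(76/266, 2) = 0.29,
        # so TS2 holds roughly 76-78 samples, depending on train_test_split's rounding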
        
        # save TR_new, TS2
        df_tr_new.to_csv(f'{path_new}/layer{i}_tr.txt', sep='\t', index_label='Sample')
        df_ts2.to_csv(f'{path_new}/layer{i}_ts2.txt', sep='\t', index_label='Sample')
        y_tr_new.to_csv(f'{path_new}/labels_{TARGET}_tr.txt', sep='\t', index=False, header=False)
        y_ts2.to_csv(f'{path_new}/labels_{TARGET}_ts2.txt', sep='\t', index=False, header=False)
    
    # save unsplit data
    df.to_csv(f"{DATAFOLDER}/{DATASET}/{TARGET}/layer{i}.txt", sep='\t', index_label='Sample')

# write experiment parameters
config = configparser.ConfigParser()
config['INPUT/OUTPUT'] = {'Datafolder': DATAFOLDER,
                          'Dataset name': DATASET,
                          'Target name': TARGET}

config['DATA GENERATION'] = {'Number of samples': NS,
                             'Proportion of the minority class': W}

config['SPLITS'] = {'Number of splits': N_SPLITS_END - N_SPLITS_START,
                    'Base seed for split generation': base_seed}

with open(f'{DATAFOLDER}/{DATASET}/{TARGET}/config.ini', 'w') as configfile:
    config.write(configfile)
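
# Expected output layout under {DATAFOLDER}/{DATASET}/{TARGET}/ (a sketch for the defaults,
# where split directories run from split_offset to split_offset + n_splits - 1):
#   labels_ST.txt                        full label vector
#   layer0.txt, layer1.txt, layer2.txt   unsplit data matrices, one per layer
#   config.ini                           experiment parameters
#   50/ ... 59/                          one directory per split, each holding
#     layer{i}_tr.txt / _ts.txt / _ts2.txt and labels_ST_tr.txt / _ts.txt / _ts2.txt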