Commit 0ae7fe15 authored by Marco Chierici
Browse files

Synthetic data generator

parent 3c6ba2f4
import argparse
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from functools import reduce
from pathlib import Path
import configparser
# Command-line interface. Every option has a default, so the script can be
# run without any argument.
parser = argparse.ArgumentParser(description='Synthetic data generator')
parser.add_argument('--datafolder', type=str, default='data', help='Data folder (default: %(default)s)')
parser.add_argument('--dataset', type=str, default='synthetic', help='Dataset name (default: %(default)s)')
parser.add_argument('--target', type=str, default='ST', help='Synthetic endpoint name (default: %(default)s)')
parser.add_argument('--n_samples', type=int, default=380, help='Number of generated samples (default: %(default)s)')
parser.add_argument('--weight', type=float, default=0.3, help='Proportion of the minority class (default: %(default)s)')
# Splits are generated for indices in range(n_splits_start, n_splits_end);
# each split is saved in a folder named <split index + split_offset>.
parser.add_argument('--n_splits_end', type=int, default=10, help='Index of the last split to generate, exclusive (default: %(default)s)')
parser.add_argument('--n_splits_start', type=int, default=0, help='Index of the first split to generate (default: %(default)s)')
parser.add_argument('--split_offset', type=int, default=50, help='Offset added to the split index to name the split folder (default: %(default)s)')
parser.add_argument('--base_seed', type=int, default=42, help='Base seed for train_test_split (default: %(default)s)')
args = parser.parse_args()
### synthetic data generation parameters
NS = args.n_samples  # number of samples
W = args.weight  # proportion of the minority class
### random splits parameters
N_SPLITS_START = args.n_splits_start  # first split index (inclusive)
N_SPLITS_END = args.n_splits_end  # last split index (exclusive)
SPLIT_OFFSET = args.split_offset  # for compatibility with latest naming convention
base_seed = args.base_seed  # base seed for train_test_split
### file I/O parameters
DATAFOLDER = args.datafolder
DATASET = args.dataset
# The endpoint name comes from --target (default "ST" = synthetic target :)
TARGET = args.target
def _make_layer(n_features, n_informative, class_sep):
    """Generate one synthetic data layer for a binary classification task.

    All layers share the number of samples (NS) and the class proportions
    (W), and are generated without shuffling and without label noise, so
    that sample i is meant to carry the same label in every layer.

    Parameters
    ----------
    n_features : int
        Total number of features in the layer.
    n_informative : int
        Number of informative features.
    class_sep : float
        Class separation; larger values make the task easier.

    Returns
    -------
    tuple
        The (X, y) pair returned by ``make_classification``.
    """
    # NOTE(review): the class order in `weights` is an assumption (class 1 as
    # the minority class) -- confirm against the original experiments.
    # NOTE(review): no random_state is set, so feature values differ between
    # runs -- confirm whether a fixed seed is wanted here.
    return make_classification(
        n_samples=NS,
        n_features=n_features,
        n_informative=n_informative,
        n_classes=2,
        weights=[1 - W, W],  # W is the proportion of the minority class
        shuffle=False,  # don't shuffle samples or features
        flip_y=0,  # don't randomly assign a fraction of samples to a class
        class_sep=class_sep,
    )


X1, y1 = _make_layer(n_features=100, n_informative=10, class_sep=1.0)  # default class separation
X2, y2 = _make_layer(n_features=50, n_informative=5, class_sep=1.2)  # slightly easier task
X3, y3 = _make_layer(n_features=250, n_informative=25, class_sep=0.8)  # slightly harder task
# check that the labels are the same for the 3 datasets; the three layers are
# saved with a single label file, so a mismatch must stop the script instead
# of being silently ignored
if not (np.array_equal(y1, y2) and np.array_equal(y2, y3)):
    raise ValueError("The labels of the three synthetic layers are not identical")
# generate TR/TS/TS2 splits
X_list = [X1, X2, X3]
y = pd.DataFrame(y1)
Path(f"{DATAFOLDER}/{DATASET}/{TARGET}/").mkdir(parents=True, exist_ok=True)
y.to_csv(f"{DATAFOLDER}/{DATASET}/{TARGET}/labels_{TARGET}.txt", sep='\t', header=False, index=False)
# sample identifiers (S0, S1, ...) used as row index of every layer
samples = [f'S{sample_id}' for sample_id in range(y.shape[0])]
for i, X in enumerate(X_list):
    df = pd.DataFrame(X,
                      columns=[f"LAYER{i}:f{feat_id}" for feat_id in range(X.shape[1])],
                      index=samples)
    total_dataset_len = df.shape[0]
    # TS2 is sized as 20% of the whole dataset (not of the training set)
    ts2_dataset_len = round(0.2 * total_dataset_len)
    # generate train/test splits
    for split_old in range(N_SPLITS_START, N_SPLITS_END):
        # the split folder is named after the offset split index
        split_id = split_old + SPLIT_OFFSET
        path_new = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_id}'
        Path(path_new).mkdir(parents=True, exist_ok=True)
        # the seed depends only on the split index, not on the layer
        df_tr, df_ts, y_tr, y_ts = train_test_split(df, y, test_size=0.3, random_state=(base_seed+split_old), stratify=y)
        # save TS
        df_ts.to_csv(f'{path_new}/layer{i}_ts.txt', sep='\t', index_label='Sample')
        y_ts.to_csv(f'{path_new}/labels_{TARGET}_ts.txt', sep='\t', index=False, header=False)
        # generate TR_new and TS2 by resplitting TR; the ratio is rounded to
        # two decimals, so the TS2 size may differ slightly from ts2_dataset_len
        ts2_ratio = round(ts2_dataset_len / y_tr.shape[0], 2)
        df_tr_new, df_ts2, y_tr_new, y_ts2 = train_test_split(df_tr, y_tr, test_size=ts2_ratio, random_state=0, stratify=y_tr)
        # save TR_new, TS2
        df_tr_new.to_csv(f'{path_new}/layer{i}_tr.txt', sep='\t', index_label='Sample')
        df_ts2.to_csv(f'{path_new}/layer{i}_ts2.txt', sep='\t', index_label='Sample')
        y_tr_new.to_csv(f'{path_new}/labels_{TARGET}_tr.txt', sep='\t', index=False, header=False)
        y_ts2.to_csv(f'{path_new}/labels_{TARGET}_ts2.txt', sep='\t', index=False, header=False)
    # save unsplit data
    df.to_csv(f"{DATAFOLDER}/{DATASET}/{TARGET}/layer{i}.txt", sep='\t', index_label='Sample')
# write experiment parameters to an INI file next to the generated data, so
# that the settings used for a dataset can be looked up later
config = configparser.ConfigParser()
config['INPUT/OUTPUT'] = {'Datafolder': DATAFOLDER,
                          'Dataset name': DATASET,
                          'Target name': TARGET}
config['DATA GENERATION'] = {'Number of samples': NS,
                             'Proportion of the minority class': W}
config['SPLITS'] = {'Number of splits': N_SPLITS_END - N_SPLITS_START,
                    'Base seed for split generation': base_seed}
with open(f'{DATAFOLDER}/{DATASET}/{TARGET}/config.ini', 'w') as configfile:
    config.write(configfile)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment