Commit 6be97062 authored by Valerio Maggio
Browse files

renamed folder for SEQC dataset and updated loading script

parent 8df08db5
......@@ -14,7 +14,7 @@ from sklearn.model_selection import train_test_split
# General Settings
# ========================================
TARGET_NB = 'TARGET_NB'
SEQC_NB = 'CAMDA'
SEQC_NB = 'SEQC_NB'
OS_LAB = 'OS'
EFS_LAB = 'EFS'
......@@ -33,10 +33,10 @@ DATA_FOLDER_NAME = 'data'
# ========================================
# SEQC-NB data Settings
# ========================================
SEQC2_TRAINING_DATA_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'CAMDA', 'nsFilterDflt_v0_MAV-G_498_ALL_tr.txt')
SEQC2_TRAINING_LABS_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'CAMDA', 'labels_498_ALL_tr.txt')
SEQC2_TEST_DATA_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'CAMDA', 'nsFilterDflt_v0_MAV-G_498_ALL_ts.txt')
SEQC2_TEST_LABS_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'CAMDA', 'labels_498_ALL_ts.txt')
SEQC2_TRAINING_DATA_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'SEQC', 'nsFilterDflt_v0_MAV-G_498_ALL_tr.txt')
SEQC2_TRAINING_LABS_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'SEQC', 'labels_498_ALL_tr.txt')
SEQC2_TEST_DATA_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'SEQC', 'nsFilterDflt_v0_MAV-G_498_ALL_ts.txt')
SEQC2_TEST_LABS_FILEPATH = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'SEQC', 'labels_498_ALL_ts.txt')
# ========================================
# TARGET-NB data Settings
......@@ -77,19 +77,19 @@ def _filter_hr_only(bunch):
bunch.test_targets[tgt] = bunch.test_targets[tgt][bunch.test_targets[HR_LAB] == 1]
def load_nb_camda(hr_only=False, dataset_name='CAMDA_NB',
training_data_fpath=SEQC2_TRAINING_DATA_FILEPATH,
training_labels_fpath=SEQC2_TRAINING_LABS_FILEPATH,
test_data_fpath=SEQC2_TEST_DATA_FILEPATH,
test_labels_fpath=SEQC2_TEST_LABS_FILEPATH):
def load_nb_seqc(hr_only=False, dataset_name=SEQC_NB,
training_data_fpath=SEQC2_TRAINING_DATA_FILEPATH,
training_labels_fpath=SEQC2_TRAINING_LABS_FILEPATH,
test_data_fpath=SEQC2_TEST_DATA_FILEPATH,
test_labels_fpath=SEQC2_TEST_LABS_FILEPATH):
"""
Load the NB Dataset CAMDA as provided by the SEQC2 Initiative
Load the NB Dataset as provided by the SEQC2 Initiative
Parameters
----------
hr_only: bool (default False)
Whether or not filter out HR-Only patients in the dataset
dataset_name: str (default 'CAMBDA_NB')
dataset_name: str (default 'SEQC_NB')
Unique name of the dataset (used for results reports)
training_data_fpath: str
Path to the training dataset for SEQC-NB Dataset
......@@ -350,8 +350,8 @@ def generate_dataset_partitions(n_partitions=50, target_dataset=TARGET_NB,
data = pd.read_csv(DATA_FILE, sep='\t', header=0, index_col=0)
else: # so far, only two datasets are supported
BASE_DS_FOLDER = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'CAMDA')
load_dataset_fn = load_nb_camda
BASE_DS_FOLDER = os.path.join(BASE_DIR, DATA_FOLDER_NAME, 'SEQC')
load_dataset_fn = load_nb_seqc
# Data Stack
training_data = pd.read_csv(SEQC2_TRAINING_DATA_FILEPATH, sep='\t',
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment