Commit 904af62d authored by Valerio Maggio's avatar Valerio Maggio

Set up DAP runners for the entire ML/DL experimental pipeline

parent aa0ef8cc
"""
CDRP-A Deep AutoEncoder for NB Dataset
"""
import os
from dap.deep_learning_dap import DeepLearningDAPRegr
from dap.dap import DAPRegr
from dataset import load_nb_target, HR_LAB, load_nb_camda
from keras.backend import floatx
from keras.engine import Input, Model
from keras.layers import Dense
from keras.initializers import glorot_normal
from sklearn.metrics import mean_squared_error
from collections import Counter
import numpy as np
import argparse
class CDRP_HR_AutoEncoder(DeepLearningDAPRegr):
"""
CDRP-A DAP Runner
"""
DAP_REFERENCE_METRIC = DAPRegr.MSE_CI
REF_STEP_METRIC = DAPRegr.MSE
BASE_METRICS = [DAPRegr.MSE]
CI_METRICS = [DAPRegr.MSE_CI]
def __init__(self, experiment):
"""
Parameters
----------
experiment: sklearn.datasets.base.Bunch
            Bunch object embedding the dataset (meta)data, as returned by the
            load_nb_target function (see dataset.load_nb_target)
"""
super(CDRP_HR_AutoEncoder, self).__init__(experiment=experiment)
self.experiment_data.nb_classes = experiment.nb_classes_targets[HR_LAB]
self._do_serialisation = False # Serialization does not work with our Keras layer
# ==== Abstract Methods Implementation ====
@property
def ml_model_name(self):
return 'CDRPA'
@property
def results_folder(self):
"""
Compose path to the folder where reports and metrics will be saved.
"""
base_foldername = 'autoencoder'
folder_name = '_'.join([self.ml_model_name,
'epochs_{}'.format(self.learning_epochs),
'batch_{}'.format(self.batch_size),
self.optimizer_name,
self.feature_scaling_name, self.feature_ranking_name,
str(self.cv_n), str(self.cv_k)])
ds_name = self.experiment_data.dataset_name \
if 'dataset_name' in self.experiment_data.keys() else self.experiment_data.DESCR
output_folder_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
'results', ds_name, base_foldername, folder_name)
os.makedirs(output_folder_path, exist_ok=True)
return output_folder_path
def _prepare_metrics_array(self):
""""""
metrics_shape = (self.iteration_steps, self.feature_steps)
metrics = {
            self.RANKINGS: np.empty((self.iteration_steps, self.experiment_data.nb_features), dtype=int),
self.MSE: np.empty(metrics_shape),
# Confidence Interval Metrics-specific
            # all entries are assumed to have the following structure:
# (mean, lower_bound, upper_bound)
self.MSE_CI: np.empty((self.feature_steps, 3)),
# Test dictionary
self.TEST_SET: dict()
}
return metrics
def _set_training_data(self):
"""Default implementation for classic and quite standard DAP implementation.
More complex implementation require overriding this method.
"""
self.X = self.experiment_data.training_data
self.y = self.experiment_data.targets[HR_LAB]
def _set_test_data(self):
self.X_test = self.experiment_data.test_data
self.y_test = self.experiment_data.test_targets[HR_LAB]
def _prepare_data(self, X, training_data=True, learning_phase=True):
""""""
if learning_phase:
if training_data:
self.X_ae_train = X
else:
self.X_ae_validation = X
else:
self.X_ae_test = X
return X
def _prepare_targets(self, y, training_labels=True, learning_phase=True):
"""
"""
if learning_phase:
if training_labels:
return self.X_ae_train
else:
return self.X_ae_validation
else:
return self.X_ae_test
def _predict(self, model, X_validation, y_validation=None, **kwargs):
"""
"""
decoding_prediction = model.predict(X_validation)
return (decoding_prediction, None)
def _compute_step_metrics(self, validation_indices, y_true_validation,
predictions, **extra_metrics):
"""
"""
# Process predictions
predicted_values, _ = predictions
# Compute Base Step Metrics
iteration_step, feature_step = self._iteration_step_nb, self._feature_step_nb
self.metrics[self.MSE][iteration_step, feature_step] = mean_squared_error(self.X_ae_validation,
predicted_values)
def _compute_test_metrics(self, y_true_test, predictions, **extra_metrics):
"""
"""
# Process predictions
predicted_values, _ = predictions # prediction_probabilities are not used for base metrics.
self.metrics[self.TEST_SET][self.MSE] = mean_squared_error(self.X_ae_test, predicted_values)
def _save_all_metrics_to_file(self, base_output_folder_path, feature_steps, feature_names, sample_names):
blacklist = [self.RANKINGS, self.PREDS]
for key in self.BASE_METRICS:
if key not in blacklist:
metric_values = self.metrics[key]
self._save_metric_to_file(os.path.join(base_output_folder_path, 'metric_{}.txt'.format(key)),
metric_values, feature_steps)
# Save Ranking
self._save_metric_to_file(os.path.join(base_output_folder_path, 'metric_{}.txt'.format(self.RANKINGS)),
self.metrics[self.RANKINGS], feature_names)
# Save Confidence Intervals Metrics
        # NOTE: all CI metrics are saved together into a single file.
ci_metric_values = list()
metric_names = list() # collect names to become indices of resulting pd.DataFrame
for metric_key in self.CI_METRICS:
metric_values = self.metrics[metric_key]
nb_features_steps = metric_values.shape[0]
metric_names.extend(['{}-{}'.format(metric_key, s) for s in range(nb_features_steps)])
ci_metric_values.append(metric_values)
ci_metric_values = np.vstack(ci_metric_values)
self._save_metric_to_file(os.path.join(base_output_folder_path, 'CI_All_metrics.txt'),
ci_metric_values, columns=['Mean', 'Lower', 'Upper'],
index=metric_names)
def _save_test_metrics_to_file(self, base_output_folder_path):
"""
        Save all test-set metrics to file.
Parameters
----------
base_output_folder_path: str
Path to the output folder where files will be saved.
"""
column_names = []
test_metrics = np.zeros((1, len(self.metrics[self.TEST_SET].keys())))
for i, item in enumerate(self.metrics[self.TEST_SET].items()):
key, value = item
test_metrics[0, i] = value
column_names.append(key)
self._save_metric_to_file(os.path.join(base_output_folder_path, 'metric_test.txt'),
test_metrics, column_names)
def _build_network(self):
"""
Build a Fully-Connected AutoEncoder Network.
Returns
-------
        keras.models.Model
            AutoEncoder model
"""
# Parameters for Input & Output Layers
nb_features = self._nb_features # current nb of features in the feature step!
input_layer = Input(shape=(nb_features,), name="data", dtype=floatx())
x = Dense(128, activation='tanh', name="en_fc_128_1",
kernel_initializer=glorot_normal(seed=self._iteration_step_nb))(input_layer)
x = Dense(128, activation='tanh', name="en_fc_128_2",
kernel_initializer=glorot_normal(seed=self._iteration_step_nb))(x)
x = Dense(64, activation='linear', name="bn_64",
kernel_initializer=glorot_normal(seed=self._iteration_step_nb))(x)
x = Dense(128, activation='tanh', name="dec_fc_128_1",
kernel_initializer=glorot_normal(seed=self._iteration_step_nb))(x)
x = Dense(128, activation='tanh', name="dec_fc_128_2",
kernel_initializer=glorot_normal(seed=self._iteration_step_nb))(x)
out = Dense(nb_features, activation='linear', name="out",
kernel_initializer=glorot_normal(seed=self._iteration_step_nb))(x)
model = Model(inputs=input_layer, outputs=out)
return model
def _fit(self, model, X_train, y_train, X_validation=None, y_validation=None):
""""""
def get_class_weights(y):
if y.ndim == 2:
y = np.argmax(y, axis=1)
counter = Counter(y)
majority = max(counter.values())
return {cls: float(majority / count) for cls, count in counter.items()}
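        # e.g. y = [0, 0, 0, 1] -> Counter({0: 3, 1: 1}), majority = 3
        # -> weights {0: 1.0, 1: 3.0}: minority samples weigh 3x in the loss.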
cls_weights = get_class_weights(y_train)
self.extra_fit_params['class_weight'] = cls_weights
model, extra_metrics = super(CDRP_HR_AutoEncoder, self)._fit(model, X_train, y_train, X_validation,
y_validation)
# Load Best Model
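        # (this assumes the parent _fit checkpoints the best weights into
        # results_folder under this naming scheme, e.g. via a ModelCheckpoint
        # callback; rebuilding the network and loading the weights restores
        # the best-performing epoch)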
model_filename = '{}_{}_model.hdf5'.format(self._iteration_step_nb, self._nb_features)
base_output_folder = self.results_folder
weights_filepath = os.path.join(base_output_folder, model_filename)
model = self._build_network()
model.load_weights(weights_filepath)
return model, extra_metrics
def fold_teardown(self):
self.clear_network_graph()
def main():
# =======================
# Parse Arguments from CL
# =======================
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dataset", help="Target NB dataset to analyse",
type=str, choices=['TARGET', 'SEQC'])
args = parser.parse_args()
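    # Example invocation (a sketch; the actual script filename may differ):
    #   python cdrp_autoencoder.py --dataset TARGET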
if args.dataset == 'TARGET':
load_ds_fn = load_nb_target
else:
load_ds_fn = load_nb_camda
# Load Dataset
# ============
dataset = load_ds_fn()
print('RUNNING ON DATASET {}'.format(dataset.dataset_name.upper()))
# ============
# DAP Settings
# ============
from dap import settings as dap_settings
dap_settings.to_categorical = False
dap_settings.feature_ranges = [2, 5, 10, 15, 20, 25, 50, 75, 100]
from dap.scaling import MinMaxScaler
dap_settings.feature_scaler = MinMaxScaler(feature_range=(-1, 1), copy=False)
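    # the (-1, 1) range presumably matches the tanh activations used in the
    # encoder/decoder layers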
# ===============
# DAP DL Settings
# ===============
from dap import deep_learning_settings as dl_dap_settings
dl_dap_settings.epochs = 2000
dl_dap_settings.batch_size = 64
    # dl_dap_settings.fit_verbose = 1
dl_dap_settings.metrics = []
dl_dap_settings.callbacks = [] # no callbacks
dl_dap_settings.loss = 'mse'
dl_dap_settings.shuffle = True
from keras.optimizers import RMSprop
dl_dap_settings.optimizer = RMSprop()
dap = CDRP_HR_AutoEncoder(dataset)
trained_model = dap.run(verbose=True)
dap.predict_on_test(trained_model)
print("Computation completed!")
if __name__ == '__main__':
main()
"""
Multi-Layer Perceptron DAP Runner on NB Dataset
"""
import os
from dap.deep_learning_dap import DeepLearningDAP
from dataset import load_nb_target, load_nb_camda
from dataset import EFS_LAB, OS_LAB, HR_LAB
import argparse
from keras.backend import floatx
from keras.engine import Input, Model
from keras.layers import Dense
from keras.initializers import glorot_normal
from collections import Counter
import numpy as np
class MultiLayerPerceptronRunner(DeepLearningDAP):
"""
MLP DAP Runner
"""
def __init__(self, experiment, target_prediction):
"""
Parameters
----------
experiment: sklearn.datasets.base.Bunch
            Bunch object embedding the dataset (meta)data, as returned by the
            load_nb_target function (see dataset.load_nb_target)
target_prediction: str
            The target classification end-point (one of HR, EFS, OS)
"""
# One of: 'HR', 'EFS', 'OS'
self._target_prediction_name = target_prediction
super(MultiLayerPerceptronRunner, self).__init__(experiment=experiment)
self.experiment_data.nb_classes = experiment.nb_classes_targets[self._target_prediction_name]
self._do_serialisation = False # Serialization does not work with our Keras layer
# ==== Abstract Methods Implementation ====
@property
def ml_model_name(self):
return 'MLP'
@property
def results_folder(self):
"""
Compose path to the folder where reports and metrics will be saved.
"""
base_foldername = self._target_prediction_name.lower()
folder_name = '_'.join([self.ml_model_name,
'epochs_{}'.format(self.learning_epochs),
'batch_{}'.format(self.batch_size),
self.optimizer_name,
self.feature_scaling_name, self.feature_ranking_name,
str(self.cv_n), str(self.cv_k)])
ds_name = self.experiment_data.dataset_name \
if 'dataset_name' in self.experiment_data.keys() else self.experiment_data.DESCR
output_folder_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
'results', ds_name, base_foldername, folder_name)
os.makedirs(output_folder_path, exist_ok=True)
return output_folder_path
# Override DAP Methods
# --------------------
def _set_training_data(self):
"""Default implementation for classic and quite standard DAP implementation.
More complex implementation require overriding this method.
"""
self.X = self.experiment_data.training_data
self.y = self.experiment_data.targets[self._target_prediction_name]
def _set_test_data(self):
self.X_test = self.experiment_data.test_data
self.y_test = self.experiment_data.test_targets[self._target_prediction_name]
def _build_network(self):
"""
        Build a single-hidden-layer MLP classification network
        (one sigmoid hidden layer followed by a softmax output).
Returns
-------
keras.models.Model
MLP model
"""
# Parameters for Input Layers
nb_features = self._nb_features
# Parameter for output layer
nb_classes = self.experiment_data.nb_classes
input_layer = Input(shape=(nb_features,), name="data", dtype=floatx())
hidden = Dense(128, activation='sigmoid', name='hidden',
kernel_initializer=glorot_normal(seed=self._iteration_step_nb))(input_layer)
output = Dense(units=nb_classes, kernel_initializer=glorot_normal(seed=self._iteration_step_nb),
activation="softmax", name='prediction')(hidden)
model = Model(inputs=input_layer, outputs=output)
return model
def _fit(self, model, X_train, y_train, X_validation=None, y_validation=None):
""""""
def get_class_weights(y):
if y.ndim == 2:
y = np.argmax(y, axis=1)
counter = Counter(y)
majority = max(counter.values())
return {cls: float(majority / count) for cls, count in counter.items()}
cls_weights = get_class_weights(y_train)
self.extra_fit_params['class_weight'] = cls_weights
return super(MultiLayerPerceptronRunner, self)._fit(model, X_train, y_train,
X_validation, y_validation)
def main():
# =======================
# Parse Arguments from CL
# =======================
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dataset", help="Target NB dataset to analyse",
type=str, choices=['TARGET', 'SEQC'])
parser.add_argument("-hr", "--hronly", help="Analyse High risk cohort only",
type=bool, default=False)
parser.add_argument("-t", "--target", help="Target Endpoint prediction",
type=str, choices=(EFS_LAB, OS_LAB, HR_LAB))
args = parser.parse_args()
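    # Example invocation (a sketch; script filename assumed, target labels as
    # documented in the class, i.e. one of HR, EFS, OS):
    #   python mlp_runner.py --dataset SEQC --target HR --hronly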
if args.dataset == 'TARGET':
load_ds_fn = load_nb_target
else:
load_ds_fn = load_nb_camda
# Load Dataset
# ============
dataset = load_ds_fn(hr_only=args.hronly)
print('RUNNING ON DATASET {} (HR-ONLY: {})'.format(dataset.dataset_name.upper(),
args.hronly))
# ============
# DAP Settings
# ============
from dap import settings as dap_settings
dap_settings.to_categorical = False
dap_settings.feature_ranges = [2, 5, 10, 15, 20, 25, 50, 75, 100]
from dap.scaling import MinMaxScaler
dap_settings.feature_scaler = MinMaxScaler(feature_range=(-1, 1), copy=False)
# ===============
# DAP DL Settings
# ===============
from dap import deep_learning_settings as dl_dap_settings
dl_dap_settings.epochs = 500
dl_dap_settings.batch_size = 64
    # dl_dap_settings.fit_verbose = 1
dl_dap_settings.loss = 'categorical_crossentropy'
dl_dap_settings.shuffle = True
from keras.optimizers import Adadelta
dl_dap_settings.optimizer = Adadelta()
dap = MultiLayerPerceptronRunner(dataset, target_prediction=args.target)
trained_model = dap.run(verbose=True)
    # Predict on Test: manually replicate dap.predict_on_test (scale, select
    # the best-ranked features, prepare the data, then predict with the model)
X_test = dap.X_test
Y_test = dap.y_test
_, X_test = dap._apply_scaling(dap.X, X_test)
feature_ranking = dap._best_feature_ranking[:dap._nb_features]
X_test = dap._select_ranked_features(feature_ranking, X_test)
X_test = dap._prepare_data(X_test, learning_phase=False)
predictions = dap._predict(trained_model, X_test)
    print(Y_test.shape, predictions[0].shape)  # sanity check on the shapes
dap._compute_test_metrics(Y_test, predictions)
dap._save_test_metrics_to_file(dap.results_folder)
# dap.predict_on_test(trained_model)
print("Computation completed!")
if __name__ == '__main__':
main()
"""
Random Forest DAP Runner on NB Dataset
"""
import os
from dap.runners import RandomForestRunnerDAP
from dataset import load_nb_target, load_nb_camda
from dataset import EFS_LAB, OS_LAB, HR_LAB
import argparse
class RandomForestDAP(RandomForestRunnerDAP):
"""
RandomForest DAP Runner
"""
def __init__(self, experiment, target_prediction):
"""
Parameters
----------
experiment: sklearn.datasets.base.Bunch
            Bunch object embedding the dataset (meta)data, as returned by the
            load_nb_target function (see dataset.load_nb_target)
target_prediction: str
            The target classification end-point (one of HR, EFS, OS)
"""
# One of: 'HR', 'EFS', 'OS'
self._target_prediction_name = target_prediction
super(RandomForestDAP, self).__init__(experiment=experiment)
self.experiment_data.nb_classes = experiment.nb_classes_targets[self._target_prediction_name]
@property
def results_folder(self):
"""
Compose path to the folder where reports and metrics will be saved.
"""
base_foldername = self._target_prediction_name.lower()
folder_name = '_'.join([self.ml_model_name, str(self._hyper_params['n_estimators']),
self._hyper_params['criterion'], self._hyper_params['max_features'],
self.feature_scaling_name, self.feature_ranking_name,
str(self.cv_n), str(self.cv_k)])
ds_name = self.experiment_data.dataset_name \
if 'dataset_name' in self.experiment_data.keys() else self.experiment_data.DESCR
output_folder_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
'results', ds_name, base_foldername, folder_name)
os.makedirs(output_folder_path, exist_ok=True)
return output_folder_path
@property
def ml_model_name(self):
return "RandomForest"
# Override DAP Methods
# --------------------
def _set_training_data(self):
"""Default implementation for classic and quite standard DAP implementation.
More complex implementation require overriding this method.
"""
self.X = self.experiment_data.training_data
self.y = self.experiment_data.targets[self._target_prediction_name]
def _set_test_data(self):
self.X_test = self.experiment_data.test_data
self.y_test = self.experiment_data.test_targets[self._target_prediction_name]
def main():
# =======================
# Parse Arguments from CL
# =======================
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dataset", help="Target NB dataset to analyse",
type=str, choices=['TARGET', 'SEQC'])
parser.add_argument("-hr", "--hronly", help="Analyse High risk cohort only",
type=bool, default=False)
parser.add_argument("-t", "--target", help="Target Endpoint prediction",
type=str, choices=(EFS_LAB, OS_LAB, HR_LAB))
args = parser.parse_args()
if args.dataset == 'TARGET':
load_ds_fn = load_nb_target
else:
load_ds_fn = load_nb_camda
# Load Dataset
# ============
dataset = load_ds_fn(hr_only=args.hronly)
print('RUNNING ON DATASET {} (HR-ONLY: {})'.format(dataset.dataset_name.upper(),
args.hronly))
# ============
# DAP Settings