#%%
"""Aggregate per-split classification metrics (MCC/AUC/sens/spec/PPV) across
layer combinations for juXT / rSNF / rSNFi / single runs, bootstrap 95% CIs,
and write one summary table per (dataset, target, model, mode)."""
import argparse
import os
from collections import Counter, OrderedDict
from itertools import combinations
from pathlib import Path

import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import numpy as np
import pandas as pd

#%%
parser = argparse.ArgumentParser(
    description='Compute Borda of Bordas for juXT and rSNF'
)
parser.add_argument('--outfolder', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument(
    '--model', type=str, help='classifiers implemented, randomForest or LSVM'
)
parser.add_argument('--layers', type=str, nargs='+', help='List of omic layers')
parser.add_argument(
    '--n_splits_end', type=int, help='Upper end of splits range - not inclusive'
)
parser.add_argument('--n_splits_start', type=int, help='Lower end of splits range')
parser.add_argument('--mode', type=str, help='juXT, rSNF, rSNFi or single')

args = parser.parse_args()

#%%
OUTFOLDER = args.outfolder
DATASET = args.dataset
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
MODE = args.mode

# Validate with an explicit error instead of `assert` (asserts vanish under -O).
if MODE not in ['juxt', 'rSNF', 'rSNFi', 'single']:
    raise SystemExit(f"--mode must be one of juxt, rSNF, rSNFi, single (got {MODE!r})")

#%%
N_LAYERS = len(LAYERS)

# Column order of the output table (extra feature-count columns are appended
# per row; pd.DataFrame preserves the row-dict key order, which matches this).
RESULT_COLUMNS = [
    'layers',
    'mcc_train', 'mcc_train_min', 'mcc_train_max',
    'auc_train', 'auc_train_min', 'auc_train_max',
    'sens_train', 'sens_train_min', 'sens_train_max',
    'spec_train', 'spec_train_min', 'spec_train_max',
    'ppv_train', 'ppv_train_min', 'ppv_train_max',
    'mcc_test', 'mcc_test_min', 'mcc_test_max',
    'best_feat',
]


def _result_files(split_path, mode, model, layers_concat):
    """Return (log, all-metrics, MCC-test) file paths for one CV split.

    rSNFi evaluates on the test portion ('ts'); every other mode on train
    ('tr'). rSNF ranks features ('rankList'); other modes use KBest.
    """
    split_tag = 'ts' if mode == 'rSNFi' else 'tr'
    selector = 'rankList' if mode == 'rSNF' else 'KBest'
    stem = f'{layers_concat}_{split_tag}_{model}_{selector}'
    file_log = os.path.join(split_path, f'{mode}/{stem}.log')
    file_metrics = os.path.join(split_path, f'{mode}/{stem}_allmetrics.txt')
    file_mcc_test = os.path.join(
        split_path, mode, f'{layers_concat}_{split_tag}_MCC_scores.txt'
    )
    return file_log, file_metrics, file_mcc_test


def _best_n_feats(file_log):
    """Parse the selected feature count from the first 'n_feats = N' log line.

    Raises a descriptive error if the log has no such line (the original code
    crashed with a NameError in that case).
    """
    with open(file_log) as f:
        for line in f:
            if 'n_feats' in line:
                # Line looks like '... n_feats = <int>\n'; [:-1] drops the newline.
                return int(line.split(' = ')[1][:-1])
    raise ValueError(f'no "n_feats" line found in {file_log}')


def _mean_ci(values):
    """Return (mean, CI lower, CI upper) of *values*, each rounded to 3 dp.

    The CI is a bootstrap 95% confidence interval of the mean.
    """
    arr = np.array(values)
    ci = bs.bootstrap(arr, stat_func=bs_stats.mean)
    return (
        round(np.mean(arr), 3),
        round(ci.lower_bound, 3),
        round(ci.upper_bound, 3),
    )


# 'single' mode evaluates each layer alone; the multi-omics modes evaluate
# every combination of >= 2 layers.
if MODE == 'single':
    range_combinations = range(1, 2)
else:
    range_combinations = range(2, N_LAYERS + 1)

rows = []
for k in range_combinations:
    for comb in combinations(LAYERS, k):
        layers_concat = '_'.join(comb)
        all_mccs, all_aucs = [], []
        all_sens, all_spec, all_ppv = [], [], []
        all_test_mccs = []
        best_feat_steps = []

        # Pool the per-split metrics at the selected feature count.
        for split_id in range(N_SPLITS_START, N_SPLITS_END):
            split_path = f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/{split_id}'
            file_log, file_metrics, file_mcc_test = _result_files(
                split_path, MODE, MODEL, layers_concat
            )

            best_feat = _best_n_feats(file_log)
            best_feat_steps.append(best_feat)

            # Second line, second tab-separated field holds the test MCC.
            with open(file_mcc_test, 'r') as f:
                all_test_mccs.append(float(f.readlines()[1].split('\t')[1]))

            all_metrics = pd.read_csv(file_metrics, sep='\t')
            best = all_metrics[all_metrics['nf'] == best_feat]
            all_mccs.extend(best['mcc'])
            all_aucs.extend(best['auc'])
            if TARGET != 'subtypes':
                # Multiclass 'subtypes' runs do not report these metrics.
                all_sens.extend(best['sens'])
                all_spec.extend(best['spec'])
                all_ppv.extend(best['ppv'])

        mcc_mean, mcc_lo, mcc_hi = _mean_ci(all_mccs)
        print('MCC train =', mcc_mean, (mcc_lo, mcc_hi))
        auc_mean, auc_lo, auc_hi = _mean_ci(all_aucs)
        print('AUC train =', auc_mean, (auc_lo, auc_hi))
        mcc_t_mean, mcc_t_lo, mcc_t_hi = _mean_ci(all_test_mccs)
        print('MCC test =', mcc_t_mean, (mcc_t_lo, mcc_t_hi))

        mean_features = np.mean(best_feat_steps)
        median_features = np.median(best_feat_steps)

        if TARGET != 'subtypes':
            sens_mean, sens_lo, sens_hi = _mean_ci(all_sens)
            spec_mean, spec_lo, spec_hi = _mean_ci(all_spec)
            ppv_mean, ppv_lo, ppv_hi = _mean_ci(all_ppv)
            print('SENS =', sens_mean, (sens_lo, sens_hi))
            print('SPEC =', spec_mean, (spec_lo, spec_hi))
            print('PPV =', ppv_mean, (ppv_lo, ppv_hi))
        else:
            sens_mean = sens_lo = sens_hi = np.nan
            spec_mean = spec_lo = spec_hi = np.nan
            ppv_mean = ppv_lo = ppv_hi = np.nan

        rows.append(
            OrderedDict(
                {
                    'layers': layers_concat,
                    'mcc_train': mcc_mean,
                    'mcc_train_min': mcc_lo,
                    'mcc_train_max': mcc_hi,
                    'auc_train': auc_mean,
                    'auc_train_min': auc_lo,
                    'auc_train_max': auc_hi,
                    'sens_train': sens_mean,
                    'sens_train_min': sens_lo,
                    'sens_train_max': sens_hi,
                    'spec_train': spec_mean,
                    'spec_train_min': spec_lo,
                    'spec_train_max': spec_hi,
                    'ppv_train': ppv_mean,
                    'ppv_train_min': ppv_lo,
                    'ppv_train_max': ppv_hi,
                    'mcc_test': mcc_t_mean,
                    'mcc_test_min': mcc_t_lo,
                    'mcc_test_max': mcc_t_hi,
                    'best_feat': best_feat_steps,
                    'mean_features': mean_features,
                    'median_features': median_features,
                }
            )
        )

        print(layers_concat, MODE, 'best_feats =', Counter(best_feat_steps))
        print("Mean features: ", mean_features)
        print("Median features: ", median_features)
        print('\n')

# Build the table once at the end: DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, and per-row append is quadratic anyway.
if rows:
    df_results = pd.DataFrame(rows)
else:
    df_results = pd.DataFrame(columns=RESULT_COLUMNS)

df_results.to_csv(
    f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}.txt',
    sep='\t',
    index=False,
)

# %%