# compute_all_metrics.py
#%%
import pandas as  pd
import argparse
from itertools import combinations
import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
from collections import Counter, OrderedDict
from pathlib import Path
import os
#%%
class myArgumentParser(argparse.ArgumentParser):
    """ArgumentParser with a friendlier syntax for argument files.

    When arguments are read from a file (see ``fromfile_prefix_chars``),
    each line may hold several whitespace-separated arguments, and anything
    from the first token starting with ``#`` to the end of the line is
    treated as a comment.
    """

    def convert_arg_line_to_args(self, line):
        """Yield the arguments found on one line of an argument file.

        Parameters
        ----------
        line : str
            A single line read from the argument file.

        Yields
        ------
        str
            Each whitespace-separated token, stopping at the first token
            that begins with ``#``.
        """
        # str.split() with no separator never produces empty tokens, so no
        # blank-token check is needed here.
        for token in line.split():
            if token.startswith('#'):
                return
            yield token

# Command-line interface. Arguments may also be supplied through a file
# referenced as ``@path`` (parsed by myArgumentParser).
parser = myArgumentParser(
    description='Compute metrics on all splits', fromfile_prefix_chars='@'
)
# Every option is needed further down (to build paths or drive the loops), so
# each is marked required to fail early with a usage message instead of
# silently producing ``None``.
parser.add_argument('--outfolder', type=str, required=True, help='Results folder')
parser.add_argument('--dataset', type=str, required=True, help='Dataset name')
parser.add_argument('--target', type=str, required=True, help='Clinical endpoint')
parser.add_argument('--model', type=str, required=True, help='classifiers implemented, randomForest or LSVM')
parser.add_argument(
    '--layers', type=str, nargs='+', required=True,
    help='Layer names; every combination of two or more layers is evaluated',
)
parser.add_argument(
    '--n_splits', type=int, required=True,
    help='Number of splits; split folders 0 .. n_splits-1 are read',
)
parser.add_argument(
    '--mode', type=str, required=True,
    choices=['juxt', 'rSNF', 'rSNFi', 'single'],
    help='juxt, rSNF, rSNFi, single',
)

args = parser.parse_args()

#%%
# Module-level configuration taken from the parsed command line.
OUTFOLDER = args.outfolder
DATASET = args.dataset
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS = args.n_splits
MODE = args.mode

# Explicit check rather than ``assert``: assertions are stripped when Python
# runs with -O, which would let an unsupported mode through.
if MODE not in ('juxt', 'rSNF', 'rSNFi', 'single'):
    raise ValueError(
        f"--mode must be one of 'juxt', 'rSNF', 'rSNFi', 'single'; got {MODE!r}"
    )
#%%
# Base results folder for this dataset / target / model.
PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}'

N_LAYERS = len(LAYERS)


#%%
# Column layout of the summary table (one row per layer combination).
_RESULT_COLUMNS = [
    'layers',
    'mcc_train', 'mcc_train_min', 'mcc_train_max',
    'auc_train', 'auc_train_min', 'auc_train_max',
    'sens_train', 'sens_train_min', 'sens_train_max',
    'spec_train', 'spec_train_min', 'spec_train_max',
    'mcc_test', 'mcc_test_min', 'mcc_test_max',
    'best_feat',
]


def _read_split_log(file_log):
    """Extract the best feature count and the test MCC from one split log.

    The log is scanned from the top. The last line containing ``mcc`` seen up
    to and including the first line containing ``n_feats`` provides the test
    MCC; that ``n_feats`` line provides the feature count. Both lines are
    expected to look like ``<name> = <value>``.

    Parameters
    ----------
    file_log : str
        Path of the log file to read.

    Returns
    -------
    tuple of (int, float)
        ``(best_feat, mcc_test)``.

    Raises
    ------
    ValueError
        If the log has no ``n_feats`` line, or no ``mcc`` line before it.
    """
    mcc_test_line = None
    best_feat_line = None
    with open(file_log) as f:
        for line in f:
            if 'mcc' in line:
                mcc_test_line = line
            if 'n_feats' in line:
                best_feat_line = line
                break

    # Fail loudly instead of reusing values left over from a previous split.
    if best_feat_line is None or mcc_test_line is None:
        raise ValueError(
            f"{file_log}: expected an 'mcc' line followed by an 'n_feats' line"
        )

    # strip() rather than [:-1]: a final line without a trailing newline
    # would otherwise lose its last digit.
    best_feat = int(best_feat_line.split(' = ')[1].strip())
    mcc_test = float(mcc_test_line.split(' = ')[1].strip())
    return best_feat, mcc_test


def _mean_with_ci(label, values):
    """Print and return the mean of *values* with bootstrap bounds.

    Parameters
    ----------
    label : str
        Text printed in front of the numbers.
    values : sequence of float
        Observations to summarise.

    Returns
    -------
    tuple
        ``(mean, lower_bound, upper_bound)``, each rounded to 3 decimals.
    """
    values = np.array(values)
    ci = bs.bootstrap(values, stat_func=bs_stats.mean)
    summary = (
        round(np.mean(values), 3),
        round(ci.lower_bound, 3),
        round(ci.upper_bound, 3),
    )
    print(label, summary[0], (summary[1], summary[2]))
    return summary


df_results = pd.DataFrame(columns=_RESULT_COLUMNS)
# Rows are collected in a list and the frame is rebuilt from it, because
# DataFrame.append no longer exists in pandas >= 2.0.
_result_rows = []

for k in range(2, N_LAYERS + 1):

    for comb in combinations(LAYERS, k):

        layers_concat = '_'.join(comb)

        all_mccs = []
        all_sens = []
        all_spec = []
        all_aucs = []

        all_test_mccs = []
        best_feat_steps = []

        for split_id in range(N_SPLITS):

            PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/{split_id}'

            # rSNF runs write "rankList" files, every other mode "KBest" files.
            suffix = 'rankList' if MODE == 'rSNF' else 'KBest'
            file_log = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_{suffix}.log')
            file_metrics = os.path.join(PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_{suffix}_allmetrics.txt')

            best_feat, mcc_test = _read_split_log(file_log)
            best_feat_steps.append(best_feat)
            all_test_mccs.append(mcc_test)

            # Keep only the metric rows obtained with the selected number of
            # features ("nf" column).
            all_metrics = pd.read_csv(file_metrics, sep='\t')
            best_rows = all_metrics.loc[all_metrics['nf'] == best_feat]

            all_mccs.extend(best_rows['mcc'])
            all_aucs.extend(best_rows['auc'])

            # Sensitivity / specificity are skipped for the 'subtypes' target.
            if TARGET != 'subtypes':
                all_sens.extend(best_rows['sens'])
                all_spec.extend(best_rows['spec'])

        mcc_train = _mean_with_ci('MCC train =', all_mccs)
        auc_train = _mean_with_ci('AUC train =', all_aucs)
        mcc_test_summary = _mean_with_ci('MCC test =', all_test_mccs)

        if TARGET != 'subtypes':
            sens_train = _mean_with_ci('SENS =', all_sens)
            spec_train = _mean_with_ci('SPEC =', all_spec)
        else:
            sens_train = (np.nan, np.nan, np.nan)
            spec_train = (np.nan, np.nan, np.nan)

        row = {'layers': layers_concat}
        for prefix, summary in (
            ('mcc_train', mcc_train),
            ('auc_train', auc_train),
            ('sens_train', sens_train),
            ('spec_train', spec_train),
            ('mcc_test', mcc_test_summary),
        ):
            row[prefix] = summary[0]
            row[f'{prefix}_min'] = summary[1]
            row[f'{prefix}_max'] = summary[2]
        row['best_feat'] = best_feat_steps

        print(layers_concat, MODE, 'best_feats =', Counter(best_feat_steps))
        print('\n')

        _result_rows.append(row)

    df_results = pd.DataFrame(_result_rows, columns=_RESULT_COLUMNS)
    # NOTE(review): this write happens once per k (outside the combinations
    # loop), so each file holds every row accumulated so far and is named
    # after the last combination of that size - confirm this is intended.
    df_results.to_csv(f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_allSplits_{MODE}_{layers_concat}.txt', sep='\t', index=False)
# %%