#%%
import argparse
import os
from collections import Counter, OrderedDict
from itertools import combinations
from pathlib import Path

import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import numpy as np
import pandas as pd

#%%
parser = argparse.ArgumentParser(
    description='Compute Borda of Bordas for juXT and rSNF'
)
parser.add_argument('--outfolder', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument(
    '--model', type=str, help='Classifier to use: randomForest or LSVM'
)
parser.add_argument('--layers', type=str, nargs='+', help='List of omic layers')
parser.add_argument(
    '--n_splits_end', type=int, help='Upper end of splits range - not inclusive'
)
parser.add_argument('--n_splits_start', type=int, help='Lower end of splits range')
parser.add_argument('--mode', type=str, help='juxt, rSNF, rSNFi or single')

args = parser.parse_args()
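# Example invocation (dataset/target values are illustrative, not from a real run):
#   python compute_all_metrics.py --outfolder results --dataset tcga_brca \
#       --target ER --model randomForest --layers gene meth mirna \
#       --n_splits_start 0 --n_splits_end 10 --mode rSNF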

#%%
OUTFOLDER = args.outfolder
DATASET = args.dataset
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
MODE = args.mode

assert MODE in ['juxt', 'rSNF', 'rSNFi', 'single']
#%%
PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}'

N_LAYERS = len(LAYERS)


#%%
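# Summary table: one row per layer combination, holding the mean and bootstrap
# confidence bounds (min/max) of each training metric across splits, the test
# MCC, and the selected feature counts.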
df_results = pd.DataFrame(
    columns=[
        'layers',
        'mcc_train',
        'mcc_train_min',
        'mcc_train_max',
        'auc_train',
        'auc_train_min',
        'auc_train_max',
        'sens_train',
        'sens_train_min',
        'sens_train_max',
        'spec_train',
        'spec_train_min',
        'spec_train_max',
        'ppv_train',
        'ppv_train_min',
        'ppv_train_max',
        'mcc_test',
        'mcc_test_min',
        'mcc_test_max',
        'best_feat',
    ]
)

if MODE == 'single':
    range_combinations = range(1, 2)
else:
    range_combinations = range(2, N_LAYERS + 1)
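# range_combinations: a single layer at a time in 'single' mode, otherwise every
# combination of 2 up to all N_LAYERS layers.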

for k in range_combinations:

    for comb in combinations(LAYERS, k):

        layers_concat = '_'.join(comb)
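        # Accumulators over splits: training metrics at the selected number of
        # features, test MCC, and the feature count chosen in each split.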

        all_mccs = []
        all_sens = []
        all_spec = []
        all_ppv = []
        all_aucs = []

        all_test_mccs = []
        best_feat_steps = []

        for split_id in range(N_SPLITS_START, N_SPLITS_END):

            PATH = f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/{split_id}'
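            # File names depend on the mode: rSNF results are stored as
            # *_rankList files, the other modes as *_KBest files.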

            if MODE == 'rSNF':
                file_log = os.path.join(
                    PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_rankList.log'
                )
                file_metrics = os.path.join(
                    PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_rankList_allmetrics.txt'
                )
                file_MCC_test = os.path.join(
                    PATH, MODE, f'{layers_concat}_tr_MCC_scores.txt'
                )

            elif MODE == 'rSNFi':
                file_log = os.path.join(
                    PATH, f'{MODE}/{layers_concat}_ts_{MODEL}_KBest.log'
                )
                file_metrics = os.path.join(
                    PATH, f'{MODE}/{layers_concat}_ts_{MODEL}_KBest_allmetrics.txt'
                )
                file_MCC_test = os.path.join(
                    PATH, MODE, f'{layers_concat}_ts_MCC_scores.txt'
                )
            else:
                file_log = os.path.join(
                    PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_KBest.log'
                )
                file_metrics = os.path.join(
                    PATH, f'{MODE}/{layers_concat}_tr_{MODEL}_KBest_allmetrics.txt'
                )
                file_MCC_test = os.path.join(
                    PATH, MODE, f'{layers_concat}_tr_MCC_scores.txt'
                )

            with open(file_log) as f:
                log_content = f.readlines()
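            # The log reports the selected number of features in a line of the
            # form 'n_feats = <k>'.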

            for line in log_content:

                if 'mcc' in line:
                    mcc_test_line = line
                if "n_feats" in line:
                    best_feat_line = line
                    break

            best_feat = int(best_feat_line.split(' = ')[1][:-1])
            best_feat_steps.append(best_feat)

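            # Test-set MCC: read from the second line of the MCC_scores file
            # (tab-separated, value in the second field).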
            with open(file_MCC_test, 'r') as f:
                mcc_test = float(f.readlines()[1].split('\t')[1])

            all_test_mccs.append(mcc_test)

            all_metrics = pd.read_csv(file_metrics, sep='\t')
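            # Keep all rows of the training metrics table computed at the
            # selected number of features ('nf').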
            best_idxs = np.where(all_metrics["nf"] == best_feat)[0]

            MCC = np.where(all_metrics.columns == 'mcc')[0][0]
            best_mccs = all_metrics.iloc[best_idxs, MCC]
            # print(np.mean(best_mccs), best_feat)
            all_mccs.extend(best_mccs)

            AUC = np.where(all_metrics.columns == 'auc')[0][0]
            best_auc = all_metrics.iloc[best_idxs, AUC]
            all_aucs.extend(best_auc)

            if TARGET != 'subtypes':
                SENS = np.where(all_metrics.columns == 'sens')[0][0]
                best_sens = all_metrics.iloc[best_idxs, SENS]
                all_sens.extend(best_sens)

                SPEC = np.where(all_metrics.columns == 'spec')[0][0]
                best_spec = all_metrics.iloc[best_idxs, SPEC]
                all_spec.extend(best_spec)

                PPV = np.where(all_metrics.columns == 'ppv')[0][0]
                best_ppv = all_metrics.iloc[best_idxs, PPV]
                all_ppv.extend(best_ppv)

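        # Bootstrap confidence intervals of the mean metrics across splits
        # (bootstrapped uses a 95% interval by default).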
        all_mccs = np.array(all_mccs)
        MCC_CI = bs.bootstrap(all_mccs, stat_func=bs_stats.mean)
        print(
            'MCC train =',
            round(np.mean(all_mccs), 3),
            (round(MCC_CI.lower_bound, 3), round(MCC_CI.upper_bound, 3)),
        )

        all_aucs = np.array(all_aucs)
        AUC_CI = bs.bootstrap(all_aucs, stat_func=bs_stats.mean)
        print(
            'AUC train =',
            round(np.mean(all_aucs), 3),
            (round(AUC_CI.lower_bound, 3), round(AUC_CI.upper_bound, 3)),
        )

        all_test_mccs = np.array(all_test_mccs)
        MCC_TEST = bs.bootstrap(all_test_mccs, stat_func=bs_stats.mean)
        print(
            'MCC test =',
            round(np.mean(all_test_mccs), 3),
            (round(MCC_TEST.lower_bound, 3), round(MCC_TEST.upper_bound, 3)),
        )

        mean_features = np.mean(best_feat_steps)
        median_features = np.median(best_feat_steps)

        if TARGET != 'subtypes':
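            # For endpoints other than 'subtypes', sensitivity, specificity and
            # PPV were collected above; bootstrap their means as well.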
            all_sens = np.array(all_sens)
            all_spec = np.array(all_spec)
            all_ppv = np.array(all_ppv)
            SENS_CI = bs.bootstrap(all_sens, stat_func=bs_stats.mean)
            SPEC_CI = bs.bootstrap(all_spec, stat_func=bs_stats.mean)
            PPV_CI = bs.bootstrap(all_ppv, stat_func=bs_stats.mean)
            print(
                'SENS =',
                round(np.mean(all_sens), 3),
                (round(SENS_CI.lower_bound, 3), round(SENS_CI.upper_bound, 3)),
            )
            print(
                'SPEC =',
                round(np.mean(all_spec), 3),
                (round(SPEC_CI.lower_bound, 3), round(SPEC_CI.upper_bound, 3)),
            )
            print(
                'PPV =',
                round(np.mean(all_ppv), 3),
                (round(PPV_CI.lower_bound, 3), round(PPV_CI.upper_bound, 3)),
            )

            row = OrderedDict(
                {
                    'layers': layers_concat,
                    'mcc_train': round(np.mean(all_mccs), 3),
                    'mcc_train_min': round(MCC_CI.lower_bound, 3),
                    'mcc_train_max': round(MCC_CI.upper_bound, 3),
                    'auc_train': round(np.mean(all_aucs), 3),
                    'auc_train_min': round(AUC_CI.lower_bound, 3),
                    'auc_train_max': round(AUC_CI.upper_bound, 3),
                    'sens_train': round(np.mean(all_sens), 3),
                    'sens_train_min': round(SENS_CI.lower_bound, 3),
                    'sens_train_max': round(SENS_CI.upper_bound, 3),
                    'spec_train': round(np.mean(all_spec), 3),
                    'spec_train_min': round(SPEC_CI.lower_bound, 3),
                    'spec_train_max': round(SPEC_CI.upper_bound, 3),
                    'ppv_train': round(np.mean(all_ppv), 3),
                    'ppv_train_min': round(PPV_CI.lower_bound, 3),
                    'ppv_train_max': round(PPV_CI.upper_bound, 3),
                    'mcc_test': round(np.mean(all_test_mccs), 3),
                    'mcc_test_min': round(MCC_TEST.lower_bound, 3),
                    'mcc_test_max': round(MCC_TEST.upper_bound, 3),
                    'best_feat': best_feat_steps,
                    'mean_features': mean_features,
                    'median_features': median_features,
                }
            )
        else:
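            # sens/spec/ppv were not collected for the 'subtypes' endpoint,
            # so store NaN placeholders.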
            row = OrderedDict(
                {
                    'layers': layers_concat,
                    'mcc_train': round(np.mean(all_mccs), 3),
                    'mcc_train_min': round(MCC_CI.lower_bound, 3),
                    'mcc_train_max': round(MCC_CI.upper_bound, 3),
                    'auc_train': round(np.mean(all_aucs), 3),
                    'auc_train_min': round(AUC_CI.lower_bound, 3),
                    'auc_train_max': round(AUC_CI.upper_bound, 3),
                    'sens_train': np.nan,
                    'sens_train_min': np.nan,
                    'sens_train_max': np.nan,
                    'spec_train': np.nan,
                    'spec_train_min': np.nan,
                    'spec_train_max': np.nan,
                    'ppv_train': np.nan,
                    'ppv_train_min': np.nan,
                    'ppv_train_max': np.nan,
                    'mcc_test': round(np.mean(all_test_mccs), 3),
                    'mcc_test_min': round(MCC_TEST.lower_bound, 3),
                    'mcc_test_max': round(MCC_TEST.upper_bound, 3),
                    'best_feat': best_feat_steps,
                    'mean_features': mean_features,
                    'median_features': median_features,
                }
            )

        print(layers_concat, MODE, 'best_feats =', Counter(best_feat_steps))
        print("Mean features: ", mean_features)
        print("Median features: ", median_features)
        print('\n')

        # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
        df_results = pd.concat([df_results, pd.DataFrame([row])], ignore_index=True)

df_results.to_csv(
    f'{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/metrics_splits_{N_SPLITS_START}-{N_SPLITS_END}_{MODE}.txt',
    sep='\t',
    index=False,
)
# %%