# borda_global_rSNFi.py
#%%
import os
import numpy as np
import pandas as pd
import operator
import argparse
from itertools import combinations

#%%
class myArgumentParser(argparse.ArgumentParser):
    """ArgumentParser with a friendlier syntax for ``@file`` argument files.

    The stock parser treats every line of an argument file as exactly one
    argument.  This subclass instead splits each line on whitespace, so a
    line may hold several arguments, and ignores everything from the first
    token that starts with ``#`` to the end of the line.
    """

    def convert_arg_line_to_args(self, line):
        """Yield the arguments contained in one line of an argument file.

        Args:
            line: A single line read from the argument file.

        Yields:
            Each whitespace-separated token, in order, stopping at the first
            token that begins with ``#`` (the rest of the line is a comment).
        """
        # str.split() with no separator never produces empty tokens, so no
        # explicit blank-token check is needed.
        for arg in line.split():
            if arg.startswith('#'):
                break
            yield arg


# Command-line interface.  Arguments may also be supplied through a file
# referenced as ``@path`` on the command line (see ``fromfile_prefix_chars``).
parser = myArgumentParser(
    description='Compute the global Borda feature ranking over all splits (rSNFi)',
    fromfile_prefix_chars='@',
)
parser.add_argument('--datafolder', type=str, help='Main data folder')
parser.add_argument('--outfolder', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--model', type=str, help='Classifiers implemented, randomForest or LSVM')
# --layers and --mode are used unconditionally right after parsing, so a
# missing value is reported by argparse instead of failing later on None.
parser.add_argument(
    '--layers',
    type=str,
    nargs='+',
    required=True,
    help='Omic layers; every combination of two or more layers is processed',
)
parser.add_argument(
    '--n_splits',
    type=int,
    help='Number of data splits (sub-folders 0..n_splits-1 of the results folder)',
)
parser.add_argument('--mode', type=str, required=True, help='rSNFi custom Borda')

args = parser.parse_args()


# Bind the parsed command-line options to module-level constants.
DATAFOLDER = args.datafolder
DATASET = args.dataset
OUTFOLDER = args.outfolder
TARGET = args.target
MODEL = args.model
LAYERS = args.layers
N_SPLITS = args.n_splits
MODE = args.mode
# Explicit check rather than `assert`: assertions are stripped when Python
# runs with -O, which would let an unsupported mode slip through.
if MODE != 'rSNFi':
    raise ValueError(f"--mode must be 'rSNFi', got {MODE!r}")

N_LAYERS = len(LAYERS)

# CV_K * CV_N is the number of ranking rows consumed from each split's
# ranking file (presumably 5-fold cross-validation repeated 10 times --
# confirm against the script that writes the rankings).
CV_K = 5
CV_N = 10

#%% 
def _split_file(split, layers_concat, suffix):
    """Return the path of a per-split KBest file for one layer combination.

    Args:
        split: Index of the data split (name of the split sub-folder).
        layers_concat: Layer names of the combination joined by ``_``.
        suffix: File-name ending, e.g. ``featurelist.txt``.
    """
    return os.path.join(
        OUTFOLDER,
        DATASET,
        TARGET,
        MODEL,
        f'{split}/{MODE}/{layers_concat}_tr_{MODEL}_KBest_{suffix}',
    )


# Number of ranking rows read from every split.
RUNS_PER_SPLIT = CV_K * CV_N

# For every combination of two or more layers, merge the per-split rankings
# into one list ordered by the mean position of each feature.
for n_comb in range(2, N_LAYERS + 1):

    for comb in combinations(LAYERS, n_comb):

        layers_concat = '_'.join(comb)

        # Read every split's feature list once; it is needed both to build
        # the union of features and to map feature ids back to names.
        split_feats = [
            pd.read_csv(_split_file(split, layers_concat, 'featurelist.txt'), sep='\t')
            for split in range(N_SPLITS)
        ]

        # Sorted union: a bare set would order features by string hash, which
        # changes between interpreter runs and would make the order of tied
        # features in the output non-reproducible.
        all_feats = sorted(
            {name for feats in split_feats for name in feats.FEATURE_NAME}
        )

        # A feature that does not appear in a given ranking keeps the worst
        # possible position, len(all_feats) - 1, for that run.
        worst_position = float(len(all_feats) - 1)
        positions = {
            name: np.full(N_SPLITS * RUNS_PER_SPLIT, worst_position)
            for name in all_feats
        }

        for split, feats in enumerate(split_feats):

            # Lookup table FEATURE_ID -> FEATURE_NAME for this split
            # (ids are used directly as list indices).
            id_to_name = [None] * len(feats)
            for feat_id, feat_name in zip(feats.FEATURE_ID, feats.FEATURE_NAME):
                id_to_name[feat_id] = feat_name

            # One row per run; column c holds the id of the feature ranked
            # at position c in that run.
            rankings = pd.read_csv(
                _split_file(split, layers_concat, 'ranking.csv.gz'),
                header=None,
                sep='\t',
            )
            ranking_ids = rankings.to_numpy()

            offset = split * RUNS_PER_SPLIT
            for run in range(RUNS_PER_SPLIT):
                for position, feat_id in enumerate(ranking_ids[run]):
                    positions[id_to_name[feat_id]][offset + run] = float(position)

        means = {name: np.mean(pos) for name, pos in positions.items()}

        # Lowest mean position first; the sort is stable, so ties keep the
        # alphabetical order of all_feats.
        sorted_means = sorted(means.items(), key=operator.itemgetter(1))

        borda_df = pd.DataFrame(sorted_means, columns=['FEATURE_NAME', 'MEAN_POS'])
        borda_df.to_csv(f"{OUTFOLDER}/{DATASET}/{TARGET}/{MODEL}/Borda_allSplits_{MODE}_{layers_concat}.txt", sep='\t', index=False, float_format="%.3f")