Commit 3c6ba2f4 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Compute snf ranking for each DAP 10x5cv fold

parent d80f90ae
......@@ -20,6 +20,9 @@ LAYERS = [config[k] for k in config.keys() if k.startswith('layer')]
LAYERS_CONCAT = "_".join(LAYERS)
LAYERS_SPACED = " ".join(LAYERS)
CV_N = 10
CV_K = 5
old_target = TARGET
try:
RANDOM = config['random']
......@@ -33,17 +36,30 @@ except:
rule all:
input:
expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/juxt/{layers}_tr_MCC_scores.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/{layers}_tr_MCC_scores.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_MCC_scores.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/single/{layer}_tr_MCC_scores.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layer=LAYERS, split_id=SPLIT_ID)
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layer=LAYERS, split_id=SPLIT_ID)
rule create_labels_random:
rule prepare_data_snf:
input:
f"{DATAFOLDER}/{DATASET}/{old_target}/{SPLIT_ID}/labels_{old_target}_tr.txt"
data = expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
labels = expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID)
output:
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr_{cv_n}_{cv_k}.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS, cv_n=list(range(CV_N)), cv_k=list(range(CV_K))),
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr_{cv_n}_{cv_k}.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, cv_n=list(range(CV_N)), cv_k=list(range(CV_K)))
shell:
'python prepare_data_snf.py --datafiles {input.data} --labelsfile {input.labels} --cv_k {CV_K} --cv_n {CV_N}'
rule create_labels_random:
output:
f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_tr.txt",
f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_ts.txt",
......@@ -92,7 +108,6 @@ rule create_labels_random:
rule concat_layers:
input:
rules.create_labels_random.output,
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
......@@ -116,7 +131,7 @@ rule ml_juxt_tr:
output:
"{outfolder}/{dataset}/{target}/{model}/{split_id}/juxt/{layers}_tr_{model}_KBest.log"
shell:
"python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/juxt --model {wildcards.model} --ranking KBest"
"python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/juxt --model {wildcards.model} --ranking KBest --cv_n {CV_N} --cv_k {CV_K}"
rule ml_juxt_val:
......@@ -130,16 +145,15 @@ rule ml_juxt_val:
shell:
"python sklearn_validation.py {input[0]} {input[2]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/juxt --tslab {input[3]}"
rule snf:
input:
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr_{{cv_n}}_{{cv_k}}.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layer=LAYERS, split_id=SPLIT_ID),
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID)
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr_{{cv_n}}_{{cv_k}}.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID),
threads: 8
output:
expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/INF_{layers}_tr.txt",
expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/INF_{layers}_tr_{{cv_n}}_{{cv_k}}.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
run:
all_input = [i[1] for i in input.allitems()]
......@@ -156,11 +170,12 @@ rule ml_rsnf_tr:
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID),
"{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/INF_{layers}_tr.txt",
snf_rankings = expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/INF_{layers}_tr_{cv_n}_{cv_k}.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, split_id=SPLIT_ID, layers=LAYERS_CONCAT, cv_n=list(range(CV_N)), cv_k=list(range(CV_K)))
output:
"{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/{layers}_tr_{model}_rankList.log",
shell:
"python sklearn_training.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNF --model {wildcards.model} --ranking rankList --rankFeats {input[2]}"
"python sklearn_training.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNF --model {wildcards.model} --cv_n {CV_N} --cv_k {CV_K} --ranking rankList --rankFeatsList {input.snf_rankings}"
rule ml_rsnf_val:
......@@ -212,7 +227,7 @@ rule ml_rsnfi_tr:
output:
"{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_{model}_KBest.log"
shell:
"python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNFi --model {wildcards.model} --ranking KBest"
"python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNFi --model {wildcards.model} --cv_n {CV_N} --cv_k {CV_K} --ranking KBest"
rule ml_rsnfi_val:
......@@ -236,7 +251,7 @@ rule single_tr:
output:
"{outfolder}/{dataset}/{target}/{model}/{split_id}/single/{layer}_tr_{model}_KBest.log"
shell:
"python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/single --model {wildcards.model} --ranking KBest"
"python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/single --model {wildcards.model} --ranking KBest --cv_n {CV_N} --cv_k {CV_K}"
rule single_val:
input:
......
# %%
import numpy as np
from sklearn.model_selection import StratifiedKFold
import os
import pandas as pd
from tqdm import tqdm
import argparse
#%%
class myArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that also reads arguments from files (``@file``).

    Each line of an argument file is split on whitespace; a token that
    begins with ``#`` starts a comment running to the end of the line.
    """

    def __init__(self, *args, **kwargs):
        super(myArgumentParser, self).__init__(*args, **kwargs)

    def convert_arg_line_to_args(self, line):
        """Return the argument tokens of one file line, honoring '#' comments."""
        collected = []
        for token in line.split():
            if token.startswith('#'):
                break  # rest of the line is a comment
            collected.append(token)
        return collected
# Command-line interface: data layers + labels file and the CV geometry
# (CV_N cycles of CV_K stratified folds) used downstream by the DAP.
parser = myArgumentParser(
    description='Run a training experiment (10x5-CV fold) using Random Forest as classifier.',
    fromfile_prefix_chars='@',
)
parser.add_argument('--datafiles', type=str, nargs='+', help='Training datafile')
parser.add_argument('--labelsfile', type=str, help='Sample labels')
# Use the builtin `int` converter: `np.int` was a deprecated alias of the
# builtin and was removed in NumPy 1.24, where it raises AttributeError.
parser.add_argument(
    '--cv_k', type=int, default=5, help='Number of CV folds (default: %(default)s)'
)
parser.add_argument(
    '--cv_n', type=int, default=10, help='Number of CV cycles (default: %(default)s)'
)
args = parser.parse_args()

DATAFILES = args.datafiles
LABELSFILE = args.labelsfile
CV_K = args.cv_k
CV_N = args.cv_n
#%%
# For every data layer, materialize the CV_N x CV_K stratified *training*
# folds as tab-separated files named `<stem>_<n>_<k><ext>`, together with the
# matching label files, so the per-fold SNF ranking can consume them.
#
# Loop-invariant work is hoisted: the labels are read and the filename stems
# split once, instead of once per datafile / per fold.  The original `ys`
# list (CV_N references to the same labels frame) was redundant and is gone.
original_labels = pd.read_csv(LABELSFILE, header=None)
labels_stem, labels_ext = os.path.splitext(LABELSFILE)

for datafile in DATAFILES:
    original_data = pd.read_csv(datafile, sep='\t', header=0, index_col=0)
    data_stem, data_ext = os.path.splitext(datafile)

    for n in tqdm(range(CV_N)):
        # random_state=n makes fold membership identical across datafiles,
        # so the per-fold files of all layers stay sample-aligned.
        skf = StratifiedKFold(CV_K, shuffle=True, random_state=n)
        for k, (idx_tr, _idx_ts) in enumerate(
            skf.split(original_data, original_labels)
        ):
            original_data.iloc[idx_tr].to_csv(
                f'{data_stem}_{n}_{k}{data_ext}', sep='\t'
            )
            # Labels are rewritten for each datafile (same content) to keep
            # the output set complete even with a single --datafiles entry.
            original_labels.iloc[idx_tr].to_csv(
                f'{labels_stem}_{n}_{k}{labels_ext}', sep='\t', header=None, index=False
            )
# %%
......@@ -93,9 +93,10 @@ parser.add_argument(
'--cv_n', type=np.int, default=10, help='Number of CV cycles (default: %(default)s)'
)
parser.add_argument(
'--rankFeats',
'--rankFeatsList',
type=str,
default='',
nargs='+',
help='Ranked features list to be used by Machine Learning [Feats name on 1st column, feats weights on 2nd column, with HEADER]',
)
......@@ -113,7 +114,7 @@ RANK_METHOD = args.RANK_METHOD
random_labels = args.random
CV_K = args.cv_k
CV_N = args.cv_n
RANKFEATS = args.rankFeats
RANKFEATSLIST = sorted(args.rankFeatsList)
BASEFILE = os.path.splitext(os.path.basename(DATAFILE))[0]
......@@ -152,12 +153,16 @@ y = le.fit_transform(y_orig)
is_multiclass = len(le.classes_) > 2
# If ranked list is given as input to DAP, read it and extract features index
if RANK_METHOD == "rankList":
rankedList = np.loadtxt(RANKFEATS, delimiter='\t', dtype=str, skiprows=1)
ranked_feats = rankedList[:, 0]
# Find index of features inside dataset
ranked_feats_idx = []
for el in ranked_feats.tolist():
ranked_feats_idx.append(np.where([el == vn for vn in var_names])[0][0])
ranked_feats_idx_list = []
for ranked_list in RANKFEATSLIST:
ranked_feats = np.loadtxt(ranked_list, delimiter='\t', dtype=str, skiprows=1)[
:, 0
]
# Find index of features inside dataset
ranked_feats_idx = []
for el in ranked_feats.tolist():
ranked_feats_idx.append(np.where([el == vn for vn in var_names])[0][0])
ranked_feats_idx_list.append(ranked_feats_idx)
scorer = make_scorer(matthews_corrcoef)
......@@ -257,7 +262,7 @@ for n in range(CV_N):
elif RANK_METHOD == "LSVM":
ranking_tmp = np.argsort(model.coef_[0] ** 2)[::-1]
elif RANK_METHOD == 'rankList':
ranking_tmp = ranked_feats_idx
ranking_tmp = ranked_feats_idx_list[(n * CV_K) + i]
RANKING[(n * CV_K) + i] = ranking_tmp
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment