Commit 34e0f0e8 authored by Alessia Marcolini

Trying split management

parent 43c00983
@@ -2,11 +2,12 @@ import os
import subprocess
# these can be set on runtime:
# snakemake --config datafolder="mydata" outfolder="out" dataset="breast" target="ER" layer1="gene" layer2="cnv" layer3="prot" (...)
# snakemake --config datafolder="mydata" outfolder="out" dataset="breast" target="ER" layer1="gene" layer2="cnv" layer3="prot" split_id="1"(...)
DATAFOLDER = config['datafolder']
OUTFOLDER = config['outfolder']
DATASET = config['dataset']
TARGET = config['target']
SPLIT_ID = config['split_id']
LAYERS = [config[k] for k in config.keys() if k.startswith('layer')]
LAYERS_CONCAT = "_".join(LAYERS)
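For reference, a minimal sketch in plain Python of how the layer configuration is collected, using the example values from the --config comment above (the dict literal is only an illustrative stand-in for Snakemake's config object):

# Illustrative stand-in for Snakemake's config dict, mirroring the --config
# flags in the comment above.
config = {
    "datafolder": "mydata", "outfolder": "out", "dataset": "breast",
    "target": "ER", "layer1": "gene", "layer2": "cnv", "layer3": "prot",
    "split_id": "1",
}

# Every key starting with 'layer' contributes one omics layer; the insertion
# order of the keys determines the order in the concatenated name.
layers = [config[k] for k in config if k.startswith("layer")]
layers_concat = "_".join(layers)
print(layers_concat)  # -> gene_cnv_prot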
@@ -14,9 +15,9 @@ LAYERS_CONCAT = "_".join(LAYERS)
rule all:
input:
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=[i for i in range(10)]),
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{outfolder}/{dataset}/{target}/{split_id}/single/{layer}_tr_MCC_scores.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layer=LAYERS, split_id=[i for i in range(10)])
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layer=LAYERS, split_id=SPLIT_ID)
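The edit above replaces the hard-coded split_id=[i for i in range(10)] with the single SPLIT_ID read from the config, so one Snakemake invocation now targets exactly one split. A minimal sketch of what expand() produces in the two cases, assuming snakemake is installed and reusing the example values from the config comment:

from snakemake.io import expand

# Single split id (this commit): one target path per layer.
print(expand("{outfolder}/{dataset}/{target}/{split_id}/single/{layer}_tr_MCC_scores.txt",
             outfolder="out", dataset="breast", target="ER",
             layer=["gene", "cnv", "prot"], split_id="1"))
# -> ['out/breast/ER/1/single/gene_tr_MCC_scores.txt', ...]  (one path per layer)

# With split_id=[i for i in range(10)] (the previous behaviour) expand() takes
# the product of the wildcard values, one target per (layer, split) pair.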
rule ml_juxt_tr:
input:
@@ -42,13 +43,13 @@ rule ml_juxt_val:
rule snf:
input:
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layer=LAYERS, split_id=[i for i in range(10)]),
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layer=LAYERS, split_id=SPLIT_ID),
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=[i for i in range(10)])
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID)
threads: 8
output:
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNF/INF_{layers}_tr.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=[i for i in range(10)])
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
run:
all_input = [i[1] for i in input.allitems()]
inputfiles = " ".join(all_input[:-1])
@@ -60,63 +61,75 @@ rule snf:
rule ml_rsnf_tr:
input:
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layer1}_{layer2}_tr.txt"),
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_tr.txt"),
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/INF_{layer1}_{layer2}_tr.txt"
expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID),
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/INF_{layers}_tr.txt",
output:
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layer1}_{layer2}_tr_RandomForest_rankList.log"
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_RandomForest_rankList.log",
shell:
"python sklearn_rf_training_fixrank.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNF --ranking rankList --rankFeats {input[2]}"
rule ml_rsnf_val:
input:
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layer1}_{layer2}_tr_RandomForest_rankList.log",
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layer1}_{layer2}_ts.txt"),
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_ts.txt")
output:
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layer1}_{layer2}_tr_MCC_scores.txt"
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_RandomForest_rankList.log",
"{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
"{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_MCC_scores.txt",
shell:
"python sklearn_rf_validation_writeperf.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNF --tslab {input[2]}"
rule myintersect:
input:
"{outfolder}/{dataset}/{target}/{split_id}/juxt/{layer1}_{layer2}_tr_RandomForest_KBest.log",
"{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layer1}_{layer2}_tr_RandomForest_rankList.log"
expand("{outfolder}/{dataset}/{target}/{split_id}/juxt/{layers}_tr_RandomForest_KBest.log",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_RandomForest_rankList.log",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
output:
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layer1}_{layer2}_intersect_tr.txt"
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_intersect_tr.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
shell:
"python intersect_biomarkers.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNFi/venn_{wildcards.layer1}_{wildcards.layer2}_tr.png {output} --title1 {wildcards.layer1} --title2 {wildcards.layer2}"
"python intersect_biomarkers.py {input} {output}" ########
rule extract:
input:
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layer1}_{layer2}_tr.txt"),
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layer1}_{layer2}_intersect_tr.txt"
expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_intersect_tr.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
output:
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layer1}_{layer2}_tr.txt"
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
shell:
"python extract_topfeats_onecol.py {input} {output}"
rule ml_rsnfi_tr:
input:
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layer1}_{layer2}_tr.txt",
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_tr.txt")
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
output:
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layer1}_{layer2}_tr_RandomForest_KBest.log"
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_RandomForest_KBest.log"
shell:
"python sklearn_rf_training_fixrank.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNFi --ranking KBest"
rule ml_rsnfi_val:
input:
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layer1}_{layer2}_tr_RandomForest_KBest.log",
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layer1}_{layer2}_ts.txt"),
os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_ts.txt")
expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_RandomForest_KBest.log",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT)
output:
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layer1}_{layer2}_tr_MCC_scores.txt"
"{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt"
shell:
"python sklearn_rf_validation_writeperf.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNFi --tslab {input[2]}"
@@ -22,14 +22,13 @@ matplotlib.use('Agg')
parser = argparse.ArgumentParser(description='Find the intersection between feature lists and produce Venn diagrams.')
parser.add_argument('CONFIGFILE1', type=str, help='Training experiment configuration file 1 (with info about number of top discriminant features)')
parser.add_argument('CONFIGFILE2', type=str, help='Training experiment configuration file 2 (with info about number of top discriminant features)')
parser.add_argument('OUTFILE', type=str, help='Output file for Venn diagram plot.')
# begin MC edit
parser.add_argument('OUTLIST', type=str, help='Output file for intersected feature list.')
# end MC edit
parser.add_argument('--title1', type=str, default='List_1', help='Name for first diagram (default: %(default)s)')
parser.add_argument('--title2', type=str, default='List_2', help='Name for second diagram (default: %(default)s)')
parser.add_argument('--configFile3', type=str, default='NO', help='Third configuration file - optional (default: %(default)s)')
parser.add_argument('--title3', type=str, default='List_3', help='Name for third diagram (default: %(default)s)')
parser.add_argument('OUTFILE', type=str, nargs='?', help='Output file for Venn diagram plot.')
parser.add_argument('--title1', type=str, default='List_1', nargs='?', help='Name for first diagram (default: %(default)s)')
parser.add_argument('--title2', type=str, default='List_2', nargs='?', help='Name for second diagram (default: %(default)s)')
parser.add_argument('--configFile3', type=str, default='NO', nargs='?', help='Third configuration file - optional (default: %(default)s)')
parser.add_argument('--title3', type=str, default='List_3', nargs='?', help='Name for third diagram (default: %(default)s)')
__author__ = 'Alessandro Zandona'
__date__ = '15 December 2016'
@@ -147,13 +146,13 @@ if (configfile3 != 'NO'):
writer.writerow([list(f2f3)[i], idx_list2+1, idx_list3+1])
# plot Venn diagrams
if (configfile3 != 'NO'):
v3_inter = pltv.venn3([feats1_set, feats2_set, feats3_set], (title1, title2, title3))
plt.title('Intersection of top discriminant features from %s, %s and %s' %(title1,title2,title3))
else:
v2_inter = pltv.venn2([feats1_set, feats2_set], (title1, title2))
plt.title('Intersection of top discriminant features from %s and %s' %(title1,title2))
plt.savefig(OUTFILE)
plt.close()
# # plot Venn diagrams
# if (configfile3 != 'NO'):
# v3_inter = pltv.venn3([feats1_set, feats2_set, feats3_set], (title1, title2, title3))
# plt.title('Intersection of top discriminant features from %s, %s and %s' %(title1,title2,title3))
# else:
# v2_inter = pltv.venn2([feats1_set, feats2_set], (title1, title2))
# plt.title('Intersection of top discriminant features from %s and %s' %(title1,title2))
# plt.savefig(OUTFILE)
# plt.close()
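The argparse change above makes OUTFILE an optional positional via nargs='?', consistent with the Venn plotting being commented out. A minimal, self-contained sketch of that argparse behaviour (a simplified interface, not the script's full argument set):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('OUTLIST', type=str, help='Output file for intersected feature list.')
parser.add_argument('OUTFILE', type=str, nargs='?', help='Optional output file for the Venn diagram plot.')

args = parser.parse_args(['intersection.txt'])              # OUTFILE omitted
print(args.OUTFILE)                                         # -> None
args = parser.parse_args(['intersection.txt', 'venn.png'])  # OUTFILE given
print(args.OUTFILE)                                         # -> venn.png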
@@ -8,7 +8,11 @@ DATAFOLDER=data
DATASET=tcga_breast
LAYER1=gene
LAYER2=cnv
TARGET=ER
TARGET=subtypes
# go!
snakemake --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 -p
for i in {0..1}
do
snakemake --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 split_id=$i -p
done
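For comparison, a hedged sketch of the same per-split driver loop written in Python with subprocess; OUTFOLDER and THREADS are placeholders, since their values are not visible in this excerpt of the script:

import subprocess

DATAFOLDER, OUTFOLDER = "data", "out"          # OUTFOLDER is a placeholder
DATASET, TARGET = "tcga_breast", "subtypes"
LAYER1, LAYER2, THREADS = "gene", "cnv", 8     # THREADS is a placeholder

for split_id in range(2):  # extend the range to cover all available splits
    subprocess.run(
        ["snakemake", "--cores", str(THREADS), "-p", "--config",
         f"datafolder={DATAFOLDER}", f"outfolder={OUTFOLDER}",
         f"dataset={DATASET}", f"target={TARGET}",
         f"layer1={LAYER1}", f"layer2={LAYER2}", f"split_id={split_id}"],
        check=True,
    )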
@@ -4,24 +4,27 @@
## Requires Python >= 2.7, mlpy >= 3.5
import numpy as np
import pandas as pd
import argparse
import configparser as ConfigParser
import csv
import glob
import os.path
from mlpy import borda_count, canberra_stability
from input_output import load_data
import performance as perf
import sys
import glob
import argparse
import configparser as ConfigParser
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import numpy as np
import pandas as pd
from mlpy import borda_count, canberra_stability
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import (StratifiedKFold, StratifiedShuffleSplit,
train_test_split)
from sklearn.multiclass import OneVsRestClassifier
import performance as perf
from input_output import load_data
__author__ = 'Marco Chierici'
__version__ = '2.5'
@@ -93,7 +96,7 @@ TUN_CV_K = 10
TUN_CV_P = 50
sample_names, var_names, x = load_data(DATAFILE)
y_orig = np.loadtxt(LABELSFILE, dtype=np.int)
y_orig = pd.read_csv(LABELSFILE, header=None).values
# encode labels
le = preprocessing.LabelEncoder()
y = le.fit_transform(y_orig)
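One note on the new label loading: pd.read_csv(..., header=None).values is a 2-D column array of shape (n_samples, 1), which LabelEncoder will ravel while emitting a conversion warning. A hedged sketch of the flattened variant ('labels.txt' is a placeholder path):

import pandas as pd
from sklearn import preprocessing

# .values has shape (n_samples, 1); .ravel() flattens it to 1-D so that
# LabelEncoder does not warn about a column vector being passed.
y_orig = pd.read_csv("labels.txt", header=None).values.ravel()
le = preprocessing.LabelEncoder()
y = le.fit_transform(y_orig)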