Commit 69275c80 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Add concat_layer step to pipeline

parent c03b92f6
#%%
import os
import subprocess
#%%
# these can be set on runtime:
# snakemake --config datafolder="mydata" outfolder="out" dataset="breast" target="ER" layer1="gene" layer2="cnv" layer3="prot" split_id="1"(...)
# Pipeline-wide settings read from Snakemake's --config key/value store.
DATAFOLDER = config['datafolder']
OUTFOLDER = config['outfolder']
DATASET = config['dataset']
TARGET = config['target']
SPLIT_ID = config['split_id']
# Every config key named layer1, layer2, ... contributes one omic layer.
# NOTE(review): the resulting order follows config insertion order —
# confirm layers are always passed on the command line in the intended order.
LAYERS = [config[k] for k in config.keys() if k.startswith('layer')]
# Concatenated layer names (e.g. "gene_cnv_prot"), used in juxtaposed file names.
LAYERS_CONCAT = "_".join(LAYERS)
# Target rule: request the final MCC score files for all four model
# flavours (juxtaposed, rSNF, rSNFi, and per-layer single models)
# for the configured dataset/target/split.
rule all:
    input:
        expand("{outfolder}/{dataset}/{target}/{split_id}/juxt/{layers}_tr_MCC_scores.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_MCC_scores.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        # single-layer models: one score file per layer (note: layer=LAYERS, not LAYERS_CONCAT)
        expand("{outfolder}/{dataset}/{target}/{split_id}/single/{layer}_tr_MCC_scores.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layer=LAYERS, split_id=SPLIT_ID)
# Train a Random Forest on the juxtaposed (pre-concatenated) training
# matrix, ranking features with KBest; produces a training log.
rule ml_juxt_tr:
    input:
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layers}_tr.txt"),
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_tr.txt")
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/juxt/{layers}_tr_RandomForest_KBest.log"
    shell:
        "python sklearn_rf_training_fixrank.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/juxt --ranking KBest"

# Validate the juxtaposed model on the held-out test matrix and write
# the MCC score file consumed by `rule all`.
rule ml_juxt_val:
    input:
        # input[0]: training log (dependency + model locator), input[1]: test data, input[2]: test labels
        "{outfolder}/{dataset}/{target}/{split_id}/juxt/{layers}_tr_RandomForest_KBest.log",
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layers}_ts.txt"),
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_ts.txt")
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/juxt/{layers}_tr_MCC_scores.txt"
    shell:
        "python sklearn_rf_validation_writeperf.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/juxt --tslab {input[2]}"
# Integrate the single-layer training matrices with Similarity Network
# Fusion (external R script) to produce the rSNF feature ranking file.
rule snf:
    input:
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layer=LAYERS, split_id=SPLIT_ID),
        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID)
    threads: 8
    output:
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNF/INF_{layers}_tr.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    run:
        # input.allitems() yields (name, value) pairs; keep the file paths only.
        # NOTE(review): relies on Snakemake preserving input declaration order,
        # so the labels file expands to the LAST item — confirm against the
        # Snakemake version in use.
        all_input = [i[1] for i in input.allitems()]
        inputfiles = " ".join(all_input[:-1])
        labfile = all_input[-1]
        # shell=True is needed to pass the whole multi-token command string.
        subprocess.call(f"Rscript snf_integration.R --data {inputfiles} --lab {labfile} \
            --scriptDir SNFtools/ --clust spectral --threads {threads} \
            --outf {output}", shell=True)
# Train a Random Forest on the juxtaposed matrix, but rank features by
# the rSNF integration list (rankList) instead of KBest.
rule ml_rsnf_tr:
    input:
        # input[0]: juxtaposed training data, input[1]: training labels,
        # input[2]: rSNF feature ranking produced by `rule snf`
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_tr.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID),
        "{outfolder}/{dataset}/{target}/{split_id}/rSNF/INF_{layers}_tr.txt",
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_RandomForest_rankList.log",
    shell:
        "python sklearn_rf_training_fixrank.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNF --ranking rankList --rankFeats {input[2]}"

# Validate the rSNF model on the test matrix and write its MCC scores.
rule ml_rsnf_val:
    input:
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_RandomForest_rankList.log",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        # NOTE(review): `layers=` is unused by this pattern — harmless with a
        # single value, but it could be dropped for clarity.
        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_MCC_scores.txt",
    shell:
        "python sklearn_rf_validation_writeperf.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNF --tslab {input[2]}"
# Intersect the feature lists selected by the juxtaposed (KBest) and the
# rSNF (rankList) trainings; the intersection feeds the rSNFi models.
rule myintersect:
    input:
        expand("{outfolder}/{dataset}/{target}/{split_id}/juxt/{layers}_tr_RandomForest_KBest.log",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNF/{layers}_tr_RandomForest_rankList.log",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    output:
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_intersect_tr.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    shell:
        # NOTE(review): the original carried a stray '########' marker here —
        # verify the positional-argument order expected by intersect_biomarkers.py.
        "python intersect_biomarkers.py {input} {output}"

# Project the training matrix onto the intersected feature set, producing
# the reduced rSNFi training matrix.
rule extract:
    input:
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_tr.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_intersect_tr.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    output:
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    shell:
        "python extract_topfeats_onecol.py {input} {output}"
# Train a Random Forest (KBest ranking) on the reduced rSNFi training
# matrix produced by `rule extract`.
rule ml_rsnfi_tr:
    input:
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr.txt",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
        # NOTE(review): `layers=` is unused by this pattern — harmless with a
        # single value, but it could be dropped for clarity.
        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_RandomForest_KBest.log"
    shell:
        "python sklearn_rf_training_fixrank.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNFi --ranking KBest"

# Validate the rSNFi model on the full juxtaposed test matrix and write
# its MCC scores.
rule ml_rsnfi_val:
    input:
        expand("{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_RandomForest_KBest.log",
               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT)
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt"
    shell:
        "python sklearn_rf_validation_writeperf.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/rSNFi --tslab {input[2]}"
# Train a single-layer Random Forest (KBest ranking) — one instance of
# this rule runs per omic layer (wildcard {layer}).
rule single_tr:
    input:
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layer}_tr.txt"),
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_tr.txt")
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/single/{layer}_tr_RandomForest_KBest.log"
    shell:
        "python sklearn_rf_training_fixrank.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/single --ranking KBest"

# Validate each single-layer model on its test matrix and write per-layer
# MCC scores.
rule single_val:
    input:
        # input[0]: training log, input[1]: test data, input[2]: test labels
        "{outfolder}/{dataset}/{target}/{split_id}/single/{layer}_tr_RandomForest_KBest.log",
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/{layer}_ts.txt"),
        os.path.join(DATAFOLDER, "{dataset}/{target}/{split_id}/labels_{target}_ts.txt")
    output:
        "{outfolder}/{dataset}/{target}/{split_id}/single/{layer}_tr_MCC_scores.txt"
    shell:
        "python sklearn_rf_validation_writeperf.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.split_id}/single --tslab {input[2]}"
#%%
import pandas as pd
#%%
target = 'ER'
layer1 = 'gene'
layer2 = 'cnv'
layer3 = 'prot'
#%%
# For each of the 10 train/test splits, inner-join the pre-merged
# gene+cnv matrix with the proteomics matrix on the shared 'Sample'
# column, and write the three-layer concatenation next to its inputs.
for split_id in range(10):
    PATH = f'data/tcga_breast/{target}/{split_id}'
    for suffix in ('tr', 'ts'):
        two_layers = pd.read_csv(f'{PATH}/{layer1}_{layer2}_{suffix}.txt', sep='\t')
        third_layer = pd.read_csv(f'{PATH}/{layer3}_{suffix}.txt', sep='\t')
        merged = pd.merge(two_layers, third_layer, on='Sample')
        merged.to_csv(f'{PATH}/{layer1}_{layer2}_{layer3}_{suffix}.txt', sep='\t', index=False)
# %%
#%%
import argparse
from functools import reduce
from itertools import combinations
import pandas as pd
class myArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that reads '@file' argument files with '#' comments.

    Each line of an argument file is split on whitespace; a token that
    starts with '#' ends the line (the rest is treated as a comment).
    """

    def __init__(self, *args, **kwargs):
        super(myArgumentParser, self).__init__(*args, **kwargs)

    def convert_arg_line_to_args(self, line):
        """Yield the arguments found on *line*, stopping at a '#' comment."""
        # str.split() with no separator never yields empty or whitespace-only
        # tokens, so the original `if not arg.strip(): continue` guard was
        # dead code and has been removed.
        for arg in line.split():
            if arg[0] == '#':
                break
            yield arg
# Command-line interface: all arguments may also come from an '@file'
# (see myArgumentParser above); parse_args() reads sys.argv at import time.
parser = myArgumentParser(
    description='Concatenate omic layers files', fromfile_prefix_chars='@'
)
parser.add_argument('--datafolder', type=str, help='Main data folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
# Omic layer names, e.g. --layers gene cnv prot
parser.add_argument('--layers', type=str, nargs='+', help='')
# Number of train/test splits to process (one subfolder per split)
parser.add_argument('--n_splits', type=int, help='')
args = parser.parse_args()
#%%
DATAFOLDER = args.datafolder
DATASET = args.dataset
TARGET = args.target
LAYERS = args.layers
N_SPLITS = args.n_splits
print(LAYERS)
#%%
# For every split, build the inner-join concatenation of every combination
# of 2..len(LAYERS) omic layers, for both the training and test matrices.
for split_id in range(N_SPLITS):
    PATH = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_id}'
    # BUG FIX: the combination size k must be bounded by the number of
    # layers, not by the number of splits (the original iterated
    # range(2, N_SPLITS + 1), silently missing combinations whenever
    # N_SPLITS < len(LAYERS)).
    for k in range(2, len(LAYERS) + 1):
        for comb in combinations(LAYERS, k):
            single_dfs_tr = []
            single_dfs_ts = []
            for layer in comb:
                single_dfs_tr.append(pd.read_csv(f'{PATH}/{layer}_tr.txt', sep='\t'))
                single_dfs_ts.append(pd.read_csv(f'{PATH}/{layer}_ts.txt', sep='\t'))
            # Inner-join all layers of the combination on the shared 'Sample' id.
            merged_tr = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_tr)
            merged_ts = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_ts)
            layers_concat = '_'.join(comb)
            # BUG FIX: dropped the stray '__' filename prefix so the outputs
            # match the '{layers}_tr.txt' / '{layers}_ts.txt' paths the
            # Snakemake rules read from DATAFOLDER.
            merged_tr.to_csv(f'{PATH}/{layers_concat}_tr.txt', sep='\t', index=False)
            merged_ts.to_csv(f'{PATH}/{layers_concat}_ts.txt', sep='\t', index=False)
# %%
# Dataset / endpoint configuration. NOTE(review): DATAFOLDER, OUTFOLDER,
# THREADS and LAYER3 are referenced below but defined outside the visible
# diff context — confirm they are set earlier in this script.
DATASET=tcga_breast
LAYER1=gene
LAYER2=cnv
TARGET=subtypes
N_SPLITS=10
# go!
# Build the concatenated omic-layer matrices once, for all splits.
python preprocessing/concat_layers.py --datafolder $DATAFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 $LAYER3 --n_splits $N_SPLITS
# Run the Snakemake pipeline once per train/test split.
# BUG FIX: `for i in N_SPLITS` iterated over the literal string "N_SPLITS";
# use seq to iterate 0..N_SPLITS-1 as intended.
for i in $(seq 0 $((N_SPLITS - 1)))
do
    # BUG FIX: `-f` means --force in snakemake; `-s` selects the snakefile,
    # which is what is intended here.
    snakemake -s Snakefile_split --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 split_id=$i -p
done
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment