Commit 7889bf5f authored by Alessia Marcolini
Browse files

Random Labels in Snakefile

parent 0dd975f3
#%%
import os
import subprocess
import numpy as np
#%%
# these can be set at runtime:
# snakemake --config datafolder="mydata" outfolder="out" dataset="breast" model="RandomForest" target="ER" layer1="gene" layer2="cnv" layer3="prot" split_id="1"(...)
......@@ -17,7 +20,18 @@ LAYERS = [config[k] for k in config.keys() if k.startswith('layer')]
LAYERS_CONCAT = "_".join(LAYERS)
LAYERS_SPACED = " ".join(LAYERS)

# Remember the real target name: the random-label run reads its input labels
# from the original target's folder.
old_target = TARGET

# `random` is an optional config key (snakemake --config random=true).
# Using config.get() instead of a bare try/except means RANDOM is always
# defined and unrelated errors are no longer silently swallowed.
# NOTE(review): depending on how Snakemake parses --config values, the flag
# may arrive as the string 'true' or as an already-parsed bool; going through
# str().lower() accepts both -- confirm against the Snakemake version in use.
RANDOM = str(config.get('random', 'false')).lower() == 'true'
if RANDOM:
    # Random-label runs live under a separate "<target>_random" tree.
    TARGET = TARGET + '_random'

# Train/test label files for the (possibly "_random"-suffixed) target.
output_labels_file = [
    f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_tr.txt",
    f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_ts.txt",
]
rule all:
input:
......@@ -30,8 +44,43 @@ rule all:
expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/single/{layer}_tr_MCC_scores.txt",
outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layer=LAYERS, split_id=SPLIT_ID)
# Build the "<target>_random" data folder for one split: every data file of
# the original target is symlinked, the test labels are symlinked under the
# new name, and the training labels are written out in shuffled order.
# NOTE(review): when random labels are disabled the run block does nothing,
# although the outputs are still declared -- confirm that this rule is only
# ever scheduled for random-label runs.
rule create_labels_random:
    input:
        f"{DATAFOLDER}/{DATASET}/{old_target}/{SPLIT_ID}/labels_{old_target}_tr.txt"
    output:
        output_labels_file,
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS)
    run:
        # Read the flag from the config and compare explicitly: a plain
        # truthiness test would also fire for the string 'false', and would
        # raise NameError when `random` was not passed at all.
        if str(config.get('random', 'false')).lower() == 'true':
            # Absolute source path so the symlinks resolve from anywhere.
            path = os.path.abspath(f'{DATAFOLDER}/{DATASET}/{old_target}/{SPLIT_ID}')
            path_random = f'{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}'
            os.makedirs(path_random, exist_ok=True)

            # Link each data file but the labels, as those need to be shuffled;
            # the test labels are linked unchanged under the new target name.
            links = [(name, name) for name in os.listdir(path)
                     if not name.startswith('labels')]
            links.append((f'labels_{old_target}_ts.txt', f'labels_{TARGET}_ts.txt'))
            for src_name, dst_name in links:
                try:
                    # os.symlink avoids building a shell command from file
                    # names (spaces / metacharacters broke `ln -s`).
                    os.symlink(os.path.join(path, src_name),
                               os.path.join(path_random, dst_name))
                except FileExistsError:
                    # Keep the existing entry, as a failing `ln -s` did.
                    pass

            labels_file = f'{path}/labels_{old_target}_tr.txt'
            shuffled_labels_file = f'{path_random}/labels_{TARGET}_tr.txt'

            with open(labels_file, 'r') as f_old:
                # Make sure every label ends with a newline, otherwise a final
                # unterminated line would be glued to its neighbour once moved.
                labels = np.array([line if line.endswith('\n') else line + '\n'
                                   for line in f_old])

            # Fixed seed: the same permutation is produced on every run.
            np.random.seed(0)
            np.random.shuffle(labels)

            with open(shuffled_labels_file, 'w') as f_new:
                f_new.write(''.join(labels))
rule concat_layers:
input:
rules.create_labels_random.output,
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
......
......@@ -6,21 +6,27 @@ THREADS=12
OUTFOLDER=results
DATAFOLDER=data
DATASET=tcga_breast
LAYER1=cnv
LAYER2=prot
LAYER3=prot
LAYER1=gene
LAYER2=cnv
TARGET=ER
MODEL=LSVM
N_SPLITS_START=10
N_SPLITS_END=20
N_SPLITS_START=0
N_SPLITS_END=10
RANDOM_LABELS=true
# go!
for (( i=$N_SPLITS_START; i<$N_SPLITS_END; i++ ))
do
snakemake -s Snakefile_split --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 model=$MODEL split_id=$i -p
snakemake -s Snakefile_split --cores $THREADS --config datafolder=$DATAFOLDER outfolder=$OUTFOLDER dataset=$DATASET target=$TARGET layer1=$LAYER1 layer2=$LAYER2 model=$MODEL random=$RANDOM_LABELS split_id=$i -p
done
# Random-label runs are stored under "<target>_random", so the target name
# used from here on must carry the same suffix.
# The variable is quoted so that an empty/unset RANDOM_LABELS is a clean
# "false" instead of a `[` syntax error.
if [ "$RANDOM_LABELS" = true ]
then
    TARGET+='_random'
fi
for MODE in juxt rSNF rSNFi single
do
python postprocessing/compute_all_metrics.py --outfolder $OUTFOLDER --dataset $DATASET --target $TARGET --layers $LAYER1 $LAYER2 --model $MODEL --n_splits_end $N_SPLITS_END --n_splits_start $N_SPLITS_START --mode $MODE
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment