Commit fc51f697 authored by Alessia Marcolini

Handle new rSNFi method on tr, ts and ts2 splits

parent 7889bf5f
@@ -29,9 +29,6 @@ try:
 except:
     pass
-output_labels_file = [f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_tr.txt",
-                      f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_ts.txt"]

 rule all:
     input:
@@ -39,20 +36,15 @@ rule all:
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
         expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/{layers}_tr_MCC_scores.txt",
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
-        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt",
+        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_MCC_scores.txt",
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
         expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/single/{layer}_tr_MCC_scores.txt",
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layer=LAYERS, split_id=SPLIT_ID)

 rule create_labels_random:
     input:
         f"{DATAFOLDER}/{DATASET}/{old_target}/{SPLIT_ID}/labels_{old_target}_tr.txt"
     output:
-        output_labels_file,
-        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
-               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
-        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
-               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS)
+        f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_tr.txt",
+        f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_ts.txt",
     run:
         if RANDOM:
             path = os.path.abspath(os.path.join(f'{DATAFOLDER}/{DATASET}/{old_target}/{SPLIT_ID}'))
@@ -62,33 +54,46 @@ rule create_labels_random:
             files_to_link = [f for f in os.listdir(path) if not f.startswith('labels')]
             for f in files_to_link:
-                subprocess.call(f'ln -s {path}/{f} {path_random}/{f}', shell=True)
+                if not os.path.exists(f'{path_random}/{f}'):
+                    subprocess.call(f'ln -s {path}/{f} {path_random}/{f}', shell=True)
-            subprocess.call(f'ln -s {path}/labels_{old_target}_ts.txt {path_random}/labels_{TARGET}_ts.txt', shell=True)
+            subprocess.call(f'ln -s {path}/labels_{old_target}_ts2.txt {path_random}/labels_{TARGET}_ts2.txt', shell=True)
+            labels_file_tr = f'{path}/labels_{old_target}_tr.txt'
+            shuffled_labels_file_tr = f'{path_random}/labels_{TARGET}_tr.txt'
+            labels_file_ts = f'{path}/labels_{old_target}_ts.txt'
+            shuffled_labels_file_ts = f'{path_random}/labels_{TARGET}_ts.txt'
             np.random.seed(0)
+            with open(labels_file_tr, 'r') as f_old:
                 labels = np.array(f_old.readlines())
             np.random.shuffle(labels)
-            labels_file = f'{path}/labels_{old_target}_tr.txt'
-            shuffled_labels_file = f'{path_random}/labels_{TARGET}_tr.txt'
+            with open(shuffled_labels_file_tr, 'w') as f_new:
+                f_new.write(''.join(labels))
+            np.random.seed(0)
-            with open(labels_file, 'r') as f_old:
+            with open(labels_file_ts, 'r') as f_old:
+                labels = np.array(f_old.readlines())
+            np.random.shuffle(labels)
-            with open(shuffled_labels_file, 'w') as f_new:
+            with open(shuffled_labels_file_ts, 'w') as f_new:
                 f_new.write(''.join(labels))

 rule concat_layers:
     input:
         rules.create_labels_random.output,
         expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
         expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
+        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts2.txt",
+               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS)
     output:
         expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS_CONCAT),
         expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS_CONCAT),
+        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts2.txt",
+               datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS_CONCAT)
     shell:
         f"python preprocessing/concat_layers.py --datafolder {DATAFOLDER} --dataset {DATASET} --target {TARGET} --layers {LAYERS_SPACED} --split_id {SPLIT_ID}"
@@ -124,7 +129,7 @@ rule snf:
     threads: 8
     output:
         expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/INF_{layers}_tr.txt",
-            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
+               outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
     run:
         all_input = [i[1] for i in input.allitems()]
         inputfiles = " ".join(all_input[:-1])
@@ -176,12 +181,12 @@ rule myintersect:

 rule extract:
     input:
-        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_tr.txt",
+        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
         expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_intersect_tr.txt",
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
     output:
-        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr.txt",
+        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts.txt",
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
     shell:
         "python extract_topfeats_onecol.py {input} {output}"
@@ -189,26 +194,26 @@ rule extract:

 rule ml_rsnfi_tr:
     input:
-        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr.txt",
+        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts.txt",
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
-        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
+        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
     output:
-        "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_{model}_KBest.log"
+        "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_{model}_KBest.log"
     shell:
         "python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNFi --model {wildcards.model} --ranking KBest"

 rule ml_rsnfi_val:
     input:
-        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_{model}_KBest.log",
+        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_{model}_KBest.log",
                outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
-        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
+        expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts2.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
-        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
+        expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts2.txt",
                datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT)
     output:
-        "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt"
+        "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_MCC_scores.txt"
     shell:
         "python sklearn_validation.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNFi --tslab {input[2]}"
--- (separate file in this commit: data loading/saving helpers) ---
import numpy as np
import pandas as pd


def load_data(filename):
    df = pd.read_csv(filename, sep='\t', header=0, index_col=0)
    var_names = df.columns.tolist()
    sample_names = df.index.tolist()
    data = df.values.astype(dtype=float)  # np.float is deprecated; the builtin float gives float64
    return sample_names, var_names, data


def save_split(x, y, sample_names, var_names, basename):
    """
    x, y: output of train_test_split
    sample_names, var_names: lists with sample and feature names (will be the DataFrame row and column names)
    """
    x_df = pd.DataFrame(x, index=sample_names, columns=var_names)
    x_df.to_csv(f"{basename}.txt", sep='\t', index=True, header=True, index_label="sampleID")
    y_df = pd.DataFrame(y, index=sample_names, columns=['label'])
    y_df.to_csv(f"{basename}.lab", sep='\t', index=True, header=True, index_label="sampleID")
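A quick usage sketch for these helpers (file names hypothetical; assumes a tab-separated matrix with sample IDs in the first column and a matching label file):

from sklearn.model_selection import train_test_split

sample_names, var_names, data = load_data('gene_tr.txt')
labels = np.loadtxt('labels_ER_tr.txt', dtype=str)
x_tr, x_ts2, y_tr, y_ts2, names_tr, names_ts2 = train_test_split(
    data, labels, sample_names, test_size=0.25, random_state=0, stratify=labels)
save_split(x_tr, y_tr, names_tr, var_names, 'gene_tr_new')
save_split(x_ts2, y_ts2, names_ts2, var_names, 'gene_ts2')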
@@ -49,16 +49,20 @@ for k in range(2, len(LAYERS) + 1):
     for comb in combinations(LAYERS, k):
         single_dfs_tr = []
         single_dfs_ts = []
+        single_dfs_ts2 = []
         for layer in comb:
             single_dfs_tr.append(pd.read_csv(f'{PATH}/{layer}_tr.txt', sep='\t'))
             single_dfs_ts.append(pd.read_csv(f'{PATH}/{layer}_ts.txt', sep='\t'))
+            single_dfs_ts2.append(pd.read_csv(f'{PATH}/{layer}_ts2.txt', sep='\t'))
         merged_tr = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_tr)
         merged_ts = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_ts)
+        merged_ts2 = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_ts2)
         layers_concat = '_'.join(comb)
         merged_tr.to_csv(f'{PATH}/{layers_concat}_tr.txt', sep='\t', index=False)
         merged_ts.to_csv(f'{PATH}/{layers_concat}_ts.txt', sep='\t', index=False)
+        merged_ts2.to_csv(f'{PATH}/{layers_concat}_ts2.txt', sep='\t', index=False)
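Note that reduce chains pd.merge pairwise with its default inner join, so a concatenated table keeps only the samples present in every layer of the combination; a sample missing from any single layer is dropped. A toy example (frame contents hypothetical):

a = pd.DataFrame({'Sample': ['s1', 's2'], 'gene_1': [0.1, 0.2]})
b = pd.DataFrame({'Sample': ['s2', 's3'], 'meth_1': [1.0, 2.0]})
reduce(lambda x, y: pd.merge(x, y, on='Sample'), [a, b])
#   Sample  gene_1  meth_1
# 0     s2     0.2     1.0

The standalone script below (shown in full) generates the ts2 splits that these files are read from.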
# %%
import argparse
import os
import subprocess
from functools import reduce
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
parser = argparse.ArgumentParser()
parser.add_argument('--datafolder', type=str, help='Data folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--n_splits_end', type=int, help='Last old split id (exclusive)')
parser.add_argument('--n_splits_start', type=int, help='First old split id')
parser.add_argument('--split_offset', type=int, default=50, help='Offset added to an old split id to obtain the new split id')
args = parser.parse_args()
#%%
DATAFOLDER = args.datafolder
DATASET = args.dataset
TARGET = args.target
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
SPLIT_OFFSET = args.split_offset
assert (
    SPLIT_OFFSET > N_SPLITS_END - N_SPLITS_START
), 'New splits set must not overlap with old splits set'
########### calculate new tr ratio
tmp_y_tr = pd.read_csv(
    f'{DATAFOLDER}/{DATASET}/{TARGET}/{N_SPLITS_START}/labels_{TARGET}_tr.txt',
    sep='\t',
    header=None,
)
tmp_y_ts = pd.read_csv(
    f'{DATAFOLDER}/{DATASET}/{TARGET}/{N_SPLITS_START}/labels_{TARGET}_ts.txt',
    sep='\t',
    header=None,
)
total_dataset_len = tmp_y_tr.shape[0] + tmp_y_ts.shape[0]
ts2_dataset_len = round(0.2 * total_dataset_len)
ts2_ratio = round(ts2_dataset_len / tmp_y_tr.shape[0], 2)
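# Worked example (hypothetical sizes): with 80 tr and 20 ts samples,
# total_dataset_len = 100, ts2_dataset_len = round(0.2 * 100) = 20 and
# ts2_ratio = 20 / 80 = 0.25. train_test_split below then moves 20 of the
# 80 original training samples into ts2, leaving a 60/20/20 tr/ts2/ts split.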
###########
for split_old in range(N_SPLITS_START, N_SPLITS_END):
    split_new = split_old + SPLIT_OFFSET
    path_old = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_old}'
    path_new = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_new}'
    subprocess.call(f'rsync -rav {path_old}/ {path_new}', shell=True)

    labelsfile = f"labels_{TARGET}_tr.txt"
    print('resplitter')
    y_orig = pd.read_csv(f'{path_new}/{labelsfile}', header=None)

    for f in os.listdir(path_new):
        if f.endswith("_tr.txt"):
            if not f.startswith("labels") and not f.startswith("clin"):
                print(f)
                df = pd.read_csv(f'{path_new}/{f}', sep="\t", index_col=0)
                df_tr_new, df_ts2, y_tr_new, y_ts2 = train_test_split(
                    df, y_orig, test_size=ts2_ratio, random_state=0, stratify=y_orig
                )
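                # stratify=y_orig keeps the class proportions in both pieces, and
                # the fixed random_state gives every data file in this folder the
                # same sample partition, so the resplit layers stay aligned.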
                df_tr_new.to_csv(f'{path_new}/{f}', sep="\t")
                df_ts2.to_csv(
                    f'{path_new}/{f.replace("_tr.txt", "_ts2.txt")}', sep="\t"
                )
                y_tr_new.to_csv(
                    f'{path_new}/labels_{TARGET}_tr.txt',
                    sep="\t",
                    index=False,
                    header=False,
                )
                y_ts2.to_csv(
                    f'{path_new}/labels_{TARGET}_ts2.txt',
                    sep="\t",
                    index=False,
                    header=False,
                )
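# Example invocation (script name hypothetical):
#   python resplit_ts2.py --datafolder data --dataset breast --target ER \
#       --n_splits_start 0 --n_splits_end 10 --split_offset 50
# copies split folders 0..9 to 50..59, then carves a ts2 set out of each
# new folder's training data; the offset keeps old and new split ids disjoint.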