MPBA / INF / Commits

Commit fc51f697
authored Feb 24, 2020 by Alessia Marcolini

Handle new rSNFi method on tr, ts and ts2 splits

parent 7889bf5f

Changes 4
Snakefile_split
...
...
@@ -29,9 +29,6 @@ try:
except:
pass
output_labels_file = [f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_tr.txt",
f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_ts.txt"]
rule all:
    input:
...
...
@@ -39,20 +36,15 @@ rule all:
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/{layers}_tr_MCC_scores.txt",
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
-       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt",
+       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_MCC_scores.txt",
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/single/{layer}_tr_MCC_scores.txt",
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layer=LAYERS, split_id=SPLIT_ID)
rule create_labels_random:
    input:
        f"{DATAFOLDER}/{DATASET}/{old_target}/{SPLIT_ID}/labels_{old_target}_tr.txt"
    output:
-       output_labels_file,
-       expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
-           datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
-       expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
-           datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS)
+       f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_tr.txt",
+       f"{DATAFOLDER}/{DATASET}/{TARGET}/{SPLIT_ID}/labels_{TARGET}_ts.txt",
    run:
        if RANDOM:
            path = os.path.abspath(os.path.join(f'{DATAFOLDER}/{DATASET}/{old_target}/{SPLIT_ID}'))
...
...
@@ -62,33 +54,46 @@ rule create_labels_random:
            files_to_link = [f for f in os.listdir(path) if not f.startswith('labels')]
            for f in files_to_link:
-               subprocess.call(f'ln -s {path}/{f} {path_random}/{f}', shell=True)
+               if not os.path.exists(f'{path_random}/{f}'):
+                   subprocess.call(f'ln -s {path}/{f} {path_random}/{f}', shell=True)
-           subprocess.call(f'ln -s {path}/labels_{old_target}_ts.txt {path_random}/labels_{TARGET}_ts.txt', shell=True)
+           subprocess.call(f'ln -s {path}/labels_{old_target}_ts2.txt {path_random}/labels_{TARGET}_ts2.txt', shell=True)
-           labels_file = f'{path}/labels_{old_target}_tr.txt'
-           shuffled_labels_file = f'{path_random}/labels_{TARGET}_tr.txt'
+           labels_file_tr = f'{path}/labels_{old_target}_tr.txt'
+           shuffled_labels_file_tr = f'{path_random}/labels_{TARGET}_tr.txt'
+           labels_file_ts = f'{path}/labels_{old_target}_ts.txt'
+           shuffled_labels_file_ts = f'{path_random}/labels_{TARGET}_ts.txt'
            np.random.seed(0)
-           with open(labels_file, 'r') as f_old:
+           with open(labels_file_tr, 'r') as f_old:
                labels = np.array(f_old.readlines())
                np.random.shuffle(labels)
-           with open(shuffled_labels_file, 'w') as f_new:
+           with open(shuffled_labels_file_tr, 'w') as f_new:
                f_new.write(''.join(labels))
+           np.random.seed(0)
+           with open(labels_file_ts, 'r') as f_old:
+               labels = np.array(f_old.readlines())
+               np.random.shuffle(labels)
+           with open(shuffled_labels_file_ts, 'w') as f_new:
+               f_new.write(''.join(labels))
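The new tr and ts shuffling blocks are copies of each other apart from the file pair they touch. As a sketch only (the shuffle_labels helper below is hypothetical, not part of this commit), the pattern could be factored out like this:

    import numpy as np

    def shuffle_labels(src, dst, seed=0):
        # Hypothetical helper (not in the commit): read one label per line
        # from src, shuffle the lines with a fixed seed, write them to dst.
        np.random.seed(seed)
        with open(src, 'r') as f_old:
            labels = np.array(f_old.readlines())
        np.random.shuffle(labels)
        with open(dst, 'w') as f_new:
            f_new.write(''.join(labels))

    # Reproduces the run block above:
    # shuffle_labels(labels_file_tr, shuffled_labels_file_tr)
    # shuffle_labels(labels_file_ts, shuffled_labels_file_ts)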
rule concat_layers:
    input:
        rules.create_labels_random.output,
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS),
+       expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts2.txt",
+           datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS)
    output:
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_tr.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS_CONCAT),
        expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS_CONCAT),
+       expand("{datafolder}/{dataset}/{target}/{split_id}/{layer}_ts2.txt",
+           datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layer=LAYERS_CONCAT)
    shell:
        f"python preprocessing/concat_layers.py --datafolder {DATAFOLDER} --dataset {DATASET} --target {TARGET} --layers {LAYERS_SPACED} --split_id {SPLIT_ID}"
...
...
@@ -124,7 +129,7 @@ rule snf:
    threads: 8
    output:
        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNF/INF_{layers}_tr.txt",
-           outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
+           outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    run:
        all_input = [i[1] for i in input.allitems()]
        inputfiles = " ".join(all_input[:-1])
...
...
@@ -176,12 +181,12 @@ rule myintersect:
rule extract:
    input:
-       expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_tr.txt",
+       expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, layers=LAYERS_CONCAT, split_id=SPLIT_ID),
        expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_intersect_tr.txt",
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    output:
-       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr.txt",
+       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts.txt",
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, layers=LAYERS_CONCAT, split_id=SPLIT_ID)
    shell:
        "python extract_topfeats_onecol.py {input} {output}"
...
...
@@ -189,26 +194,26 @@ rule extract:
rule ml_rsnfi_tr:
    input:
-       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr.txt",
+       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts.txt",
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
-       expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_tr.txt",
+       expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
    output:
-       "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_{model}_KBest.log"
+       "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_{model}_KBest.log"
    shell:
        "python sklearn_training.py {input} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNFi --model {wildcards.model} --ranking KBest"

rule ml_rsnfi_val:
    input:
-       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_{model}_KBest.log",
+       expand("{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_{model}_KBest.log",
            outfolder=OUTFOLDER, dataset=DATASET, target=TARGET, model=MODEL, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
-       expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts.txt",
+       expand("{datafolder}/{dataset}/{target}/{split_id}/{layers}_ts2.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT),
-       expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts.txt",
+       expand("{datafolder}/{dataset}/{target}/{split_id}/labels_{target}_ts2.txt",
            datafolder=DATAFOLDER, dataset=DATASET, target=TARGET, split_id=SPLIT_ID, layers=LAYERS_CONCAT)
    output:
-       "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_tr_MCC_scores.txt"
+       "{outfolder}/{dataset}/{target}/{model}/{split_id}/rSNFi/{layers}_ts_MCC_scores.txt"
    shell:
        "python sklearn_validation.py {input[0]} {input[1]} {wildcards.outfolder}/{wildcards.dataset}/{wildcards.target}/{wildcards.model}/{wildcards.split_id}/rSNFi --tslab {input[2]}"
...
...
postprocessing/input_output.py (new file, mode 100644)
import numpy as np
import pandas as pd


def load_data(filename):
    df = pd.read_csv(filename, sep='\t', header=0, index_col=0)
    var_names = df.columns.tolist()
    sample_names = df.index.tolist()
    data = df.values.astype(dtype=np.float)
    return sample_names, var_names, data


def save_split(x, y, sample_names, var_names, basename):
    """
    x, y: output of train_test_split
    sample_names, var_names: lists with samples and feature names (will be the DataFrame row and column names)
    """
    x_df = pd.DataFrame(x, index=sample_names, columns=var_names)
    x_df.to_csv(f"{basename}.txt", sep='\t', index=True, header=True, index_label="sampleID")
    y_df = pd.DataFrame(y, index=sample_names, columns=['label'])
    y_df.to_csv(f"{basename}.lab", sep='\t', index=True, header=True, index_label="sampleID")
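For context: load_data returns the sample names, feature names, and a float matrix from a tab-separated file, and save_split writes a matrix and its labels back out under a sampleID index. A minimal round-trip sketch, with illustrative file names that are not taken from the repository (note also that np.float, used above, was deprecated in NumPy 1.20; newer NumPy requires float or np.float64):

    import numpy as np
    from sklearn.model_selection import train_test_split
    from postprocessing.input_output import load_data, save_split

    # Illustrative paths, not from the repository.
    sample_names, var_names, data = load_data('data/DEMO/OS/0/gene_tr.txt')
    labels = np.loadtxt('data/DEMO/OS/0/labels_OS_tr.txt', dtype=int)

    # Split indices rather than rows, so sample names can follow their rows.
    idx_tr, idx_ts = train_test_split(
        np.arange(data.shape[0]), test_size=0.2, random_state=0, stratify=labels
    )
    save_split(data[idx_tr], labels[idx_tr], [sample_names[i] for i in idx_tr],
               var_names, 'data/DEMO/OS/0/gene_tr_new')
    save_split(data[idx_ts], labels[idx_ts], [sample_names[i] for i in idx_ts],
               var_names, 'data/DEMO/OS/0/gene_ts2_new')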
preprocessing/concat_layers.py
...
...
@@ -49,16 +49,20 @@ for k in range(2, len(LAYERS) + 1):
    for comb in combinations(LAYERS, k):
        single_dfs_tr = []
        single_dfs_ts = []
+       single_dfs_ts2 = []
        for layer in comb:
            single_dfs_tr.append(pd.read_csv(f'{PATH}/{layer}_tr.txt', sep='\t'))
            single_dfs_ts.append(pd.read_csv(f'{PATH}/{layer}_ts.txt', sep='\t'))
+           single_dfs_ts2.append(pd.read_csv(f'{PATH}/{layer}_ts2.txt', sep='\t'))
        merged_tr = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_tr)
        merged_ts = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_ts)
+       merged_ts2 = reduce(lambda x, y: pd.merge(x, y, on='Sample'), single_dfs_ts2)
        layers_concat = '_'.join(comb)
        merged_tr.to_csv(f'{PATH}/{layers_concat}_tr.txt', sep='\t', index=False)
        merged_ts.to_csv(f'{PATH}/{layers_concat}_ts.txt', sep='\t', index=False)
+       merged_ts2.to_csv(f'{PATH}/{layers_concat}_ts2.txt', sep='\t', index=False)
# %%
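The reduce(lambda x, y: pd.merge(x, y, on='Sample'), ...) idiom folds the per-layer tables into one by repeatedly joining on the Sample column; since pd.merge defaults to an inner join, only samples present in every layer survive. A self-contained toy illustration (data invented for the example):

    from functools import reduce
    import pandas as pd

    gene = pd.DataFrame({'Sample': ['s1', 's2', 's3'], 'g1': [0.1, 0.2, 0.3]})
    meth = pd.DataFrame({'Sample': ['s1', 's2'], 'm1': [1.0, 2.0]})
    prot = pd.DataFrame({'Sample': ['s2', 's1'], 'p1': [5.0, 6.0]})

    merged = reduce(lambda x, y: pd.merge(x, y, on='Sample'), [gene, meth, prot])
    print(merged)  # only s1 and s2 remain: the inner join drops s3 (missing from meth/prot)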
resplitter.py (new file, mode 100644)
import argparse
import os
import subprocess
from functools import reduce

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

parser = argparse.ArgumentParser()
parser.add_argument('--datafolder', type=str, help='Data folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument('--n_splits_end', type=int, help='')
parser.add_argument('--n_splits_start', type=int, help='')
parser.add_argument('--split_offset', type=int, default=50, help='')
args = parser.parse_args()

#%%
DATAFOLDER = args.datafolder
DATASET = args.dataset
TARGET = args.target
N_SPLITS_START = args.n_splits_start
N_SPLITS_END = args.n_splits_end
SPLIT_OFFSET = args.split_offset

assert (SPLIT_OFFSET > N_SPLITS_END - N_SPLITS_START), 'New splits set must not overlap with old splits set'

########### calculate new tr ratio
tmp_y_tr = pd.read_csv(
    f'{DATAFOLDER}/{DATASET}/{TARGET}/{N_SPLITS_START}/labels_{TARGET}_tr.txt',
    sep='\t',
    header=None,
)
tmp_y_ts = pd.read_csv(
    f'{DATAFOLDER}/{DATASET}/{TARGET}/{N_SPLITS_START}/labels_{TARGET}_ts.txt',
    sep='\t',
    header=None,
)

total_dataset_len = tmp_y_tr.shape[0] + tmp_y_ts.shape[0]
ts2_dataset_len = round(0.2 * total_dataset_len)
ts2_ratio = round(ts2_dataset_len / tmp_y_tr.shape[0], 2)
###########

for split_old in range(N_SPLITS_START, N_SPLITS_END):
    split_new = split_old + SPLIT_OFFSET
    path_old = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_old}'
    path_new = f'{DATAFOLDER}/{DATASET}/{TARGET}/{split_new}'
    subprocess.call(f'rsync -rav {path_old}/ {path_new}', shell=True)

    labelsfile = f"labels_{TARGET}_tr.txt"
    print('resplitter')
    y_orig = pd.read_csv(f'{path_new}/{labelsfile}', header=None)

    for f in os.listdir(path_new):
        if f.endswith("_tr.txt"):
            if not f.startswith("labels") and not f.startswith("clin"):
                print(f)
                df = pd.read_csv(f'{path_new}/{f}', sep="\t", index_col=0)
                df_tr_new, df_ts2, y_tr_new, y_ts2 = train_test_split(
                    df, y_orig, test_size=ts2_ratio, random_state=0, stratify=y_orig
                )
                df_tr_new.to_csv(f'{path_new}/{f}', sep="\t")
                df_ts2.to_csv(f'{path_new}/{f.replace("_tr.txt", "_ts2.txt")}', sep="\t")

    y_tr_new.to_csv(
        f'{path_new}/labels_{TARGET}_tr.txt',
        sep="\t",
        index=False,
        header=False,
    )
    y_ts2.to_csv(
        f'{path_new}/labels_{TARGET}_ts2.txt',
        sep="\t",
        index=False,
        header=False,
    )
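To make the ratio arithmetic concrete: with 100 samples split 80 tr / 20 ts in the old layout, ts2_dataset_len = round(0.2 * 100) = 20 and ts2_ratio = round(20 / 80, 2) = 0.25, so a quarter of each old training split is carved out as the new ts2 set, leaving ts2 at roughly 20% of the whole dataset. A typical invocation might look like the following (argument values are illustrative only):

    python resplitter.py --datafolder data --dataset DEMO --target OS --n_splits_start 0 --n_splits_end 10 --split_offset 50

The assert guarantees the offset is large enough that the new split ids (old id + 50 here) cannot collide with any id in the old range.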