Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
MPBA
INF
Commits
0b30429c
Commit
0b30429c
authored
Mar 26, 2020
by
Alessia Marcolini
Browse files
Black formatting
parent
44b870ae
Changes
12
Hide whitespace changes
Inline
Side-by-side
extract_topfeats.py
View file @
0b30429c
...
...
@@ -7,12 +7,12 @@ from input_output import load_data
def
extract_feats
(
datafile
,
rankedfile
,
nfeat
,
outfile
):
# sample names, features names and table with features abundances
# sample names, features names and table with features abundances
samples
,
features
,
data_ab
=
load_data
(
datafile
)
# feats abundances (no names of samples, no header)
# data_ab = data_ab.astype(np.float)
rank
=
np
.
loadtxt
(
rankedfile
,
delimiter
=
'
\t
'
,
skiprows
=
1
,
dtype
=
str
)
rank
=
np
.
loadtxt
(
rankedfile
,
delimiter
=
'
\t
'
,
skiprows
=
1
,
dtype
=
str
)
# number of features in the list
nf_list
=
rank
.
shape
if
len
(
nf_list
)
>
1
:
...
...
@@ -20,9 +20,7 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
top_feats
=
feats
[
0
:
nfeat
]
else
:
top_feats
=
rank
[
1
]
#print top_feats.shape
# print top_feats.shape
# extract top features from table with abundances of all features
idx
=
[]
if
len
(
nf_list
)
==
1
:
...
...
@@ -35,16 +33,15 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
print
(
'###### MISSING %s ######'
%
top_feats
[
i
])
# considering samples names in the new table
sel_feats
=
[
features
[
i
]
for
i
in
idx
]
sel_feats
=
[
features
[
i
]
for
i
in
idx
]
# write new table
with
open
(
outfile
,
'w'
)
as
outw
:
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
# header
writer
.
writerow
([
'Samples'
]
+
sel_feats
)
writer
.
writerow
([
'Samples'
]
+
sel_feats
)
for
i
in
range
(
0
,
len
(
samples
)):
writer
.
writerow
([
samples
[
i
]]
+
data_ab
[
i
,
idx
].
tolist
())
writer
.
writerow
([
samples
[
i
]]
+
data_ab
[
i
,
idx
].
tolist
())
if
__name__
==
"__main__"
:
...
...
extract_topfeats_onecol.py
View file @
0b30429c
...
...
@@ -9,35 +9,35 @@ import sys
import
numpy
as
np
__author__
=
'Marco Chierici, Alessandro Zandona'
__date__
=
'15 December 2016'
__author__
=
'Marco Chierici, Alessandro Zandona'
__date__
=
'15 December 2016'
#### Extract features from a given dataset ####
def
extract_feats
(
datafile
,
rankedfile
,
outfile
):
#print locals()
# table with feats abundances
#
print locals()
# table with feats abundances
data
=
np
.
loadtxt
(
datafile
,
delimiter
=
'
\t
'
,
dtype
=
str
)
# feats abundances (no names of samples, no header)
data_ab
=
data
[
1
:,
1
:].
astype
(
np
.
float
)
data_ab
=
data
[
1
:,
1
:].
astype
(
np
.
float
)
rank
=
np
.
loadtxt
(
rankedfile
,
delimiter
=
'
\t
'
,
skiprows
=
1
,
dtype
=
str
)
# number of features in the list
nf_list
=
rank
.
shape
if
len
(
nf_list
)
>
1
:
feats
=
rank
[:,
0
]
top_feats
=
feats
#
[0:nfeat]
top_feats
=
feats
#
[0:nfeat]
else
:
top_feats
=
rank
# extract top features from table with abundances of all features
idx
=
[]
nfeat
=
len
(
top_feats
)
for
i
in
range
(
nfeat
):
if
top_feats
[
i
]
in
data
[
0
,:].
tolist
():
idx
.
append
(
data
[
0
,:].
tolist
().
index
(
top_feats
[
i
]))
if
top_feats
[
i
]
in
data
[
0
,
:].
tolist
():
idx
.
append
(
data
[
0
,
:].
tolist
().
index
(
top_feats
[
i
]))
else
:
print
(
top_feats
[
i
])
...
...
@@ -48,8 +48,8 @@ def extract_feats(datafile, rankedfile, outfile):
# write new table
with
open
(
outfile
,
'w'
)
as
outw
:
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
for
i
in
range
(
len
(
sel_feats
[:,
0
])):
writer
.
writerow
(
sel_feats
[
i
,:])
for
i
in
range
(
len
(
sel_feats
[:,
0
])):
writer
.
writerow
(
sel_feats
[
i
,
:])
if
__name__
==
"__main__"
:
...
...
input_output.py
View file @
0b30429c
import
numpy
as
np
import
pandas
as
pd
def
load_data
(
filename
):
df
=
pd
.
read_csv
(
filename
,
sep
=
'
\t
'
,
header
=
0
,
index_col
=
0
)
var_names
=
df
.
columns
.
tolist
()
...
...
@@ -8,14 +9,18 @@ def load_data(filename):
data
=
df
.
values
.
astype
(
dtype
=
np
.
float
)
return
sample_names
,
var_names
,
data
def save_split(x, y, sample_names, var_names, basename):
    """Save one train/test split to a tab-separated data file and label file.

    Parameters
    ----------
    x, y : array-like
        Output of ``train_test_split``: the feature matrix and the labels.
    sample_names, var_names : list
        Sample and feature names; become the DataFrame row and column names.
    basename : str
        Output path prefix: features go to ``{basename}.txt``,
        labels to ``{basename}.lab`` (both TSV with a "sampleID" index column).
    """
    # Feature table: rows = samples, columns = features.
    x_df = pd.DataFrame(x, index=sample_names, columns=var_names)
    x_df.to_csv(
        f"{basename}.txt",
        sep='\t',
        index=True,
        header=True,
        index_label="sampleID",
    )
    # Label table: one "label" column, same sample index.
    y_df = pd.DataFrame(y, index=sample_names, columns=['label'])
    y_df.to_csv(
        f"{basename}.lab",
        sep='\t',
        index=True,
        header=True,
        index_label="sampleID",
    )
intersect_biomarkers.py
View file @
0b30429c
...
...
@@ -18,21 +18,59 @@ import numpy as np
matplotlib
.
use
(
'Agg'
)
parser
=
argparse
.
ArgumentParser
(
description
=
'Find the intersection between feature lists and produce Venn diagrams.'
)
parser
.
add_argument
(
'CONFIGFILE1'
,
type
=
str
,
help
=
'Training experiment configuration file 1 (with info about number of top discriminant features)'
)
parser
.
add_argument
(
'CONFIGFILE2'
,
type
=
str
,
help
=
'Training experiment configuration file 2 (with info about number of top discriminant features)'
)
parser
.
add_argument
(
'OUTLIST'
,
type
=
str
,
help
=
'Output file for intersected feature list.'
)
parser
.
add_argument
(
'OUTFILE'
,
type
=
str
,
nargs
=
'?'
,
help
=
'Output file for Venn diagram plot.'
)
parser
.
add_argument
(
'--title1'
,
type
=
str
,
default
=
'List_1'
,
nargs
=
'?'
,
help
=
'Name for first diagram (default: %(default)s)'
)
parser
.
add_argument
(
'--title2'
,
type
=
str
,
default
=
'List_2'
,
nargs
=
'?'
,
help
=
'Name for second diagram (default: %(default)s)'
)
parser
.
add_argument
(
'--configFile3'
,
type
=
str
,
default
=
'NO'
,
nargs
=
'?'
,
help
=
'Third configuration file - optional (default: %(default)s)'
)
parser
.
add_argument
(
'--title3'
,
type
=
str
,
default
=
'List_3'
,
nargs
=
'?'
,
help
=
'Name for third diagram (default: %(default)s)'
)
__author__
=
'Alessandro Zandona'
__date__
=
'15 December 2016'
if
len
(
sys
.
argv
)
==
1
:
parser
=
argparse
.
ArgumentParser
(
description
=
'Find the intersection between feature lists and produce Venn diagrams.'
)
parser
.
add_argument
(
'CONFIGFILE1'
,
type
=
str
,
help
=
'Training experiment configuration file 1 (with info about number of top discriminant features)'
,
)
parser
.
add_argument
(
'CONFIGFILE2'
,
type
=
str
,
help
=
'Training experiment configuration file 2 (with info about number of top discriminant features)'
,
)
parser
.
add_argument
(
'OUTLIST'
,
type
=
str
,
help
=
'Output file for intersected feature list.'
)
parser
.
add_argument
(
'OUTFILE'
,
type
=
str
,
nargs
=
'?'
,
help
=
'Output file for Venn diagram plot.'
)
parser
.
add_argument
(
'--title1'
,
type
=
str
,
default
=
'List_1'
,
nargs
=
'?'
,
help
=
'Name for first diagram (default: %(default)s)'
,
)
parser
.
add_argument
(
'--title2'
,
type
=
str
,
default
=
'List_2'
,
nargs
=
'?'
,
help
=
'Name for second diagram (default: %(default)s)'
,
)
parser
.
add_argument
(
'--configFile3'
,
type
=
str
,
default
=
'NO'
,
nargs
=
'?'
,
help
=
'Third configuration file - optional (default: %(default)s)'
,
)
parser
.
add_argument
(
'--title3'
,
type
=
str
,
default
=
'List_3'
,
nargs
=
'?'
,
help
=
'Name for third diagram (default: %(default)s)'
,
)
__author__
=
'Alessandro Zandona'
__date__
=
'15 December 2016'
if
len
(
sys
.
argv
)
==
1
:
parser
.
print_help
()
sys
.
exit
(
1
)
...
...
@@ -77,72 +115,79 @@ feats2 = fl_2[:NFEATS, 1]
# Convert lists into sets
feats2_set
=
set
(
feats2
)
if
(
configfile3
!=
'NO'
)
:
config
.
read
(
configfile3
)
if
not
config
.
has_section
(
'INPUT'
):
print
(
"%s is not a valid configuration file."
%
CONFIGFILE2
)
sys
.
exit
(
3
)
if
configfile3
!=
'NO'
:
config
.
read
(
configfile3
)
if
not
config
.
has_section
(
'INPUT'
):
print
(
"%s is not a valid configuration file."
%
CONFIGFILE2
)
sys
.
exit
(
3
)
RANK
=
config
.
get
(
"OUTPUT"
,
"Borda"
)
NFEATS
=
config
.
getint
(
"OUTPUT"
,
"N_feats"
)
RANK
=
config
.
get
(
"OUTPUT"
,
"Borda"
)
NFEATS
=
config
.
getint
(
"OUTPUT"
,
"N_feats"
)
# Feature lists
fl_3
=
np
.
loadtxt
(
RANK
,
dtype
=
str
,
delimiter
=
'
\t
'
,
skiprows
=
1
)
# Features name
feats3
=
fl_3
[:
NFEATS
,
1
]
# Convert lists into sets
feats3_set
=
set
(
feats3
)
# Feature lists
fl_3
=
np
.
loadtxt
(
RANK
,
dtype
=
str
,
delimiter
=
'
\t
'
,
skiprows
=
1
)
# Features name
feats3
=
fl_3
[:
NFEATS
,
1
]
# Convert lists into sets
feats3_set
=
set
(
feats3
)
# Intersection between lists
f1f2
=
feats1_set
.
intersection
(
feats2_set
)
if
(
configfile3
!=
'NO'
)
:
f1f3
=
feats1_set
.
intersection
(
feats3_set
)
f2f3
=
feats2_set
.
intersection
(
feats3_set
)
if
configfile3
!=
'NO'
:
f1f3
=
feats1_set
.
intersection
(
feats3_set
)
f2f3
=
feats2_set
.
intersection
(
feats3_set
)
# associate to each common feature the position in each lists
#outFile_f1f2=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title2))
#outw=open(outFile_f1f2, 'w')
#
outFile_f1f2=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title2))
#
outw=open(outFile_f1f2, 'w')
with
open
(
OUTLIST
,
'w'
)
as
outw
:
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
writer
.
writerow
([
'Feature'
,
'Position in %s'
%
title1
,
'Postition in %s'
%
title2
])
for
i
in
range
(
len
(
list
(
f1f2
))):
# current feature in intersection
interF
=
list
(
f1f2
)[
i
]
# position of current feature in first list
idx_list1
=
np
.
where
(
feats1
==
interF
)[
0
][
0
]
# position of current feature in second list
idx_list2
=
np
.
where
(
feats2
==
interF
)[
0
][
0
]
writer
.
writerow
([
list
(
f1f2
)[
i
],
idx_list1
+
1
,
idx_list2
+
1
])
if
(
configfile3
!=
'NO'
):
# associate to each common feature the position in each lists
outFile_f1f3
=
os
.
path
.
join
(
os
.
path
.
dirname
(
OUTFILE
),
'Intersection_%s_%s.txt'
%
(
title1
,
title3
))
with
open
(
outFile_f1f3
,
'w'
)
as
outw
:
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
writer
.
writerow
([
'Feature'
,
'Position in %s '
%
title1
,
'Postition in %s '
%
title3
])
for
i
in
range
(
len
(
list
(
f1f3
))):
# current feature in intersection
interF
=
list
(
f1f3
)[
i
]
# position of current feature in first list
idx_list1
=
np
.
where
(
feats1
==
interF
)[
0
][
0
]
# position of current feature in second list
idx_list3
=
np
.
where
(
feats3
==
interF
)[
0
][
0
]
writer
.
writerow
([
list
(
f1f3
)[
i
],
idx_list1
+
1
,
idx_list3
+
1
])
outFile_f2f3
=
os
.
path
.
join
(
os
.
path
.
dirname
(
OUTFILE
),
'Intersection_%s_%s.txt'
%
(
title2
,
title3
))
with
open
(
outFile_f2f3
,
'w'
)
as
outw
:
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
writer
.
writerow
([
'Feature'
,
'Position in %s '
%
title2
,
'Postition in %s '
%
title3
])
for
i
in
range
(
len
(
list
(
f2f3
))):
# current feature in intersection
interF
=
list
(
f2f3
)[
i
]
# position of current feature in first list
idx_list2
=
np
.
where
(
feats2
==
interF
)[
0
][
0
]
# position of current feature in second list
idx_list3
=
np
.
where
(
feats3
==
interF
)[
0
][
0
]
writer
.
writerow
([
list
(
f2f3
)[
i
],
idx_list2
+
1
,
idx_list3
+
1
])
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
writer
.
writerow
([
'Feature'
,
'Position in %s'
%
title1
,
'Postition in %s'
%
title2
])
for
i
in
range
(
len
(
list
(
f1f2
))):
# current feature in intersection
interF
=
list
(
f1f2
)[
i
]
# position of current feature in first list
idx_list1
=
np
.
where
(
feats1
==
interF
)[
0
][
0
]
# position of current feature in second list
idx_list2
=
np
.
where
(
feats2
==
interF
)[
0
][
0
]
writer
.
writerow
([
list
(
f1f2
)[
i
],
idx_list1
+
1
,
idx_list2
+
1
])
if
configfile3
!=
'NO'
:
# associate to each common feature the position in each lists
outFile_f1f3
=
os
.
path
.
join
(
os
.
path
.
dirname
(
OUTFILE
),
'Intersection_%s_%s.txt'
%
(
title1
,
title3
)
)
with
open
(
outFile_f1f3
,
'w'
)
as
outw
:
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
writer
.
writerow
(
[
'Feature'
,
'Position in %s '
%
title1
,
'Postition in %s '
%
title3
]
)
for
i
in
range
(
len
(
list
(
f1f3
))):
# current feature in intersection
interF
=
list
(
f1f3
)[
i
]
# position of current feature in first list
idx_list1
=
np
.
where
(
feats1
==
interF
)[
0
][
0
]
# position of current feature in second list
idx_list3
=
np
.
where
(
feats3
==
interF
)[
0
][
0
]
writer
.
writerow
([
list
(
f1f3
)[
i
],
idx_list1
+
1
,
idx_list3
+
1
])
outFile_f2f3
=
os
.
path
.
join
(
os
.
path
.
dirname
(
OUTFILE
),
'Intersection_%s_%s.txt'
%
(
title2
,
title3
)
)
with
open
(
outFile_f2f3
,
'w'
)
as
outw
:
writer
=
csv
.
writer
(
outw
,
delimiter
=
'
\t
'
,
lineterminator
=
'
\n
'
)
writer
.
writerow
(
[
'Feature'
,
'Position in %s '
%
title2
,
'Postition in %s '
%
title3
]
)
for
i
in
range
(
len
(
list
(
f2f3
))):
# current feature in intersection
interF
=
list
(
f2f3
)[
i
]
# position of current feature in first list
idx_list2
=
np
.
where
(
feats2
==
interF
)[
0
][
0
]
# position of current feature in second list
idx_list3
=
np
.
where
(
feats3
==
interF
)[
0
][
0
]
writer
.
writerow
([
list
(
f2f3
)[
i
],
idx_list2
+
1
,
idx_list3
+
1
])
# # plot Venn diagrams
...
...
list_distances.py
View file @
0b30429c
import
argparse
import
pandas
as
pd
import
numpy
as
np
from
mlpy
import
canberra_stability
from
itertools
import
combinations
from
pathlib
import
Path
import
numpy
as
np
import
pandas
as
pd
from
mlpy
import
canberra_stability
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--resultsdir'
,
type
=
str
,
help
=
'Results folder'
)
parser
.
add_argument
(
'--dataset'
,
type
=
str
,
help
=
'Dataset name'
)
parser
.
add_argument
(
'--target'
,
type
=
str
,
help
=
'Clinical endpoint'
)
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'randomForest'
,
help
=
'Model (default: %(default)s)'
)
parser
.
add_argument
(
'--nf_min'
,
type
=
int
,
default
=
10
,
help
=
'Min #feat (default: %(default)s)'
)
parser
.
add_argument
(
'--nf_max'
,
type
=
int
,
default
=
50
,
help
=
'Max #feat (default: %(default)s)'
)
parser
.
add_argument
(
'--nf_step'
,
type
=
int
,
default
=
10
,
help
=
'Increase by these many feat (default: %(default)s)'
)
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'randomForest'
,
help
=
'Model (default: %(default)s)'
)
parser
.
add_argument
(
'--nf_min'
,
type
=
int
,
default
=
10
,
help
=
'Min #feat (default: %(default)s)'
)
parser
.
add_argument
(
'--nf_max'
,
type
=
int
,
default
=
50
,
help
=
'Max #feat (default: %(default)s)'
)
parser
.
add_argument
(
'--nf_step'
,
type
=
int
,
default
=
10
,
help
=
'Increase by these many feat (default: %(default)s)'
,
)
parser
.
add_argument
(
'--nf_rsnf'
,
type
=
int
,
nargs
=
'+'
,
help
=
'One or more #feat for rSNF'
)
parser
.
add_argument
(
'--layers'
,
type
=
str
,
nargs
=
'+'
,
help
=
''
)
args
=
parser
.
parse_args
()
RESULTSDIR
=
args
.
resultsdir
# top-level results directory
DATASET
=
args
.
dataset
# 'tcga_breast'
TARGET
=
args
.
target
# 'ER'
RESULTSDIR
=
args
.
resultsdir
# top-level results directory
DATASET
=
args
.
dataset
# 'tcga_breast'
TARGET
=
args
.
target
# 'ER'
MODEL
=
args
.
model
NF_MIN
=
args
.
nf_min
NF_MAX
=
args
.
nf_max
...
...
@@ -29,15 +41,15 @@ LAYERS = args.layers
N_LAYERS
=
len
(
LAYERS
)
MODE
=
'rSNF'
assert
(
assert
(
Path
(
RESULTSDIR
,
DATASET
).
expanduser
().
exists
()
),
f
"
{
RESULTSDIR
}
/
{
DATASET
}
not found"
assert
(
assert
(
Path
(
RESULTSDIR
,
f
"
{
DATASET
}
_SNFdap"
).
expanduser
().
exists
()
),
f
"
{
RESULTSDIR
}
/
{
DATASET
}
_SNFdap not found"
for
k
in
range
(
2
,
N_LAYERS
+
1
):
for
k
in
range
(
2
,
N_LAYERS
+
1
):
for
comb
in
combinations
(
LAYERS
,
k
):
layers_concat
=
'_'
.
join
(
comb
)
bordas
=
[]
...
...
@@ -45,8 +57,17 @@ for k in range(2, N_LAYERS+1):
bordaf
=
f
'
{
RESULTSDIR
}
/
{
datatype
}
/
{
TARGET
}
/
{
MODEL
}
/Borda_splits_50-60_
{
MODE
}
_
{
layers_concat
}
.txt'
bordas
.
append
(
pd
.
read_csv
(
bordaf
,
sep
=
'
\t
'
,
index_col
=
None
))
# prepare ranks for canberra_stability
ranks
=
pd
.
concat
([
np
.
argsort
(
bordas
[
0
][
'FEATURE_ID'
]),
np
.
argsort
(
bordas
[
1
][
'FEATURE_ID'
])],
axis
=
1
).
transpose
().
values
ranks
=
(
pd
.
concat
(
[
np
.
argsort
(
bordas
[
0
][
'FEATURE_ID'
]),
np
.
argsort
(
bordas
[
1
][
'FEATURE_ID'
]),
],
axis
=
1
,
)
.
transpose
()
.
values
)
for
nf
in
np
.
arange
(
NF_MIN
,
NF_MAX
+
NF_STEP
,
NF_STEP
):
cs
=
canberra_stability
(
ranks
,
nf
)
print
(
f
'
{
MODE
}
-
{
layers_concat
}
- stability(
{
nf
}
) =
{
cs
:.
3
f
}
'
)
...
...
performance.py
View file @
0b30429c
from
__future__
import
division
import
numpy
as
np
__author__
=
'Davide Albanese'
def error(ya, yp):
    """Return the misclassification rate (0-1 loss).

    Parameters
    ----------
    ya, yp : array-like
        Actual and predicted labels; must have the same length.

    Returns
    -------
    float
        Fraction of positions where ``ya`` and ``yp`` disagree.

    Raises
    ------
    ValueError
        If ``ya`` and ``yp`` differ in length.
    """
    ya_arr, yp_arr = np.asarray(ya), np.asarray(yp)
    if ya_arr.shape[0] != yp_arr.shape[0]:
        raise ValueError("ya, yp: shape mismatch")
    # Elementwise comparison; True counts as 1 in the sum.
    return np.sum(ya_arr != yp_arr) / ya_arr.shape[0]
...
...
@@ -16,7 +18,7 @@ def error(ya, yp):
def
accuracy
(
ya
,
yp
):
"""
"""
ya_arr
,
yp_arr
=
np
.
asarray
(
ya
),
np
.
asarray
(
yp
)
if
ya_arr
.
shape
[
0
]
!=
yp_arr
.
shape
[
0
]:
raise
ValueError
(
"ya, yp: shape mismatch"
)
...
...
@@ -29,10 +31,10 @@ def confusion_matrix(ya, yp, classes=None):
"""
if
classes
is
None
:
classes
=
np
.
unique
(
np
.
concatenate
((
ya
,
yp
)))
classes
=
np
.
unique
(
np
.
concatenate
((
ya
,
yp
)))
else
:
classes
=
np
.
asarray
(
classes
,
dtype
=
np
.
int
)
k
=
classes
.
shape
[
0
]
cm
=
np
.
zeros
((
k
,
k
),
dtype
=
np
.
int
)
...
...
@@ -51,10 +53,12 @@ def confusion_matrix_binary(ya, yp):
classes
=
np
.
unique
(
np
.
concatenate
((
ya
,
yp
)))
if
classes
.
shape
[
0
]
!=
2
:
raise
ValueError
(
"Binary confusion matrix is defined for binary classification only"
)
raise
ValueError
(
"Binary confusion matrix is defined for binary classification only"
)
cm
,
_
=
confusion_matrix
(
ya
,
yp
,
classes
=
classes
)
return
cm
[
0
,
0
],
cm
[
0
,
1
],
cm
[
1
,
0
],
cm
[
1
,
1
]
...
...
@@ -62,7 +66,7 @@ def sensitivity(ya, yp):
""" or true positive rate, hit rate, recall
TP / P = TP / (TP + FN)
"""
TN
,
FP
,
FN
,
TP
=
confusion_matrix_binary
(
ya
,
yp
)
if
TP
==
0.0
:
return
0.0
...
...
@@ -86,7 +90,7 @@ def fpr(ya, yp):
"""false positive rate or fall-out
FP / N = FP / (FP + TN)
"""
TN
,
FP
,
FN
,
TP
=
confusion_matrix_binary
(
ya
,
yp
)
if
FP
==
0.0
:
return
0.0
...
...
@@ -122,7 +126,7 @@ def fdr(ya, yp):
"""false discovery rate
FP / (FP+TP)
"""
TN
,
FP
,
FN
,
TP
=
confusion_matrix_binary
(
ya
,
yp
)
if
FP
==
0.0
:
return
0.0
...
...
@@ -145,8 +149,8 @@ def auc_wmw(ya, yp):
classes
=
np
.
unique
(
ya_arr
)
if
classes
.
shape
[
0
]
!=
2
:
raise
ValueError
(
"AUC is defined for binary classification only"
)
bn
=
(
ya_arr
==
classes
[
0
]
)
bp
=
(
ya_arr
==
classes
[
1
]
)
bn
=
ya_arr
==
classes
[
0
]
bp
=
ya_arr
==
classes
[
1
]
auc
=
0.0
for
i
in
yp
[
bp
]:
for
j
in
yp
[
bn
]:
...
...
@@ -154,21 +158,18 @@ def auc_wmw(ya, yp):
auc
+=
1.0
return
auc
/
(
np
.
sum
(
bn
)
*
np
.
sum
(
bp
))
##### KCCC
##### KCCC
def
_expand
(
x
,
y
):
K
=
np
.
unique
(
np
.
concatenate
((
x
,
y
)))
X
=
np
.
zeros
((
x
.
shape
[
0
],
K
.
shape
[
0
]),
dtype
=
np
.
int
)
Y
=
np
.
zeros
((
y
.
shape
[
0
],
K
.
shape
[
0
]),
dtype
=
np
.
int
)
for
i
,
k
in
enumerate
(
K
):
X
[
x
==
k
,
i
]
=
1
Y
[
y
==
k
,
i
]
=
1
X
[
x
==
k
,
i
]
=
1
Y
[
y
==
k
,
i
]
=
1
return
X
,
Y
def
KCCC
(
x
,
y
):
""" K-category correlation coefficient.
...
...
@@ -176,7 +177,7 @@ def KCCC(x, y):
EPS
=
np
.
finfo
(
np
.
float
).
eps
k
=
x
.
shape
[
1
]
xn
=
x
-
np
.
mean
(
x
,
axis
=
0
)
yn
=
y
-
np
.
mean
(
y
,
axis
=
0
)
cov_xy
=
np
.
sum
(
xn
*
yn
)
/
k
...
...
@@ -188,7 +189,7 @@ def KCCC(x, y):
rk
=
cov_xy
/
np
.
sqrt
(
cov_xx
*
cov_yy
)
else
:
rk
=
0.0
return
rk
...
...
@@ -196,7 +197,9 @@ def KCCC_discrete(x, y):
X
,
Y
=
_expand
(
x
,
y
)
return
KCCC
(
X
,
Y
)
##### end KCCC
##### end KCCC
def
dor
(
ya
,
yp
):
"""Diagnostic Odds Ratio
...
...
@@ -204,5 +207,3 @@ def dor(ya, yp):
TN
,
FP
,
FN
,
TP
=
confusion_matrix_binary
(
ya
,
yp
)
return
(
TP
/
FN
)
/
(
FP
/
TN
)
postprocessing/input_output.py
View file @
0b30429c
import
numpy
as
np
import
pandas
as
pd
def
load_data
(
filename
):
df
=
pd
.
read_csv
(
filename
,
sep
=
'
\t
'
,
header
=
0
,
index_col
=
0
)
var_names
=
df
.
columns
.
tolist
()
...
...
@@ -8,14 +9,18 @@ def load_data(filename):
data
=
df
.
values
.
astype
(
dtype
=
np
.
float
)
return
sample_names
,
var_names
,
data
def
save_split
(
x
,
y
,
sample_names
,
var_names
,
basename
):
"""
x, y: output of train_test_split
sample_names var_names: lists with samples and feature names (will be the DataFrame row and column names)
"""
x_df
=
pd
.
DataFrame
(
x
,
index
=
sample_names
,
columns
=
var_names
)
x_df
.
to_csv
(
f
"
{
basename
}
.txt"
,
sep
=
'
\t
'
,
index
=
True
,
header
=
True
,
index_label
=
"sampleID"
)
x_df
.
to_csv
(
f
"
{
basename
}
.txt"
,
sep
=
'
\t
'
,
index
=
True
,
header
=
True
,
index_label
=
"sampleID"
)
y_df
=
pd
.
DataFrame
(
y
,
index
=
sample_names
,
columns
=
[
'label'
])
y_df
.
to_csv
(
f
"
{
basename
}
.lab"
,
sep
=
'
\t
'
,
index
=
True
,
header
=
True