Commit 0b30429c authored by Alessia Marcolini

Black formatting

parent 44b870ae
@@ -7,12 +7,12 @@ from input_output import load_data
def extract_feats(datafile, rankedfile, nfeat, outfile):
    # sample names, features names and table with features abundances
samples, features, data_ab = load_data(datafile)
# feats abundances (no names of samples, no header)
# data_ab = data_ab.astype(np.float)
-    rank = np.loadtxt(rankedfile, delimiter = '\t', skiprows = 1, dtype = str)
+    rank = np.loadtxt(rankedfile, delimiter='\t', skiprows=1, dtype=str)
# number of features in the list
nf_list = rank.shape
if len(nf_list) > 1:
@@ -20,9 +20,7 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
top_feats = feats[0:nfeat]
else:
top_feats = rank[1]
-    #print top_feats.shape
+    # print top_feats.shape
# extract top features from table with abundances of all features
idx = []
if len(nf_list) == 1:
@@ -35,16 +33,15 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
print('###### MISSING %s ######' % top_feats[i])
# considering samples names in the new table
-    sel_feats=[features[i] for i in idx]
+    sel_feats = [features[i] for i in idx]
# write new table
with open(outfile, 'w') as outw:
-        writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n')
+        writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
# header
-        writer.writerow(['Samples']+sel_feats)
+        writer.writerow(['Samples'] + sel_feats)
for i in range(0, len(samples)):
-            writer.writerow([samples[i]]+data_ab[i,idx].tolist())
+            writer.writerow([samples[i]] + data_ab[i, idx].tolist())
if __name__ == "__main__":
......
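Note: beyond the Black reformatting, this first `extract_feats` variant reads a samples-by-features abundance table (via `load_data`) plus a ranked feature list, and writes a TSV restricted to the top `nfeat` features. A minimal usage sketch; the file names are hypothetical, not from this commit:

    # hypothetical call: keep the 50 top-ranked features
    extract_feats(
        datafile='abundances.txt',       # samples x features TSV (hypothetical name)
        rankedfile='borda_ranking.txt',  # ranked list with one header row (hypothetical name)
        nfeat=50,
        outfile='abundances_top50.txt',  # reduced table written here
    )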
@@ -9,35 +9,35 @@ import sys
import numpy as np
__author__ = 'Marco Chierici, Alessandro Zandona'
__date__ = '15 December 2016'
#### Extract features from a given dataset ####
def extract_feats(datafile, rankedfile, outfile):
-    #print locals()
-    # table with feats abundances
+    # print locals()
+    # table with feats abundances
data = np.loadtxt(datafile, delimiter='\t', dtype=str)
# feats abundances (no names of samples, no header)
-    data_ab = data[1:,1:].astype(np.float)
+    data_ab = data[1:, 1:].astype(np.float)
rank = np.loadtxt(rankedfile, delimiter='\t', skiprows=1, dtype=str)
# number of features in the list
nf_list = rank.shape
if len(nf_list) > 1:
feats = rank[:, 0]
-        top_feats = feats #[0:nfeat]
+        top_feats = feats  # [0:nfeat]
else:
top_feats = rank
# extract top features from table with abundances of all features
idx = []
nfeat = len(top_feats)
for i in range(nfeat):
-        if top_feats[i] in data[0,:].tolist():
-            idx.append(data[0,:].tolist().index(top_feats[i]))
+        if top_feats[i] in data[0, :].tolist():
+            idx.append(data[0, :].tolist().index(top_feats[i]))
else:
print(top_feats[i])
@@ -48,8 +48,8 @@ def extract_feats(datafile, rankedfile, outfile):
# write new table
with open(outfile, 'w') as outw:
writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
-        for i in range(len(sel_feats[:,0])):
-            writer.writerow(sel_feats[i,:])
+        for i in range(len(sel_feats[:, 0])):
+            writer.writerow(sel_feats[i, :])
if __name__ == "__main__":
......
import numpy as np
import pandas as pd
def load_data(filename):
df = pd.read_csv(filename, sep='\t', header=0, index_col=0)
var_names = df.columns.tolist()
@@ -8,14 +9,18 @@ def load_data(filename):
data = df.values.astype(dtype=np.float)
return sample_names, var_names, data
def save_split(x, y, sample_names, var_names, basename):
"""
x, y: output of train_test_split
sample_names var_names: lists with samples and feature names (will be the DataFrame row and column names)
"""
x_df = pd.DataFrame(x, index=sample_names, columns=var_names)
x_df.to_csv(f"{basename}.txt", sep='\t', index=True, header=True, index_label="sampleID")
x_df.to_csv(
f"{basename}.txt", sep='\t', index=True, header=True, index_label="sampleID"
)
y_df = pd.DataFrame(y, index=sample_names, columns=['label'])
y_df.to_csv(f"{basename}.lab", sep='\t', index=True, header=True, index_label="sampleID")
y_df.to_csv(
f"{basename}.lab", sep='\t', index=True, header=True, index_label="sampleID"
)
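Note: `save_split` expects `x, y` from `train_test_split`, per its docstring. A round-trip sketch, assuming the `load_data`/`save_split` helpers above are importable, with placeholder labels and a hypothetical input file:

    import numpy as np
    from sklearn.model_selection import train_test_split

    samples, variables, data = load_data('dataset.txt')   # hypothetical input TSV
    labels = np.random.randint(0, 2, size=len(samples))   # placeholder binary labels
    x_tr, x_ts, y_tr, y_ts, s_tr, s_ts = train_test_split(
        data, labels, samples, test_size=0.2, random_state=0
    )
    save_split(x_tr, y_tr, s_tr, variables, 'dataset_tr')  # writes dataset_tr.txt + dataset_tr.lab
    save_split(x_ts, y_ts, s_ts, variables, 'dataset_ts')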
@@ -18,21 +18,59 @@ import numpy as np
matplotlib.use('Agg')
-parser = argparse.ArgumentParser(description='Find the intersection between feature lists and produce Venn diagrams.')
-parser.add_argument('CONFIGFILE1', type=str, help='Training experiment configuration file 1 (with info about number of top discriminant features)')
-parser.add_argument('CONFIGFILE2', type=str, help='Training experiment configuration file 2 (with info about number of top discriminant features)')
-parser.add_argument('OUTLIST', type=str, help='Output file for intersected feature list.')
-parser.add_argument('OUTFILE', type=str, nargs='?', help='Output file for Venn diagram plot.')
-parser.add_argument('--title1', type=str, default='List_1', nargs='?', help='Name for first diagram (default: %(default)s)')
-parser.add_argument('--title2', type=str, default='List_2', nargs='?', help='Name for second diagram (default: %(default)s)')
-parser.add_argument('--configFile3', type=str, default='NO', nargs='?', help='Third configuration file - optional (default: %(default)s)')
-parser.add_argument('--title3', type=str, default='List_3', nargs='?', help='Name for third diagram (default: %(default)s)')
-__author__ = 'Alessandro Zandona'
-__date__ = '15 December 2016'
-if len(sys.argv)==1:
+parser = argparse.ArgumentParser(
+    description='Find the intersection between feature lists and produce Venn diagrams.'
+)
+parser.add_argument(
+    'CONFIGFILE1',
+    type=str,
+    help='Training experiment configuration file 1 (with info about number of top discriminant features)',
+)
+parser.add_argument(
+    'CONFIGFILE2',
+    type=str,
+    help='Training experiment configuration file 2 (with info about number of top discriminant features)',
+)
+parser.add_argument(
+    'OUTLIST', type=str, help='Output file for intersected feature list.'
+)
+parser.add_argument(
+    'OUTFILE', type=str, nargs='?', help='Output file for Venn diagram plot.'
+)
+parser.add_argument(
+    '--title1',
+    type=str,
+    default='List_1',
+    nargs='?',
+    help='Name for first diagram (default: %(default)s)',
+)
+parser.add_argument(
+    '--title2',
+    type=str,
+    default='List_2',
+    nargs='?',
+    help='Name for second diagram (default: %(default)s)',
+)
+parser.add_argument(
+    '--configFile3',
+    type=str,
+    default='NO',
+    nargs='?',
+    help='Third configuration file - optional (default: %(default)s)',
+)
+parser.add_argument(
+    '--title3',
+    type=str,
+    default='List_3',
+    nargs='?',
+    help='Name for third diagram (default: %(default)s)',
+)
+__author__ = 'Alessandro Zandona'
+__date__ = '15 December 2016'
+if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
@@ -77,72 +115,79 @@ feats2 = fl_2[:NFEATS, 1]
# Convert lists into sets
feats2_set = set(feats2)
-if (configfile3 != 'NO'):
-    config.read(configfile3)
-    if not config.has_section('INPUT'):
-        print("%s is not a valid configuration file." % CONFIGFILE2)
-        sys.exit(3)
+if configfile3 != 'NO':
+    config.read(configfile3)
+    if not config.has_section('INPUT'):
+        print("%s is not a valid configuration file." % CONFIGFILE2)
+        sys.exit(3)
    RANK = config.get("OUTPUT", "Borda")
    NFEATS = config.getint("OUTPUT", "N_feats")
    # Feature lists
    fl_3 = np.loadtxt(RANK, dtype=str, delimiter='\t', skiprows=1)
    # Features name
    feats3 = fl_3[:NFEATS, 1]
    # Convert lists into sets
    feats3_set = set(feats3)
# Intersection between lists
f1f2 = feats1_set.intersection(feats2_set)
-if (configfile3 != 'NO'):
-    f1f3 = feats1_set.intersection(feats3_set)
-    f2f3 = feats2_set.intersection(feats3_set)
+if configfile3 != 'NO':
+    f1f3 = feats1_set.intersection(feats3_set)
+    f2f3 = feats2_set.intersection(feats3_set)
# associate to each common feature the position in each lists
-#outFile_f1f2=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title2))
-#outw=open(outFile_f1f2, 'w')
+# outFile_f1f2=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title2))
+# outw=open(outFile_f1f2, 'w')
with open(OUTLIST, 'w') as outw:
-    writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n')
-    writer.writerow(['Feature', 'Position in %s' %title1, 'Postition in %s' %title2])
-    for i in range(len(list(f1f2))):
-        # current feature in intersection
-        interF = list(f1f2)[i]
-        # position of current feature in first list
-        idx_list1 = np.where(feats1==interF)[0][0]
-        # position of current feature in second list
-        idx_list2 = np.where(feats2==interF)[0][0]
-        writer.writerow([list(f1f2)[i], idx_list1+1, idx_list2+1])
-if (configfile3 != 'NO'):
-    # associate to each common feature the position in each lists
-    outFile_f1f3=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title3))
-    with open(outFile_f1f3, 'w') as outw:
-        writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n')
-        writer.writerow(['Feature', 'Position in %s '%title1, 'Postition in %s ' %title3])
-        for i in range(len(list(f1f3))):
-            # current feature in intersection
-            interF = list(f1f3)[i]
-            # position of current feature in first list
-            idx_list1 = np.where(feats1==interF)[0][0]
-            # position of current feature in second list
-            idx_list3 = np.where(feats3==interF)[0][0]
-            writer.writerow([list(f1f3)[i], idx_list1+1, idx_list3+1])
-    outFile_f2f3=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title2,title3))
-    with open(outFile_f2f3, 'w') as outw:
-        writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n')
-        writer.writerow(['Feature', 'Position in %s '%title2, 'Postition in %s ' %title3])
-        for i in range(len(list(f2f3))):
-            # current feature in intersection
-            interF = list(f2f3)[i]
-            # position of current feature in first list
-            idx_list2 = np.where(feats2==interF)[0][0]
-            # position of current feature in second list
-            idx_list3 = np.where(feats3==interF)[0][0]
-            writer.writerow([list(f2f3)[i], idx_list2+1, idx_list3+1])
+    writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
+    writer.writerow(['Feature', 'Position in %s' % title1, 'Postition in %s' % title2])
+    for i in range(len(list(f1f2))):
+        # current feature in intersection
+        interF = list(f1f2)[i]
+        # position of current feature in first list
+        idx_list1 = np.where(feats1 == interF)[0][0]
+        # position of current feature in second list
+        idx_list2 = np.where(feats2 == interF)[0][0]
+        writer.writerow([list(f1f2)[i], idx_list1 + 1, idx_list2 + 1])
+if configfile3 != 'NO':
+    # associate to each common feature the position in each lists
+    outFile_f1f3 = os.path.join(
+        os.path.dirname(OUTFILE), 'Intersection_%s_%s.txt' % (title1, title3)
+    )
+    with open(outFile_f1f3, 'w') as outw:
+        writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
+        writer.writerow(
+            ['Feature', 'Position in %s ' % title1, 'Postition in %s ' % title3]
+        )
+        for i in range(len(list(f1f3))):
+            # current feature in intersection
+            interF = list(f1f3)[i]
+            # position of current feature in first list
+            idx_list1 = np.where(feats1 == interF)[0][0]
+            # position of current feature in second list
+            idx_list3 = np.where(feats3 == interF)[0][0]
+            writer.writerow([list(f1f3)[i], idx_list1 + 1, idx_list3 + 1])
+    outFile_f2f3 = os.path.join(
+        os.path.dirname(OUTFILE), 'Intersection_%s_%s.txt' % (title2, title3)
+    )
+    with open(outFile_f2f3, 'w') as outw:
+        writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
+        writer.writerow(
+            ['Feature', 'Position in %s ' % title2, 'Postition in %s ' % title3]
+        )
+        for i in range(len(list(f2f3))):
+            # current feature in intersection
+            interF = list(f2f3)[i]
+            # position of current feature in first list
+            idx_list2 = np.where(feats2 == interF)[0][0]
+            # position of current feature in second list
+            idx_list3 = np.where(feats3 == interF)[0][0]
+            writer.writerow([list(f2f3)[i], idx_list2 + 1, idx_list3 + 1])
# # plot Venn diagrams
......
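Note: the position bookkeeping in the script above is just `np.where` on the original ranked arrays, shifted to 1-based ranks. A self-contained toy version of the same logic, with made-up feature names:

    import numpy as np

    feats1 = np.array(['geneA', 'geneB', 'geneC'])
    feats2 = np.array(['geneC', 'geneA', 'geneD'])
    common = set(feats1) & set(feats2)          # {'geneA', 'geneC'}
    for f in sorted(common):
        pos1 = np.where(feats1 == f)[0][0] + 1  # 1-based rank in list 1
        pos2 = np.where(feats2 == f)[0][0] + 1  # 1-based rank in list 2
        print(f, pos1, pos2)                    # geneA 1 2, then geneC 3 1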
import argparse
-import pandas as pd
-import numpy as np
-from mlpy import canberra_stability
from itertools import combinations
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from mlpy import canberra_stability
parser = argparse.ArgumentParser()
parser.add_argument('--resultsdir', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
-parser.add_argument('--model', type=str, default='randomForest', help='Model (default: %(default)s)')
-parser.add_argument('--nf_min', type=int, default=10, help='Min #feat (default: %(default)s)')
-parser.add_argument('--nf_max', type=int, default=50, help='Max #feat (default: %(default)s)')
-parser.add_argument('--nf_step', type=int, default=10, help='Increase by these many feat (default: %(default)s)')
+parser.add_argument(
+    '--model', type=str, default='randomForest', help='Model (default: %(default)s)'
+)
+parser.add_argument(
+    '--nf_min', type=int, default=10, help='Min #feat (default: %(default)s)'
+)
+parser.add_argument(
+    '--nf_max', type=int, default=50, help='Max #feat (default: %(default)s)'
+)
+parser.add_argument(
+    '--nf_step',
+    type=int,
+    default=10,
+    help='Increase by these many feat (default: %(default)s)',
+)
parser.add_argument('--nf_rsnf', type=int, nargs='+', help='One or more #feat for rSNF')
parser.add_argument('--layers', type=str, nargs='+', help='')
args = parser.parse_args()
-RESULTSDIR = args.resultsdir # top-level results directory
-DATASET = args.dataset # 'tcga_breast'
-TARGET = args.target # 'ER'
+RESULTSDIR = args.resultsdir  # top-level results directory
+DATASET = args.dataset  # 'tcga_breast'
+TARGET = args.target  # 'ER'
MODEL = args.model
NF_MIN = args.nf_min
NF_MAX = args.nf_max
@@ -29,15 +41,15 @@ LAYERS = args.layers
N_LAYERS = len(LAYERS)
MODE = 'rSNF'
-assert(
+assert (
Path(RESULTSDIR, DATASET).expanduser().exists()
), f"{RESULTSDIR}/{DATASET} not found"
-assert(
+assert (
Path(RESULTSDIR, f"{DATASET}_SNFdap").expanduser().exists()
), f"{RESULTSDIR}/{DATASET}_SNFdap not found"
-for k in range(2, N_LAYERS+1):
+for k in range(2, N_LAYERS + 1):
for comb in combinations(LAYERS, k):
layers_concat = '_'.join(comb)
bordas = []
@@ -45,8 +57,17 @@ for k in range(2, N_LAYERS+1):
bordaf = f'{RESULTSDIR}/{datatype}/{TARGET}/{MODEL}/Borda_splits_50-60_{MODE}_{layers_concat}.txt'
bordas.append(pd.read_csv(bordaf, sep='\t', index_col=None))
# prepare ranks for canberra_stability
-        ranks = pd.concat([np.argsort(bordas[0]['FEATURE_ID']),
-                           np.argsort(bordas[1]['FEATURE_ID'])], axis=1).transpose().values
+        ranks = (
+            pd.concat(
+                [
+                    np.argsort(bordas[0]['FEATURE_ID']),
+                    np.argsort(bordas[1]['FEATURE_ID']),
+                ],
+                axis=1,
+            )
+            .transpose()
+            .values
+        )
for nf in np.arange(NF_MIN, NF_MAX + NF_STEP, NF_STEP):
cs = canberra_stability(ranks, nf)
print(f'{MODE} - {layers_concat} - stability({nf}) = {cs:.3f}')
......
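Note: each `np.argsort(borda['FEATURE_ID'])` above turns a Borda list into a vector giving, per feature, its position in that ranking; `canberra_stability` then scores how much the top-`nf` portions of the stacked rankings agree. A toy sketch, assuming only the `canberra_stability(ranks, nf)` call signature already used in the script:

    import numpy as np
    from mlpy import canberra_stability

    # two rankings over five features: row i holds the position of each feature in list i
    ranks = np.array([
        [0, 1, 2, 3, 4],
        [1, 0, 2, 4, 3],
    ])
    # stability indicator of the two lists restricted to their top 3 positions;
    # values closer to 0 indicate more similar rankings
    print(canberra_stability(ranks, 3))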
from __future__ import division
import numpy as np
__author__ = 'Davide Albanese'
def error(ya, yp):
"""
"""
    ya_arr, yp_arr = np.asarray(ya), np.asarray(yp)
if ya_arr.shape[0] != yp_arr.shape[0]:
raise ValueError("ya, yp: shape mismatch")
return np.sum(ya_arr != yp_arr) / ya_arr.shape[0]
@@ -16,7 +18,7 @@ def error(ya, yp):
def accuracy(ya, yp):
"""
"""
ya_arr, yp_arr = np.asarray(ya), np.asarray(yp)
if ya_arr.shape[0] != yp_arr.shape[0]:
raise ValueError("ya, yp: shape mismatch")
@@ -29,10 +31,10 @@ def confusion_matrix(ya, yp, classes=None):
"""
if classes is None:
        classes = np.unique(np.concatenate((ya, yp)))
else:
classes = np.asarray(classes, dtype=np.int)
k = classes.shape[0]
cm = np.zeros((k, k), dtype=np.int)
@@ -51,10 +53,12 @@ def confusion_matrix_binary(ya, yp):
classes = np.unique(np.concatenate((ya, yp)))
if classes.shape[0] != 2:
raise ValueError("Binary confusion matrix is defined for binary classification only")
raise ValueError(
"Binary confusion matrix is defined for binary classification only"
)
cm, _ = confusion_matrix(ya, yp, classes=classes)
return cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
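Note: because `classes` is sorted, row 0 of the confusion matrix is the negative class, so the four returned cells unpack as TN, FP, FN, TP (the same order used by the callers below). A quick check with toy labels:

    ya = [0, 0, 0, 1, 1]  # actual
    yp = [0, 1, 0, 1, 0]  # predicted
    TN, FP, FN, TP = confusion_matrix_binary(ya, yp)
    print(TN, FP, FN, TP)  # 2 1 1 1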
@@ -62,7 +66,7 @@ def sensitivity(ya, yp):
""" or true positive rate, hit rate, recall
TP / P = TP / (TP + FN)
"""
TN, FP, FN, TP = confusion_matrix_binary(ya, yp)
if TP == 0.0:
return 0.0
@@ -86,7 +90,7 @@ def fpr(ya, yp):
"""false positive rate or fall-out
FP / N = FP / (FP + TN)
"""
TN, FP, FN, TP = confusion_matrix_binary(ya, yp)
if FP == 0.0:
return 0.0
@@ -122,7 +126,7 @@ def fdr(ya, yp):
"""false discovery rate
FP / (FP+TP)
"""
TN, FP, FN, TP = confusion_matrix_binary(ya, yp)
if FP == 0.0:
return 0.0
@@ -145,8 +149,8 @@ def auc_wmw(ya, yp):
classes = np.unique(ya_arr)
if classes.shape[0] != 2:
raise ValueError("AUC is defined for binary classification only")
-    bn = (ya_arr == classes[0])
-    bp = (ya_arr == classes[1])
+    bn = ya_arr == classes[0]
+    bp = ya_arr == classes[1]
auc = 0.0
for i in yp[bp]:
for j in yp[bn]:
......@@ -154,21 +158,18 @@ def auc_wmw(ya, yp):
auc += 1.0
return auc / (np.sum(bn) * np.sum(bp))
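Note: the pairwise loop (its comparison is elided by the hunk) is the Wilcoxon-Mann-Whitney identity: AUC equals the fraction of (positive, negative) pairs whose predicted scores are correctly ordered. A worked toy case, assuming the elided test is the usual strict `>`:

    ya = [0, 0, 1, 1]
    yp = [0.1, 0.4, 0.35, 0.8]
    # positive scores 0.35, 0.8 vs negative scores 0.1, 0.4:
    # correctly ordered pairs are (0.35, 0.1), (0.8, 0.1), (0.8, 0.4) -> 3 of 4
    print(auc_wmw(ya, yp))  # 0.75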
##### KCCC
def _expand(x, y):
K = np.unique(np.concatenate((x, y)))
X = np.zeros((x.shape[0], K.shape[0]), dtype=np.int)
Y = np.zeros((y.shape[0], K.shape[0]), dtype=np.int)
for i, k in enumerate(K):
-        X[x==k, i] = 1
-        Y[y==k, i] = 1
+        X[x == k, i] = 1
+        Y[y == k, i] = 1
return X, Y
def KCCC(x, y):
""" K-category correlation coefficient.
@@ -176,7 +177,7 @@ def KCCC(x, y):
EPS = np.finfo(np.float).eps
k = x.shape[1]
xn = x - np.mean(x, axis=0)
yn = y - np.mean(y, axis=0)
cov_xy = np.sum(xn * yn) / k
@@ -188,7 +189,7 @@
rk = cov_xy / np.sqrt(cov_xx * cov_yy)
else:
rk = 0.0
return rk
@@ -196,7 +197,9 @@ def KCCC_discrete(x, y):
X, Y = _expand(x, y)
return KCCC(X, Y)
##### end KCCC
def dor(ya, yp):
"""Diagnostic Odds Ratio
@@ -204,5 +207,3 @@ def dor(ya, yp):
TN, FP, FN, TP = confusion_matrix_binary(ya, yp)
return (TP / FN) / (FP / TN)
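Note: `KCCC_discrete` one-hot expands two label vectors via `_expand` and computes Gorodkin's K-category correlation coefficient, a multi-class generalisation of Matthews' correlation. A small sanity check with toy labels:

    import numpy as np

    x = np.array([0, 1, 2, 0, 1, 2])  # actual labels (toy)
    y = np.array([0, 1, 2, 0, 1, 1])  # predictions with one error (toy)
    print(KCCC_discrete(x, y))        # high (near 1); exactly 1.0 for perfect agreement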