Commit e93424c3 authored by Alessia Marcolini

Merge branch 'master' of gitlab.fbk.eu:MPBA/INF

parents c2ef86cb b422bb46
#!/usr/bin/make -f
# Makefile for running the INF pipeline
# Author: Marco Chierici <chierici@fbk.eu>
# Date: 2017-05-12
#
.PHONY: init all
SHELL := /bin/bash
# input variables
# shown are examples, override on command line
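# e.g. (values are illustrative): make all ENDPOINT=ALL-EFS OUTBASE=/path/to/out_tmp THREADS=8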
FILE ?= data/AG1-G_MAV-G_498_LIT_ALL_tr.txt
LABEL ?= data/label_498_ALL-EFS_tr.lab
DATA1 ?= data/AG1-G_498_LIT_ALL_tr.txt
DATA2 ?= data/MAV-G_498_LIT_ALL_tr.txt
ENDPOINT ?= ALL-EFS
# added MF 20170710
THREADS ?= 4
OUTBASE ?= /path/to/out_tmp
BINDIR := .
OUTDIR := $(OUTBASE)/$(ENDPOINT)
# derived variables
OUTPREFIX = $(notdir $(basename $(FILE)))
LEVEL1 = $(word 1,$(subst _, ,$(OUTPREFIX)))
LEVEL2 = $(word 2,$(subst _, ,$(OUTPREFIX)))
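# e.g. with the default FILE: OUTPREFIX=AG1-G_MAV-G_498_LIT_ALL_tr, LEVEL1=AG1-G, LEVEL2=MAV-G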
init:
	@mkdir -p $(OUTDIR)/rSNFi
	@mkdir -p $(OUTDIR)/rSNF
	@mkdir -p $(OUTDIR)/juxt
all: init $(OUTDIR)/rSNFi/$(OUTPREFIX)_MCC_scores.txt $(OUTDIR)/rSNF/$(OUTPREFIX)_MCC_scores.txt $(OUTDIR)/juxt/$(OUTPREFIX)_MCC_scores.txt
$(OUTDIR)/juxt/$(OUTPREFIX)_RandomForest_KBest.log: $(FILE) $(LABEL)
	python $(BINDIR)/sklearn_rf_training_fixrank.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/juxt --ranking KBest

$(OUTDIR)/juxt/$(OUTPREFIX)_MCC_scores.txt: $(OUTDIR)/juxt/$(OUTPREFIX)_RandomForest_KBest.log $(subst _tr,_ts,$(FILE)) $(subst _tr,_ts,$(LABEL))
	python $(BINDIR)/sklearn_rf_validation_writeperf.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/juxt --tslab $(word 3,$^)

$(OUTDIR)/rSNF/INF_$(OUTPREFIX).txt: $(DATA1) $(DATA2) $(LABEL)
	Rscript $(BINDIR)/snf_integration.R --d1 $(word 1,$^) --d2 $(word 2,$^) --lab $(word 3,$^) \
		--scriptDir $(BINDIR)/SNFtools/ --clust spectral --threads "$(THREADS)" \
		--outf $@

$(OUTDIR)/rSNF/$(OUTPREFIX)_RandomForest_rankList.log: $(FILE) $(LABEL) $(OUTDIR)/rSNF/INF_$(OUTPREFIX).txt
	python $(BINDIR)/sklearn_rf_training_fixrank.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNF \
		--ranking rankList --rankFeats $(word 3,$^)

$(OUTDIR)/rSNF/$(OUTPREFIX)_MCC_scores.txt: $(OUTDIR)/rSNF/$(OUTPREFIX)_RandomForest_rankList.log $(subst _tr,_ts,$(FILE)) $(subst _tr,_ts,$(LABEL))
	python $(BINDIR)/sklearn_rf_validation_writeperf.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNF --tslab $(word 3,$^)

$(OUTDIR)/rSNFi/intersection_$(OUTPREFIX).txt: $(OUTDIR)/juxt/$(OUTPREFIX)_RandomForest_KBest.log $(OUTDIR)/rSNF/$(OUTPREFIX)_RandomForest_rankList.log
	python $(BINDIR)/intersect_biomarkers.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNFi/venn_$(OUTPREFIX).png $@ --title1 "$(LEVEL1)" --title2 "$(LEVEL2)"

$(OUTDIR)/rSNFi/$(OUTPREFIX).txt: $(FILE) $(OUTDIR)/rSNFi/intersection_$(OUTPREFIX).txt
	python $(BINDIR)/extract_topfeats_onecol.py $(word 1,$^) $(word 2,$^) $@

$(OUTDIR)/rSNFi/$(OUTPREFIX)_RandomForest_KBest.log: $(OUTDIR)/rSNFi/$(OUTPREFIX).txt $(LABEL)
	python $(BINDIR)/sklearn_rf_training_fixrank.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNFi --ranking KBest

$(OUTDIR)/rSNFi/$(OUTPREFIX)_MCC_scores.txt: $(OUTDIR)/rSNFi/$(OUTPREFIX)_RandomForest_KBest.log $(subst _tr,_ts,$(FILE)) $(subst _tr,_ts,$(LABEL))
	python $(BINDIR)/sklearn_rf_validation_writeperf.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNFi --tslab $(word 3,$^)
# Integrative Network Fusion (INF)

![INF pipeline](figs/INF_pipeline.jpeg)

## Setup
```bash
git clone https://gitlab.fbk.eu/MPBA/INF
cd INF
conda env create -f env.yml -n inf
conda activate inf
```
### Additional dependencies
#### R dependencies
To install the R dependencies that are not available through conda channels, run the following command at the R prompt:
```R
install.packages("TunePareto")
```
#### MLPY
The `mlpy` package is required for some operations included in the DAP procedure. The `mlpy` version available on PyPI is outdated and does not work on OSX platforms, so install it from the MPBA GitLab instead, following these steps.
Let `<ANACONDA>` be your anaconda path (e.g., `/home/user/anaconda3`) and `<ENV>` the name of your conda environment (e.g., `inf`). Adjust these environment variables:
```bash
export LD_LIBRARY_PATH=<ANACONDA>/envs/<ENV>/lib:${LD_LIBRARY_PATH}
export CPATH=<ANACONDA>/envs/<ENV>/include:${CPATH}
```
and then install `mlpy` from GitLab:
```bash
pip install git+https://gitlab.fbk.eu/MPBA/mlpy.git
```
#### Other Python dependencies
To install `bootstrapped`:
```bash
pip install bootstrapped
```
## Usage
**Input files**
* omics layer 1 data: samples x features, tab-separated, with row & column names (see the sketch after this list)
......
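As an illustration of the expected layout, the following minimal sketch writes such a file with pandas; sample and feature names are made up, and the file can then be read back with the `load_data` helper shown later in this diff:

```python
import pandas as pd

# A minimal omics layer file: samples x features, tab-separated,
# with a sample-ID index column and a feature-name header row.
# All names and values below are illustrative.
df = pd.DataFrame(
    [[0.12, 3.4], [0.56, 1.2]],
    index=["sample1", "sample2"],
    columns=["featA", "featB"],
)
df.to_csv("layer1_tr.txt", sep="\t", index_label="sampleID")
```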
@@ -7,12 +7,12 @@ from input_output import load_data

def extract_feats(datafile, rankedfile, nfeat, outfile):
    # sample names, feature names and table of feature abundances
    samples, features, data_ab = load_data(datafile)
    # feature abundances (no sample names, no header)
    # data_ab = data_ab.astype(np.float)
    rank = np.loadtxt(rankedfile, delimiter='\t', skiprows=1, dtype=str)
    # number of features in the list
    nf_list = rank.shape
    if len(nf_list) > 1:
@@ -20,9 +20,7 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
        top_feats = feats[0:nfeat]
    else:
        top_feats = rank[1]
    # print top_feats.shape
    # extract top features from table with abundances of all features
    idx = []
    if len(nf_list) == 1:
@@ -35,16 +33,15 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
            print('###### MISSING %s ######' % top_feats[i])
    # keep sample names in the new table
    sel_feats = [features[i] for i in idx]
    # write new table
    with open(outfile, 'w') as outw:
        writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
        # header
        writer.writerow(['Samples'] + sel_feats)
        for i in range(0, len(samples)):
            writer.writerow([samples[i]] + data_ab[i, idx].tolist())

if __name__ == "__main__":
......
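For orientation, a hypothetical call of this helper; the data file is one of the Makefile defaults, while the ranked-list file name is illustrative:

```python
# Hypothetical usage; "ranked_feats.txt" stands for a ranked feature list
# (header row, one feature per row, as implied by skiprows=1 above).
extract_feats(
    datafile="data/AG1-G_498_LIT_ALL_tr.txt",  # samples x features table
    rankedfile="ranked_feats.txt",
    nfeat=100,                                 # keep the top 100 ranked features
    outfile="AG1-G_top100_tr.txt",
)
```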
@@ -9,35 +9,35 @@ import sys
import numpy as np

__author__ = 'Marco Chierici, Alessandro Zandona'
__date__ = '15 December 2016'


#### Extract features from a given dataset ####
def extract_feats(datafile, rankedfile, outfile):
    # print locals()
    # table of feature abundances
    data = np.loadtxt(datafile, delimiter='\t', dtype=str)
    # feature abundances (no sample names, no header)
    data_ab = data[1:, 1:].astype(float)

    rank = np.loadtxt(rankedfile, delimiter='\t', skiprows=1, dtype=str)
    # number of features in the list
    nf_list = rank.shape
    if len(nf_list) > 1:
        feats = rank[:, 0]
        top_feats = feats  # [0:nfeat]
    else:
        top_feats = rank

    # extract top features from table with abundances of all features
    idx = []
    nfeat = len(top_feats)
    for i in range(nfeat):
        if top_feats[i] in data[0, :].tolist():
            idx.append(data[0, :].tolist().index(top_feats[i]))
        else:
            print(top_feats[i])
@@ -48,8 +48,8 @@ def extract_feats(datafile, rankedfile, outfile):
    # write new table
    with open(outfile, 'w') as outw:
        writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
        for i in range(len(sel_feats[:, 0])):
            writer.writerow(sel_feats[i, :])

if __name__ == "__main__":
......
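Unlike the previous helper, this variant keeps every feature in the ranked list (`nfeat = len(top_feats)`), so it takes no `nfeat` argument. A hypothetical call, with illustrative file names:

```python
# Hypothetical usage; file names are illustrative.
extract_feats(
    datafile="data/MAV-G_498_LIT_ALL_tr.txt",  # samples x features table
    rankedfile="ranked_feats.txt",             # header row + one ranked feature per row
    outfile="MAV-G_ranked_tr.txt",
)
```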
import numpy as np
import pandas as pd
def load_data(filename):
    df = pd.read_csv(filename, sep='\t', header=0, index_col=0)
    var_names = df.columns.tolist()
@@ -8,14 +9,18 @@ def load_data(filename):
    data = df.values.astype(float)
    return sample_names, var_names, data


def save_split(x, y, sample_names, var_names, basename):
    """
    x, y: output of train_test_split
    sample_names, var_names: lists of sample and feature names (used as the DataFrame row and column names)
    """
    x_df = pd.DataFrame(x, index=sample_names, columns=var_names)
    x_df.to_csv(
        f"{basename}.txt", sep='\t', index=True, header=True, index_label="sampleID"
    )
    y_df = pd.DataFrame(y, index=sample_names, columns=['label'])
    y_df.to_csv(
        f"{basename}.lab", sep='\t', index=True, header=True, index_label="sampleID"
    )
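A minimal usage sketch, assuming `layer1.txt` follows the input format described in the README and that this module is importable as `input_output`; the label array is made up for illustration:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from input_output import load_data, save_split

samples, features, data = load_data("layer1.txt")
labels = np.random.randint(0, 2, size=len(samples))  # illustrative binary labels

# train_test_split returns a (train, test) pair for each array passed in
x_tr, x_ts, s_tr, s_ts, y_tr, y_ts = train_test_split(
    data, samples, labels, test_size=0.3, random_state=42
)
save_split(x_tr, y_tr, s_tr, features, "layer1_tr")  # writes layer1_tr.txt / layer1_tr.lab
save_split(x_ts, y_ts, s_ts, features, "layer1_ts")
```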
@@ -18,21 +18,59 @@ import numpy as np
matplotlib.use('Agg')
parser = argparse.ArgumentParser(
    description='Find the intersection between feature lists and produce Venn diagrams.'
)
parser.add_argument(
    'CONFIGFILE1',
    type=str,
    help='Training experiment configuration file 1 (with info about number of top discriminant features)',
)
parser.add_argument(
    'CONFIGFILE2',
    type=str,
    help='Training experiment configuration file 2 (with info about number of top discriminant features)',
)
parser.add_argument(
    'OUTLIST', type=str, help='Output file for intersected feature list.'
)
parser.add_argument(
    'OUTFILE', type=str, nargs='?', help='Output file for Venn diagram plot.'
)
parser.add_argument(
    '--title1',
    type=str,
    default='List_1',
    nargs='?',
    help='Name for first diagram (default: %(default)s)',
)
parser.add_argument(
    '--title2',
    type=str,
    default='List_2',
    nargs='?',
    help='Name for second diagram (default: %(default)s)',
)
parser.add_argument(
    '--configFile3',
    type=str,
    default='NO',
    nargs='?',
    help='Third configuration file - optional (default: %(default)s)',
)
parser.add_argument(
    '--title3',
    type=str,
    default='List_3',
    nargs='?',
    help='Name for third diagram (default: %(default)s)',
)

__author__ = 'Alessandro Zandona'
__date__ = '15 December 2016'

if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)
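Judging from the `config.get`/`config.getint` calls further down, each CONFIGFILE argument is an INI-style file with an `[INPUT]` section (whose presence is used as a validity check) and an `[OUTPUT]` section carrying the path of the Borda-ranked feature list and the number of top features. A minimal sketch, with illustrative values:

```python
# Hypothetical minimal config for this script; the keys mirror the
# config.get()/config.getint() calls below, the values are illustrative.
import configparser

cfg = configparser.ConfigParser()
cfg['INPUT'] = {}  # section presence is checked for validity
cfg['OUTPUT'] = {
    'Borda': 'results/Borda_list.txt',  # ranked feature list (read with np.loadtxt)
    'N_feats': '100',                   # how many top features to intersect
}
with open('training_config.ini', 'w') as fh:
    cfg.write(fh)
```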
@@ -77,72 +115,79 @@ feats2 = fl_2[:NFEATS, 1]
# Convert lists into sets
feats2_set = set(feats2)

if configfile3 != 'NO':
    config.read(configfile3)
    if not config.has_section('INPUT'):
        print("%s is not a valid configuration file." % configfile3)
        sys.exit(3)

    RANK = config.get("OUTPUT", "Borda")
    NFEATS = config.getint("OUTPUT", "N_feats")

    # Feature lists
    fl_3 = np.loadtxt(RANK, dtype=str, delimiter='\t', skiprows=1)
    # Feature names
    feats3 = fl_3[:NFEATS, 1]
    # Convert lists into sets
    feats3_set = set(feats3)
# Intersection between lists
f1f2 = feats1_set.intersection(feats2_set)
if configfile3 != 'NO':
    f1f3 = feats1_set.intersection(feats3_set)
    f2f3 = feats2_set.intersection(feats3_set)
# associate to each common feature its position in each list
# outFile_f1f2 = os.path.join(os.path.dirname(OUTFILE), 'Intersection_%s_%s.txt' % (title1, title2))
# outw = open(outFile_f1f2, 'w')
with open(OUTLIST, 'w') as outw:
    writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
    writer.writerow(['Feature', 'Position in %s' % title1, 'Position in %s' % title2])
    for i in range(len(list(f1f2))):
        # current feature in intersection
        interF = list(f1f2)[i]
        # position of current feature in first list
        idx_list1 = np.where(feats1 == interF)[0][0]
        # position of current feature in second list
        idx_list2 = np.where(feats2 == interF)[0][0]
        writer.writerow([list(f1f2)[i], idx_list1 + 1, idx_list2 + 1])

if configfile3 != 'NO':
    # associate to each common feature its position in each list
    outFile_f1f3 = os.path.join(
        os.path.dirname(OUTFILE), 'Intersection_%s_%s.txt' % (title1, title3)
    )
    with open(outFile_f1f3, 'w') as outw:
        writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
        writer.writerow(
            ['Feature', 'Position in %s' % title1, 'Position in %s' % title3]
        )
        for i in range(len(list(f1f3))):
            # current feature in intersection
            interF = list(f1f3)[i]
            # position of current feature in first list
            idx_list1 = np.where(feats1 == interF)[0][0]
            # position of current feature in third list
            idx_list3 = np.where(feats3 == interF)[0][0]
            writer.writerow([list(f1f3)[i], idx_list1 + 1, idx_list3 + 1])

    outFile_f2f3 = os.path.join(
        os.path.dirname(OUTFILE), 'Intersection_%s_%s.txt' % (title2, title3)
    )
    with open(outFile_f2f3, 'w') as outw:
        writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
        writer.writerow(
            ['Feature', 'Position in %s' % title2, 'Position in %s' % title3]
        )
        for i in range(len(list(f2f3))):
            # current feature in intersection
            interF = list(f2f3)[i]
            # position of current feature in second list
            idx_list2 = np.where(feats2 == interF)[0][0]
            # position of current feature in third list
            idx_list3 = np.where(feats3 == interF)[0][0]
            writer.writerow([list(f2f3)[i], idx_list2 + 1, idx_list3 + 1])
# plot Venn diagrams
......
import argparse
from itertools import combinations
from pathlib import Path
import numpy as np
import pandas as pd
from mlpy import canberra_stability
parser = argparse.ArgumentParser()
parser.add_argument('--resultsdir', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument(
    '--model', type=str, default='randomForest', help='Model (default: %(default)s)'
)
parser.add_argument(
    '--nf_min', type=int, default=10, help='Min #feat (default: %(default)s)'
)
parser.add_argument(
    '--nf_max', type=int, default=50, help='Max #feat (default: %(default)s)'
)
parser.add_argument(
    '--nf_step',
    type=int,
    default=10,
    help='Increase by this many features (default: %(default)s)',
)
parser.add_argument('--nf_rsnf', type=int, nargs='+', help='One or more #feat for rSNF')
parser.add_argument('--layers', type=str, nargs='+', help='Omics layer names')
args = parser.parse_args()
RESULTSDIR = args.resultsdir # top-level results directory
DATASET = args.dataset # 'tcga_breast'
TARGET = args.target # 'ER'
MODEL = args.model
NF_MIN = args.nf_min
NF_MAX = args.nf_max
NF_STEP = args.nf_step
NF_RSNF = args.nf_rsnf
LAYERS = args.layers
N_LAYERS = len(LAYERS)
MODE = 'rSNF'
assert (
    Path(RESULTSDIR, DATASET).expanduser().exists()
), f"{RESULTSDIR}/{DATASET} not found"
assert (
    Path(RESULTSDIR, f"{DATASET}_SNFdap").expanduser().exists()
), f"{RESULTSDIR}/{DATASET}_SNFdap not found"
for k in range(2, N_LAYERS + 1):
    for comb in combinations(LAYERS, k):
        layers_concat = '_'.join(comb)
        bordas = []
        for datatype in [DATASET, f'{DATASET}_SNFdap']:
            bordaf = f'{RESULTSDIR}/{datatype}/{TARGET}/{MODEL}/Borda_splits_50-60_{MODE}_{layers_concat}.txt'
            bordas.append(pd.read_csv(bordaf, sep='\t', index_col=None))
        # prepare ranks for canberra_stability
        ranks = (
            pd.concat(
                [
                    np.argsort(bordas[0]['FEATURE_ID']),
                    np.argsort(bordas[1]['FEATURE_ID']),
                ],
                axis=1,
            )
            .transpose()
            .values
        )
        for nf in np.arange(NF_MIN, NF_MAX + NF_STEP, NF_STEP):
            cs = canberra_stability(ranks, nf)
            print(f'{MODE} - {layers_concat} - stability({nf}) = {cs:.3f}')
        # additional steps for NF_RSNF
        print()
        for nf in NF_RSNF:
            cs = canberra_stability(ranks, nf)
            print(f'{MODE} - {layers_concat} - stability({nf}) = {cs:.3f}')
        print()
    print()
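For intuition, a toy sketch of the `canberra_stability` call used above, assuming the MPBA `mlpy` fork is installed; the rank values are made up:

```python
# Each row of `ranks` is a position list (argsort of a ranking), as prepared
# in the loop above; two slightly different lists are compared here.
import numpy as np
from mlpy import canberra_stability

ranks = np.array([
    [0, 1, 2, 3, 4],   # first ranked list
    [0, 2, 1, 3, 4],   # second list, slightly shuffled
])
print(canberra_stability(ranks, 3))  # stability indicator of the top-3 sublists
```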
from __future__ import division
import numpy as np
__author__ = 'Davide Albanese'
def error(ya, yp):
    """Fraction of misclassified samples (ya: true labels, yp: predictions)."""
    ya_arr, yp_arr = np.asarray(ya), np.asarray(yp)
    if ya_arr.shape[0] != yp_arr.shape[0]:
        raise ValueError("ya, yp: shape mismatch")
    return np.sum(ya_arr != yp_arr) / ya_arr.shape[0]
@@ -16,7 +18,7 @@ def error(ya, yp):
def accuracy(ya, yp):
    """Fraction of correctly classified samples (ya: true labels, yp: predictions)."""
    ya_arr, yp_arr = np.asarray(ya), np.asarray(yp)
    if ya_arr.shape[0] != yp_arr.shape[0]:
        raise ValueError("ya, yp: shape mismatch")
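A quick sanity check of these two metrics on made-up labels (with the usual definition, accuracy is the complement of error):

```python
ya = [1, -1, 1, 1]   # true labels
yp = [1, -1, -1, 1]  # predictions; one mistake out of four
print(error(ya, yp))     # 0.25
print(accuracy(ya, yp))  # 0.75
```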
@@ -29,10 +31,10 @@ def confusion_matrix(ya, yp, classes=None):
"""
if classes is None:
classes = np.unique(np.concatenate((ya, yp)))
classes = np.unique(np.concatenate((ya, yp)))
else:
classes = np.asarray(classes, dtype=np.int)
k = classes.shape[0]
cm = np.zeros((k, k), dtype=np.int)
@@ -51,10 +53,12 @@ def confusion_matrix_binary(ya, yp):
    classes = np.unique(np.concatenate((ya, yp)))
    if classes.shape[0] != 2:
        raise ValueError(
            "Binary confusion matrix is defined for binary classification only"
        )
    cm, _ = confusion_matrix(ya, yp, classes=classes)
    return cm[