Commit e93424c3 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Merge branch 'master' of gitlab.fbk.eu:MPBA/INF

parents c2ef86cb b422bb46
#!/usr/bin/make -f
# Makefile for running the INF pipeline
# Author: Marco Chierici <chierici@fbk.eu>
# Date: 2017-05-12
#
# `init` and `all` are commands, not files: declare them phony so a stray
# file with the same name can never shadow them.
.PHONY: init all
# `init` is the first rule in the file, so without this a bare `make` would
# only create directories; `all` is the intended default pipeline entry point.
.DEFAULT_GOAL := all
# Remove half-written targets when a recipe fails, so a corrupt output is
# never mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:
SHELL := /bin/bash
# input variables
# shown are examples, override on command line
FILE ?= data/AG1-G_MAV-G_498_LIT_ALL_tr.txt
LABEL ?= data/label_498_ALL-EFS_tr.lab
DATA1 ?= data/AG1-G_498_LIT_ALL_tr.txt
DATA2 ?= data/MAV-G_498_LIT_ALL_tr.txt
ENDPOINT ?= ALL-EFS
# added MF 20170710
THREADS ?= 4
OUTBASE ?= /path/to/out_tmp
BINDIR := .
OUTDIR := $(OUTBASE)/$(ENDPOINT)
# derived variables (simple `:=` expansion: FILE is already known here, and
# recursive `=` would re-run the text functions on every reference)
OUTPREFIX := $(notdir $(basename $(FILE)))
LEVEL1 := $(word 1,$(subst _, ,$(OUTPREFIX)))
LEVEL2 := $(word 2,$(subst _, ,$(OUTPREFIX)))

# Create every output directory up front. NOTE: $(OUTDIR)/juxt was missing
# although the juxt rules below write into it — presumably the Python
# scripts created it as a side effect; created here explicitly to be safe.
init:
	@mkdir -p $(OUTDIR)/rSNFi
	@mkdir -p $(OUTDIR)/rSNF
	@mkdir -p $(OUTDIR)/juxt

# Full pipeline: juxtaposed, rSNF and rSNFi MCC score files.
all: init $(OUTDIR)/rSNFi/$(OUTPREFIX)_MCC_scores.txt $(OUTDIR)/rSNF/$(OUTPREFIX)_MCC_scores.txt $(OUTDIR)/juxt/$(OUTPREFIX)_MCC_scores.txt

# --- juxtaposed data: RF training with KBest ranking, then validation ---
$(OUTDIR)/juxt/$(OUTPREFIX)_RandomForest_KBest.log: $(FILE) $(LABEL)
	python $(BINDIR)/sklearn_rf_training_fixrank.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/juxt --ranking KBest

# Validate on the test split (the _tr -> _ts substitution derives the
# matching test data/label file names from the training ones).
$(OUTDIR)/juxt/$(OUTPREFIX)_MCC_scores.txt: $(OUTDIR)/juxt/$(OUTPREFIX)_RandomForest_KBest.log $(subst _tr,_ts,$(FILE)) $(subst _tr,_ts,$(LABEL))
	python $(BINDIR)/sklearn_rf_validation_writeperf.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/juxt --tslab $(word 3,$^)

# --- rSNF: SNF integration of the two omics layers, then RF on the fused rank ---
$(OUTDIR)/rSNF/INF_$(OUTPREFIX).txt: $(DATA1) $(DATA2) $(LABEL)
	Rscript $(BINDIR)/snf_integration.R --d1 $(word 1,$^) --d2 $(word 2,$^) --lab $(word 3,$^) \
	--scriptDir $(BINDIR)/SNFtools/ --clust spectral --threads "$(THREADS)" \
	--outf $@

$(OUTDIR)/rSNF/$(OUTPREFIX)_RandomForest_rankList.log: $(FILE) $(LABEL) $(OUTDIR)/rSNF/INF_$(OUTPREFIX).txt
	python $(BINDIR)/sklearn_rf_training_fixrank.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNF \
	--ranking rankList --rankFeats $(word 3,$^)

$(OUTDIR)/rSNF/$(OUTPREFIX)_MCC_scores.txt: $(OUTDIR)/rSNF/$(OUTPREFIX)_RandomForest_rankList.log $(subst _tr,_ts,$(FILE)) $(subst _tr,_ts,$(LABEL))
	python $(BINDIR)/sklearn_rf_validation_writeperf.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNF --tslab $(word 3,$^)

# --- rSNFi: intersection of juxt and rSNF feature lists, then RF on it ---
$(OUTDIR)/rSNFi/intersection_$(OUTPREFIX).txt: $(OUTDIR)/juxt/$(OUTPREFIX)_RandomForest_KBest.log $(OUTDIR)/rSNF/$(OUTPREFIX)_RandomForest_rankList.log
	python $(BINDIR)/intersect_biomarkers.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNFi/venn_$(OUTPREFIX).png $@ --title1 "$(LEVEL1)" --title2 "$(LEVEL2)"

$(OUTDIR)/rSNFi/$(OUTPREFIX).txt: $(FILE) $(OUTDIR)/rSNFi/intersection_$(OUTPREFIX).txt
	python $(BINDIR)/extract_topfeats_onecol.py $(word 1,$^) $(word 2,$^) $@

$(OUTDIR)/rSNFi/$(OUTPREFIX)_RandomForest_KBest.log: $(OUTDIR)/rSNFi/$(OUTPREFIX).txt $(LABEL)
	python $(BINDIR)/sklearn_rf_training_fixrank.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNFi --ranking KBest

$(OUTDIR)/rSNFi/$(OUTPREFIX)_MCC_scores.txt: $(OUTDIR)/rSNFi/$(OUTPREFIX)_RandomForest_KBest.log $(subst _tr,_ts,$(FILE)) $(subst _tr,_ts,$(LABEL))
	python $(BINDIR)/sklearn_rf_validation_writeperf.py $(word 1,$^) $(word 2,$^) $(OUTDIR)/rSNFi --tslab $(word 3,$^)
### Integrative Network Fusion (INF) # Integrative Network Fusion (INF)
![INF pipeline ](figs/INF_pipeline.jpeg) ![INF pipeline ](figs/INF_pipeline.jpeg)
**Setup** ## Setup
```bash ```bash
git clone https://gitlab.fbk.eu/MPBA/INF git clone https://gitlab.fbk.eu/MPBA/INF
cd INF cd INF
...@@ -9,19 +9,44 @@ conda env create -f env.yml -n inf ...@@ -9,19 +9,44 @@ conda env create -f env.yml -n inf
conda activate inf conda activate inf
``` ```
### Additional dependencies
#### R dependencies
To install the R dependencies (not in conda channels), run the following command via the R prompt: To install the R dependencies (not in conda channels), run the following command via the R prompt:
```bash ```bash
install.packages("TunePareto") install.packages("TunePareto")
``` ```
To install `mlpy`, follow the instructions [here](https://gitlab.fbk.eu/MPBA/mlpy). #### MLPY
To install `mlpy`, follow these instructions:
`mlpy` package is required for some operations included in the DAP procedure.
The `mlpy` package available on PyPI is outdated and not working on OSX platforms.
These are the steps to follow:
Let `<ANACONDA>` be your anaconda path (e.g., `/home/user/anaconda3`).
Adjust these environmental variables:
```bash
export LD_LIBRARY_PATH=<ANACONDA>/envs/<ENV>/lib:${LD_LIBRARY_PATH}
export CPATH=<ANACONDA>/envs/<ENV>/include:${CPATH}
```
and then install `mlpy` from GitLab:
```bash
pip install git+https://gitlab.fbk.eu/MPBA/mlpy.git
```
#### Other Python dependencies
To install `bootstrapped`: To install `bootstrapped`:
```bash ```bash
pip install bootstrapped pip install bootstrapped
``` ```
## Usage
**Input files** **Input files**
* omics layer 1 data: samples x features, tab-separated, with row & column names * omics layer 1 data: samples x features, tab-separated, with row & column names
......
...@@ -7,12 +7,12 @@ from input_output import load_data ...@@ -7,12 +7,12 @@ from input_output import load_data
def extract_feats(datafile, rankedfile, nfeat, outfile): def extract_feats(datafile, rankedfile, nfeat, outfile):
# sample names, features names and table with features abundances # sample names, features names and table with features abundances
samples, features, data_ab = load_data(datafile) samples, features, data_ab = load_data(datafile)
# feats abundances (no names of samples, no header) # feats abundances (no names of samples, no header)
# data_ab = data_ab.astype(np.float) # data_ab = data_ab.astype(np.float)
rank = np.loadtxt(rankedfile, delimiter = '\t', skiprows = 1, dtype = str) rank = np.loadtxt(rankedfile, delimiter='\t', skiprows=1, dtype=str)
# number of features in the list # number of features in the list
nf_list = rank.shape nf_list = rank.shape
if len(nf_list) > 1: if len(nf_list) > 1:
...@@ -20,9 +20,7 @@ def extract_feats(datafile, rankedfile, nfeat, outfile): ...@@ -20,9 +20,7 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
top_feats = feats[0:nfeat] top_feats = feats[0:nfeat]
else: else:
top_feats = rank[1] top_feats = rank[1]
# print top_feats.shape
#print top_feats.shape
# extract top features from table with abundances of all features # extract top features from table with abundances of all features
idx = [] idx = []
if len(nf_list) == 1: if len(nf_list) == 1:
...@@ -35,16 +33,15 @@ def extract_feats(datafile, rankedfile, nfeat, outfile): ...@@ -35,16 +33,15 @@ def extract_feats(datafile, rankedfile, nfeat, outfile):
print('###### MISSING %s ######' % top_feats[i]) print('###### MISSING %s ######' % top_feats[i])
# considering samples names in the new table # considering samples names in the new table
sel_feats=[features[i] for i in idx] sel_feats = [features[i] for i in idx]
# write new table # write new table
with open(outfile, 'w') as outw: with open(outfile, 'w') as outw:
writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n') writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
# header # header
writer.writerow(['Samples']+sel_feats) writer.writerow(['Samples'] + sel_feats)
for i in range(0, len(samples)): for i in range(0, len(samples)):
writer.writerow([samples[i]]+data_ab[i,idx].tolist()) writer.writerow([samples[i]] + data_ab[i, idx].tolist())
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -9,35 +9,35 @@ import sys ...@@ -9,35 +9,35 @@ import sys
import numpy as np import numpy as np
__author__ = 'Marco Chierici, Alessandro Zandona' __author__ = 'Marco Chierici, Alessandro Zandona'
__date__ = '15 December 2016' __date__ = '15 December 2016'
#### Extract features from a given dataset #### #### Extract features from a given dataset ####
def extract_feats(datafile, rankedfile, outfile): def extract_feats(datafile, rankedfile, outfile):
#print locals() # print locals()
# table with feats abundances # table with feats abundances
data = np.loadtxt(datafile, delimiter='\t', dtype=str) data = np.loadtxt(datafile, delimiter='\t', dtype=str)
# feats abundances (no names of samples, no header) # feats abundances (no names of samples, no header)
data_ab = data[1:,1:].astype(np.float) data_ab = data[1:, 1:].astype(np.float)
rank = np.loadtxt(rankedfile, delimiter='\t', skiprows=1, dtype=str) rank = np.loadtxt(rankedfile, delimiter='\t', skiprows=1, dtype=str)
# number of features in the list # number of features in the list
nf_list = rank.shape nf_list = rank.shape
if len(nf_list) > 1: if len(nf_list) > 1:
feats = rank[:, 0] feats = rank[:, 0]
top_feats = feats #[0:nfeat] top_feats = feats # [0:nfeat]
else: else:
top_feats = rank top_feats = rank
# extract top features from table with abundances of all features # extract top features from table with abundances of all features
idx = [] idx = []
nfeat = len(top_feats) nfeat = len(top_feats)
for i in range(nfeat): for i in range(nfeat):
if top_feats[i] in data[0,:].tolist(): if top_feats[i] in data[0, :].tolist():
idx.append(data[0,:].tolist().index(top_feats[i])) idx.append(data[0, :].tolist().index(top_feats[i]))
else: else:
print(top_feats[i]) print(top_feats[i])
...@@ -48,8 +48,8 @@ def extract_feats(datafile, rankedfile, outfile): ...@@ -48,8 +48,8 @@ def extract_feats(datafile, rankedfile, outfile):
# write new table # write new table
with open(outfile, 'w') as outw: with open(outfile, 'w') as outw:
writer = csv.writer(outw, delimiter='\t', lineterminator='\n') writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
for i in range(len(sel_feats[:,0])): for i in range(len(sel_feats[:, 0])):
writer.writerow(sel_feats[i,:]) writer.writerow(sel_feats[i, :])
if __name__ == "__main__": if __name__ == "__main__":
......
import numpy as np import numpy as np
import pandas as pd import pandas as pd
def load_data(filename): def load_data(filename):
df = pd.read_csv(filename, sep='\t', header=0, index_col=0) df = pd.read_csv(filename, sep='\t', header=0, index_col=0)
var_names = df.columns.tolist() var_names = df.columns.tolist()
...@@ -8,14 +9,18 @@ def load_data(filename): ...@@ -8,14 +9,18 @@ def load_data(filename):
data = df.values.astype(dtype=np.float) data = df.values.astype(dtype=np.float)
return sample_names, var_names, data return sample_names, var_names, data
def save_split(x, y, sample_names, var_names, basename): def save_split(x, y, sample_names, var_names, basename):
""" """
x, y: output of train_test_split x, y: output of train_test_split
sample_names var_names: lists with samples and feature names (will be the DataFrame row and column names) sample_names var_names: lists with samples and feature names (will be the DataFrame row and column names)
""" """
x_df = pd.DataFrame(x, index=sample_names, columns=var_names) x_df = pd.DataFrame(x, index=sample_names, columns=var_names)
x_df.to_csv(f"{basename}.txt", sep='\t', index=True, header=True, index_label="sampleID") x_df.to_csv(
f"{basename}.txt", sep='\t', index=True, header=True, index_label="sampleID"
)
y_df = pd.DataFrame(y, index=sample_names, columns=['label']) y_df = pd.DataFrame(y, index=sample_names, columns=['label'])
y_df.to_csv(f"{basename}.lab", sep='\t', index=True, header=True, index_label="sampleID") y_df.to_csv(
f"{basename}.lab", sep='\t', index=True, header=True, index_label="sampleID"
)
...@@ -18,21 +18,59 @@ import numpy as np ...@@ -18,21 +18,59 @@ import numpy as np
matplotlib.use('Agg') matplotlib.use('Agg')
parser = argparse.ArgumentParser(description='Find the intersection between feature lists and produce Venn diagrams.') parser = argparse.ArgumentParser(
parser.add_argument('CONFIGFILE1', type=str, help='Training experiment configuration file 1 (with info about number of top discriminant features)') description='Find the intersection between feature lists and produce Venn diagrams.'
parser.add_argument('CONFIGFILE2', type=str, help='Training experiment configuration file 2 (with info about number of top discriminant features)') )
parser.add_argument('OUTLIST', type=str, help='Output file for intersected feature list.') parser.add_argument(
parser.add_argument('OUTFILE', type=str, nargs='?', help='Output file for Venn diagram plot.') 'CONFIGFILE1',
type=str,
parser.add_argument('--title1', type=str, default='List_1', nargs='?', help='Name for first diagram (default: %(default)s)') help='Training experiment configuration file 1 (with info about number of top discriminant features)',
parser.add_argument('--title2', type=str, default='List_2', nargs='?', help='Name for second diagram (default: %(default)s)') )
parser.add_argument('--configFile3', type=str, default='NO', nargs='?', help='Third configuration file - optional (default: %(default)s)') parser.add_argument(
parser.add_argument('--title3', type=str, default='List_3', nargs='?', help='Name for third diagram (default: %(default)s)') 'CONFIGFILE2',
type=str,
__author__ = 'Alessandro Zandona' help='Training experiment configuration file 2 (with info about number of top discriminant features)',
__date__ = '15 December 2016' )
parser.add_argument(
if len(sys.argv)==1: 'OUTLIST', type=str, help='Output file for intersected feature list.'
)
parser.add_argument(
'OUTFILE', type=str, nargs='?', help='Output file for Venn diagram plot.'
)
parser.add_argument(
'--title1',
type=str,
default='List_1',
nargs='?',
help='Name for first diagram (default: %(default)s)',
)
parser.add_argument(
'--title2',
type=str,
default='List_2',
nargs='?',
help='Name for second diagram (default: %(default)s)',
)
parser.add_argument(
'--configFile3',
type=str,
default='NO',
nargs='?',
help='Third configuration file - optional (default: %(default)s)',
)
parser.add_argument(
'--title3',
type=str,
default='List_3',
nargs='?',
help='Name for third diagram (default: %(default)s)',
)
__author__ = 'Alessandro Zandona'
__date__ = '15 December 2016'
if len(sys.argv) == 1:
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
...@@ -77,72 +115,79 @@ feats2 = fl_2[:NFEATS, 1] ...@@ -77,72 +115,79 @@ feats2 = fl_2[:NFEATS, 1]
# Convert lists into sets # Convert lists into sets
feats2_set = set(feats2) feats2_set = set(feats2)
if (configfile3 != 'NO'): if configfile3 != 'NO':
config.read(configfile3) config.read(configfile3)
if not config.has_section('INPUT'): if not config.has_section('INPUT'):
print("%s is not a valid configuration file." % CONFIGFILE2) print("%s is not a valid configuration file." % CONFIGFILE2)
sys.exit(3) sys.exit(3)
RANK = config.get("OUTPUT", "Borda") RANK = config.get("OUTPUT", "Borda")
NFEATS = config.getint("OUTPUT", "N_feats") NFEATS = config.getint("OUTPUT", "N_feats")
# Feature lists # Feature lists
fl_3 = np.loadtxt(RANK, dtype=str, delimiter='\t', skiprows=1) fl_3 = np.loadtxt(RANK, dtype=str, delimiter='\t', skiprows=1)
# Features name # Features name
feats3 = fl_3[:NFEATS, 1] feats3 = fl_3[:NFEATS, 1]
# Convert lists into sets # Convert lists into sets
feats3_set = set(feats3) feats3_set = set(feats3)
# Intersection between lists # Intersection between lists
f1f2 = feats1_set.intersection(feats2_set) f1f2 = feats1_set.intersection(feats2_set)
if (configfile3 != 'NO'): if configfile3 != 'NO':
f1f3 = feats1_set.intersection(feats3_set) f1f3 = feats1_set.intersection(feats3_set)
f2f3 = feats2_set.intersection(feats3_set) f2f3 = feats2_set.intersection(feats3_set)
# associate to each common feature the position in each lists # associate to each common feature the position in each lists
#outFile_f1f2=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title2)) # outFile_f1f2=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title2))
#outw=open(outFile_f1f2, 'w') # outw=open(outFile_f1f2, 'w')
with open(OUTLIST, 'w') as outw: with open(OUTLIST, 'w') as outw:
writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n') writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
writer.writerow(['Feature', 'Position in %s' %title1, 'Postition in %s' %title2]) writer.writerow(['Feature', 'Position in %s' % title1, 'Postition in %s' % title2])
for i in range(len(list(f1f2))): for i in range(len(list(f1f2))):
# current feature in intersection # current feature in intersection
interF = list(f1f2)[i] interF = list(f1f2)[i]
# position of current feature in first list # position of current feature in first list
idx_list1 = np.where(feats1==interF)[0][0] idx_list1 = np.where(feats1 == interF)[0][0]
# position of current feature in second list # position of current feature in second list
idx_list2 = np.where(feats2==interF)[0][0] idx_list2 = np.where(feats2 == interF)[0][0]
writer.writerow([list(f1f2)[i], idx_list1+1, idx_list2+1]) writer.writerow([list(f1f2)[i], idx_list1 + 1, idx_list2 + 1])
if (configfile3 != 'NO'): if configfile3 != 'NO':
# associate to each common feature the position in each lists # associate to each common feature the position in each lists
outFile_f1f3=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title1,title3)) outFile_f1f3 = os.path.join(
with open(outFile_f1f3, 'w') as outw: os.path.dirname(OUTFILE), 'Intersection_%s_%s.txt' % (title1, title3)
writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n') )
writer.writerow(['Feature', 'Position in %s '%title1, 'Postition in %s ' %title3]) with open(outFile_f1f3, 'w') as outw:
for i in range(len(list(f1f3))): writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
# current feature in intersection writer.writerow(
interF = list(f1f3)[i] ['Feature', 'Position in %s ' % title1, 'Postition in %s ' % title3]
# position of current feature in first list )
idx_list1 = np.where(feats1==interF)[0][0] for i in range(len(list(f1f3))):
# position of current feature in second list # current feature in intersection
idx_list3 = np.where(feats3==interF)[0][0] interF = list(f1f3)[i]
writer.writerow([list(f1f3)[i], idx_list1+1, idx_list3+1]) # position of current feature in first list
idx_list1 = np.where(feats1 == interF)[0][0]
# position of current feature in second list
outFile_f2f3=os.path.join(os.path.dirname(OUTFILE),'Intersection_%s_%s.txt' %(title2,title3)) idx_list3 = np.where(feats3 == interF)[0][0]
with open(outFile_f2f3, 'w') as outw: writer.writerow([list(f1f3)[i], idx_list1 + 1, idx_list3 + 1])
writer = csv.writer(outw, delimiter = '\t', lineterminator = '\n')
writer.writerow(['Feature', 'Position in %s '%title2, 'Postition in %s ' %title3]) outFile_f2f3 = os.path.join(
for i in range(len(list(f2f3))): os.path.dirname(OUTFILE), 'Intersection_%s_%s.txt' % (title2, title3)
# current feature in intersection )
interF = list(f2f3)[i] with open(outFile_f2f3, 'w') as outw:
# position of current feature in first list writer = csv.writer(outw, delimiter='\t', lineterminator='\n')
idx_list2 = np.where(feats2==interF)[0][0] writer.writerow(
# position of current feature in second list ['Feature', 'Position in %s ' % title2, 'Postition in %s ' % title3]
idx_list3 = np.where(feats3==interF)[0][0] )
writer.writerow([list(f2f3)[i], idx_list2+1, idx_list3+1]) for i in range(len(list(f2f3))):
# current feature in intersection
interF = list(f2f3)[i]
# position of current feature in first list
idx_list2 = np.where(feats2 == interF)[0][0]
# position of current feature in second list
idx_list3 = np.where(feats3 == interF)[0][0]
writer.writerow([list(f2f3)[i], idx_list2 + 1, idx_list3 + 1])
# # plot Venn diagrams # # plot Venn diagrams
......
# Aggregate rSNF feature-ranking results across combinations of omics layers.
# NOTE(review): the processing loop continues past this view; only the CLI
# setup and sanity checks are documented here.
import argparse
from itertools import combinations
from pathlib import Path
import numpy as np
import pandas as pd
# mlpy provides the Canberra stability indicator used to compare ranked lists
# (project dependency, not on PyPI in a working form — see repo README).
from mlpy import canberra_stability
# Command-line interface: locations of the DAP results and the feature-grid
# parameters of the runs to aggregate.
parser = argparse.ArgumentParser()
parser.add_argument('--resultsdir', type=str, help='Results folder')
parser.add_argument('--dataset', type=str, help='Dataset name')
parser.add_argument('--target', type=str, help='Clinical endpoint')
parser.add_argument(
    '--model', type=str, default='randomForest', help='Model (default: %(default)s)'
)
parser.add_argument(
    '--nf_min', type=int, default=10, help='Min #feat (default: %(default)s)'
)
parser.add_argument(
    '--nf_max', type=int, default=50, help='Max #feat (default: %(default)s)'
)
parser.add_argument(
    '--nf_step',
    type=int,
    default=10,
    help='Increase by these many feat (default: %(default)s)',
)
parser.add_argument('--nf_rsnf', type=int, nargs='+', help='One or more #feat for rSNF')
parser.add_argument('--layers', type=str, nargs='+', help='')
args = parser.parse_args()
# Unpack CLI arguments into module-level constants used below.
RESULTSDIR = args.resultsdir  # top-level results directory
DATASET = args.dataset  # 'tcga_breast'
TARGET = args.target  # 'ER'
MODEL = args.model
# Feature-count grid: NF_MIN..NF_MAX in steps of NF_STEP, plus the explicit
# per-rSNF counts in NF_RSNF.
NF_MIN = args.nf_min
NF_MAX = args.nf_max
NF_STEP = args.nf_step
NF_RSNF = args.nf_rsnf
LAYERS = args.layers
N_LAYERS = len(LAYERS)
MODE = 'rSNF'
# Fail fast if the expected result directories are missing, naming the
# offending path in the assertion message.
assert (
    Path(RESULTSDIR, DATASET).expanduser().exists()
), f"{RESULTSDIR}/{DATASET} not found"
assert (
    Path(RESULTSDIR, f"{DATASET}_SNFdap").expanduser().exists()
), f"{RESULTSDIR}/{DATASET}_SNFdap not found"
for k in range(2, N_LAYERS + 1):
for comb in combinations(LAYERS, k):
layers_concat = '_'.join(comb)
bordas = []