Commit 86ce86c1 authored by Marco Chierici's avatar Marco Chierici
Browse files

SNF extension to multiview datasets (N>2); other code edits

parent fd1ae446
......@@ -18,12 +18,18 @@ data
# Results folders
results*
# Archived scripts
archive
# Runner
runner.sh
# snakemake stuff
.snakemake
# R stuff
.Rhistory
# Output redirection files
out*
err*
......@@ -59,30 +59,27 @@ snf_cv <- function(W, lab, K=5, N=10, clm, infocl){
return(med_NMI=median_NMI_allrep)
}
##### generalized (multiview with N>2 views) version #####
# Tune the SNF hyperparameters (K: number of neighbors, alpha: kernel width)
# by grid search, scoring each (K, alpha) pair with the median NMI over
# repeated cross-validation (snf_cv).
#
# Args:
#   distL:  list of sample-by-sample distance matrices, one per view
#   lab:    vector of true sample labels
#   clm:    clustering method forwarded to snf_cv ('spectral' or 'fastgreedy')
#   infocl: should the number of clusters equal the number of classes?
#
# Returns:
#   list(best_K, best_alpha, tab_median_NMI), where tab_median_NMI is the
#   full K x alpha grid of median NMI scores (named rows/columns).
snf_tuning <- function(distL, lab, clm, infocl) {
  # K grid
  minK <- 10
  maxK <- 30
  stepK <- 1
  K_values <- seq(minK, maxK, stepK)
  # alpha grid
  min_alpha <- 0.3
  max_alpha <- 0.8
  step_alpha <- 0.05
  alpha_values <- seq(min_alpha, max_alpha, step_alpha)
  registerDoParallel(cores=2)
  # for each combination of K and alpha, compute the NMI score median over 10x5-CV
  NMI_tun <- foreach(K=K_values) %dopar% {
    foreach(alpha=alpha_values) %dopar% {
      # one affinity matrix per view, then fuse them
      affinityL <- lapply(distL, function(x) affinityMatrix(x, K=K, alpha))
      W_K <- SNF(affinityL, K=K)
      snf_cv(W_K, lab, clm=clm, infocl=infocl)
    }
  }
  nK <- length(K_values)
  nalpha <- length(alpha_values)
  # K x alpha grid of median NMI scores, with self-describing dimnames
  tab_median_NMI <- matrix(NA_real_, nrow=nK, ncol=nalpha,
                           dimnames=list(paste("K", K_values, sep="_"),
                                         paste("alpha", alpha_values, sep="_")))
  # per-K maximum NMI over all alpha values (preallocated, not a grown list)
  max_nmi_fk <- numeric(nK)
  for (elk in seq_len(nK)) {
    row_nmi <- unlist(NMI_tun[[elk]])
    max_nmi_fk[elk] <- max(row_nmi)
    tab_median_NMI[elk, ] <- row_nmi
  }
  # K corresponding to the overall max NMI (first one on ties)
  best_K_idx <- which.max(max_nmi_fk)[1]
  best_K <- K_values[best_K_idx]
  # for the best K, the alpha maximizing NMI
  # (unlist first: which.max on a bare list is not reliable)
  best_alpha_idx <- which.max(unlist(NMI_tun[[best_K_idx]]))
  best_alpha <- alpha_values[best_alpha_idx]
  return(list(best_K=best_K, best_alpha=best_alpha, tab_median_NMI=tab_median_NMI))
}
\ No newline at end of file
## Original code by Alessandro Zandonà <zandona@fbk.eu> (https://github.com/AleZandona/INF).
## Major edits, code improvements and multiview extension by Marco Chierici <chierici@fbk.eu>.
##
## Requires R >= 3.2.3
suppressPackageStartupMessages(library(argparse))
library("cvTools")
library("doParallel")
library("TunePareto")
library("igraph")
# MC edit
library(cvTools)
library(doParallel)
library(TunePareto)
library(igraph)
library(data.table)
library(lubridate)
# MC edit: functions
load_data <- function(filename)
{
#use fread for performance
......@@ -28,61 +26,49 @@ load_labels <- function(filename)
}
# -------------------
# Command-line interface
parser <- ArgumentParser(description="Perform a Similarity Network Fusion analysis on two or more datasets [samples by features]. NB: Same sample ordering across the datasets is required!")
parser$add_argument("--data", type="character", help="Omic layers datasets [samples by features]", nargs="+")
parser$add_argument("--lab", type="character", help="one column: labels associated to samples; NO HEADER")
parser$add_argument("--outf", type="character", help="Output file")
parser$add_argument("--scriptDir", type="character", help="Directory with R files necessary to SNF")
parser$add_argument("--clust", type="character", choices=c('spectral', 'fastgreedy'), help="Clustering method on fused graph")
parser$add_argument("--clustinfo", action="store_true", help="Should the number of clusters be equal to the number of classes? [default: FALSE]")
parser$add_argument("--threads", type="integer", default=4, help="Number of threads for rSNF [default = %default]")
args <- parser$parse_args()
# Read input parameters
labFile <- args$lab
outFile <- args$outf
clustMethod <- args$clust
clustInfo <- args$clustinfo
threads <- args$threads
print(clustInfo)
# load SNF helper scripts from scriptDir into the global environment
file_names <- as.list(dir(path=args$scriptDir, pattern="*", full.names=TRUE))
lpack <- lapply(file_names, source, .GlobalEnv)
# load data: one data frame per omic layer, plus the labels
dataF <- lapply(args$data, load_data)
lab <- load_labels(labFile)
# all layers (and the label vector) must have the same number of samples
stopifnot(length(unique(c(sapply(dataF, nrow), length(lab)))) == 1)
# all layers must carry identical sample names, in the same order
tmp <- as.data.frame(sapply(dataF, rownames))
stopifnot(length(unique(as.list(tmp))) == 1)
# data normalization (mean 0, std 1), per layer
print(paste(now(), "-- Data normalization"))
dataL <- lapply(dataF, standardNormalization)
# pairwise distance between samples, per layer
print(paste(now(), "-- Pairwise distances"))
distL <- lapply(dataL, function(x) dist2(as.matrix(x), as.matrix(x)))
# hyperparameter tuning (K, alpha) by grid search over the fused graph
t0 <- now()
print(paste(t0, "-- Parameter tuning"))
opt_par <- snf_tuning(distL, lab=lab, clm=clustMethod, infocl=clustInfo)
K_opt <- opt_par[[1]]
alpha_opt <- opt_par[[2]]
t1 <- now()
print(paste0("K_opt = ", K_opt, "; alpha_opt = ", alpha_opt))
# per-layer similarity graphs with the tuned parameters
print(paste(now(), "-- Similarity graphs"))
affinityL <- lapply(distL, function(x) affinityMatrix(x, K=K_opt, alpha_opt))
# fuse the graphs into a single patient-similarity network
print(paste(now(), "-- Graph fusion"))
W <- SNF(affinityL, K=K_opt)
# rescale fused graph to [0, 1] and label its columns with the sample names
W_sc <- W / max(W)
colnames(W_sc) <- rownames(dataF[[1]])
# write fused graph
# NOTE(review): '.txt' is a regex here ('.' matches any char) — works for the
# expected '*.txt' outputs; use fixed=TRUE if arbitrary filenames are allowed
outfused <- gsub('.txt', '_similarity_mat_fused.txt', outFile)
......@@ -144,126 +128,69 @@ SNFNMI_allfeats <- calNMI(group, lab)
# write the NMI score of the all-features fused-graph clustering
outNMI <- gsub('.txt', '_NMI_score.txt', outFile)
write.table(SNFNMI_allfeats, file=outNMI, quote=FALSE, col.names=FALSE, row.names=FALSE)
### For a posteriori feature ranking, build an affinity matrix on one feature at a time ###
t0 <- now()
print(paste(t0, "-- Testing importance of each feature..."))
print(paste("[I] Running on", threads, "threads"))
# For each omic layer, score every feature by how well a clustering built on
# that single feature reproduces the fused-graph clustering (`group`).
SNFNMI_L <- lapply(dataL, function(dataset) {
  # multi-threaded environment; workers log to a sidecar file
  cl <- makeCluster(threads, outfile=gsub('.txt', '_MultiCoreLogging.txt', outFile))
  registerDoParallel(cl)
  # release the workers even if a worker errors out
  on.exit(stopCluster(cl), add=TRUE)
  # `lab` is read inside the %dopar% body, so it must be exported too
  toExport <- c("K_opt", "alpha_opt", "clustMethod", "clustInfo", "group", "file_names", "lab")
  # iterate over column indices explicitly: foreach over a matrix walks it
  # cell-by-cell, not column-by-column
  SNFNMI <- foreach(f=seq_len(ncol(dataset)), .combine=c, .verbose=FALSE, .export=toExport) %dopar% {
    # reload the SNF scripts on each worker
    lpack <- lapply(file_names, source, .GlobalEnv)
    # data already normalized: just pick the f-th column
    onefeat <- as.matrix(dataset[, f])
    # pairwise distance between samples on this single feature
    dist_f <- dist2(onefeat, onefeat)
    # single-feature similarity graph
    W1 <- affinityMatrix(dist_f, K=K_opt, alpha_opt)
    if (clustMethod == "spectral") {
      if (clustInfo) {
        # impose the number of clusters (based on true sample labels)
        nclust <- length(unique(lab))
      } else {
        nclust <- estimateNumberOfClustersGivenGraph(W1)[[1]]
      }
      # spectral clustering of the single-feature graph
      group_fi <- spectralClustering(W1, nclust)
    } else if (clustMethod == "fastgreedy") {
      # rescale the graph so community detection applies
      W1_sc <- W1 / max(W1)
      # graph from similarity matrix
      g <- graph.adjacency(W1_sc, weighted=TRUE, diag=FALSE, mode='undirected')
      # community detection
      m <- cluster_fast_greedy(g)
      if (clustInfo) {
        # impose the number of clusters (based on true sample labels)
        nclust <- length(unique(lab))
        group_fi <- cutree(as.hclust(m), nclust)
      } else {
        group_fi <- m$membership
      }
    }
    # goodness of clustering: the closer to 0, the less similar the
    # single-feature clusters are to the fused-graph clusters
    calNMI(group_fi, group)
  }
  SNFNMI
})
t1 <- now()
print(paste("Done:", time_length(interval(t0, t1), "minute"), "minutes elapsed."))
# associate each feature name with its SNFNMI score
for (i in seq_along(SNFNMI_L)) {
  names(SNFNMI_L[[i]]) <- colnames(dataL[[i]])
}
# combine all layers' results and rank features by decreasing score
SNFNMI_integr <- unlist(SNFNMI_L)
idx_snfnmi <- order(SNFNMI_integr, decreasing=TRUE)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment