Commit 800ce5a2 by Marco Chierici

### Initial commit

parents
.gitignore 0 → 100644
 # Mac stuff
 .DS_Store*

 # Byte-compiled
 __pycache__/
 *.py[cod]

 # Jupyter Notebook
 .ipynb_checkpoints
SNFtools/SNF.R 0 → 100755
 SNF <- function(Wall,K=20,t=20) { ###This function is the main function of our software. The inputs are as follows: # Wall : List of affinity matrices # K : number of neighbors # t : number of iterations for fusion ###The output is a unified similarity graph. It contains both complementary information and common structures from all individual network. ###You can do various applications on this graph, such as clustering(subtyping), classification, prediction. LW = length(Wall) normalize <- function(X) X / rowSums(X) # makes elements other than largest K zero newW <- vector("list", LW) nextW <- vector("list", LW) ###First, normalize different networks to avoid scale problems. for( i in 1: LW){ Wall[[i]] = normalize(Wall[[i]]); Wall[[i]] = (Wall[[i]]+t(Wall[[i]]))/2; } ### Calculate the local transition matrix S for( i in 1: LW){ newW[[i]] = (.dominateset(Wall[[i]],K)) } # perform the diffusion for t iterations for (i in 1:t) { for(j in 1:LW){ sumWJ = matrix(0,dim(Wall[[j]])[1], dim(Wall[[j]])[2]) for(k in 1:LW){ if(k != j) { sumWJ = sumWJ + Wall[[k]] } } nextW[[j]] = newW[[j]] %*% (sumWJ/(LW-1)) %*% t(newW[[j]]); } ###Normalize each new obtained networks. for(j in 1 : LW){ Wall[[j]] = nextW[[j]] + diag(nrow(Wall[[j]])); Wall[[j]] = (Wall[[j]] + t(Wall[[j]]))/2; } } # construct the combined affinity matrix by summing diffused matrices W = matrix(0,nrow(Wall[[1]]), ncol(Wall[[1]])) for(i in 1:LW){ W = W + Wall[[i]] } W = W/LW; W = normalize(W); # ensure affinity matrix is symmetrical W = (W + t(W)+diag(nrow(W))) / 2; return(W) }
 affinityMatrix <- function(Diff,K=20,sigma=0.5) { ###This function constructs similarity networks. N = nrow(Diff) Diff = (Diff + t(Diff)) / 2 diag(Diff) = 0; sortedColumns = as.matrix(t(apply(Diff,2,sort))) finiteMean <- function(x) { mean(x[is.finite(x)]) } means = apply(sortedColumns[,1:K+1],1,finiteMean)+.Machine\$double.eps; avg <- function(x,y) ((x+y)/2) Sig = outer(means,means,avg)/3*2 + Diff/3 + .Machine\$double.eps; Sig[Sig <= .Machine\$double.eps] = .Machine\$double.eps densities = dnorm(Diff,0,sigma*Sig,log = FALSE) W = (densities + t(densities)) / 2 return(W) }
SNFtools/calNMI.R 0 → 100755
 calNMI <- function(x, y) { ##This function calculate the NMI between two clusters. x = as.vector(x); y = as.vector(y); return(max(0, .mutualInformation(x, y)/sqrt(.entropy(x) * .entropy(y)), na.rm=TRUE)) }
 chiDist2 <- function(A,B){ ###This function implements the Chi-Square distance between A and B n = nrow(A) m = nrow(B) d = ncol(A) stopifnot(d == ncol(B)) res = matrix(nrow = n, ncol = m) sqA = A^2 sqB = B^2 twoAB = 2 * (A %*% t(B)) for (a_num in 1:n) { for (b_num in 1:m) { res[a_num, b_num] = sum((sqA[a_num, ] + sqB[b_num, ] - twoAB[a_num, b_num]) / (A[a_num, ] + B[b_num, ])) } } res = res / 2 return(res) }
 concordanceNetworkNMI = function(Wall,C) { # Given a list of affinity matrices, Wall, the number of clusters C, return a matrix containing the NMIs # between cluster assignments made with spectral clustering. #For example, if the input "wall" contains two networks, the output is a 3x3 matrix of NMIs. Note here NMI is a metric to measure the fitness of two clustering results. if NMI = 1, it indicates two clusters are identical; if NMI = 0, it indicates two #clusters are totally differently, independent. # the output looks like this: # fused network network1 newtwork2 #fused network 1 0.7 0.55 #network 1 0.72 1 0.67 #network 2 0.55 0.67 1 #The output above means that, the fused network is more similar to network 1 than network 2. And network 1 and network 2 has a similarity 0.67. #sometimes, if the inputed networks have low similarities, which means they are contradicting each other, fusing them may not be a good idea. # Calculate the fused network # C is the number of clusters. LW = length(Wall) # Get the cluster labels for each of the networks labels = lapply(Wall, function(x) spectralClustering(x, C)) # Calculate the NMI between each pair clusters NMIs = matrix(NA, LW, LW) for (i in 1:LW) { for (j in 1:LW) { NMIs[i, j] = calNMI(labels[[i]], labels[[j]]); } } return(NMIs) }
 displayClusters <- function(W, group) { normalize <- function(X) X / rowSums(X) ind = sort(as.vector(group),index.return=TRUE) ind = ind\$ix diag(W) = 0 W = normalize(W); W = W + t(W); image(1:ncol(W),1:nrow(W),W[ind,ind],col = grey(100:0 / 100),xlab = 'Patients',ylab='Patients'); }
SNFtools/dist2.R 0 → 100755
 dist2 <- function(X,C) { ndata = nrow(X) ncentres = nrow(C) sumsqX = rowSums(X^2) sumsqC = rowSums(C^2) XC = 2 * (X %*% t(C)) res = matrix(rep(sumsqX,times=ncentres),ndata,ncentres) + t(matrix(rep(sumsqC,times=ndata),ncentres,ndata)) - XC res[res < 0] = 0 return(res) }
 estimateNumberOfClustersGivenGraph <- function(W, NUMC=2:5) { # This function estimates the number of clusters given the two huristics # given in the supplementary materials of our nature method paper # W is the similarity graph # NUMC is a vector which contains the possible choices of number of # clusters. # # # K1 is the estimated best number of clusters according to eigen-gaps # K12 is the estimated SECOND best number of clusters according to eigen-gaps # # K2 is the estimated number of clusters according to rotation cost # K22 is the estimated SECOND number of clusters according to rotation cost # # an example would be [K1, K2, K12,K22] = Estimate_Number_of_Clusters_given_graph(W, # [2:5]); # # Note that this function can only give an estimate of the number of # clusters. How to determine the "OPTIMAL" number of clusters, is still an # open question so far. if (min(NUMC) == 1) { warning('Note that we always assume there are more than one cluster.'); NUMC = NUMC[NUMC > 1] } W = (W + t(W))/2 diag(W) = 0 if (length(NUMC) > 0) { degs = rowSums(W) # compute unnormalized Laplacian degs[degs == 0] = .Machine\$double.eps D = diag(degs) L = D - W Di = diag(1 / sqrt(degs)) L = Di %*% L %*% Di # compute the eigenvectors corresponding to the k smallest # eigs\$valuess eigs = eigen(L) eigs_order = sort(eigs\$values, index.return=T)\$ix eigs\$values = eigs\$values[eigs_order] eigs\$vectors = eigs\$vectors[, eigs_order] eigengap = abs(diff(eigs\$values)) eigengap = eigengap * (1 - eigs\$values[1:length(eigs\$values) - 1] ) / (1 - eigs\$values[2:length(eigs\$values)]) quality = list() for (c_index in 1:length(NUMC)) { ck = NUMC[c_index] UU = eigs\$vectors[, 1:ck] # MC edit: # check for all-0 rows tmpidx <- rowSums(UU==0) == ncol(UU) # substitute with small values UU[tmpidx,] <- .Machine\$double.eps # end MC edit EigenvectorsDiscrete <- .discretisation(UU)[[1]] EigenVectors = EigenvectorsDiscrete^2 # MATLAB: sort(EigenVectors,2, 'descend'); temp1 <- EigenVectors[do.call(order, 
lapply(1:ncol(EigenVectors), function(i) EigenVectors[, i])), ] temp1 <- t(apply(temp1, 1, sort, TRUE)) quality[[c_index]] = (1 - eigs\$values[ck + 1]) / (1 - eigs\$values[ck]) * sum( sum( diag(1 / (temp1[, 1] + .Machine\$double.eps) ) %*% temp1[, 1:max(2, ck-1)] )) } t1 <- sort(eigengap[NUMC], decreasing=TRUE, index.return=T)\$ix K1 = NUMC[t1[1]] K12 = NUMC[t1[2]] t2 <- sort(unlist(quality), index.return=TRUE)\$ix K2 <- NUMC[t2[1]] K22 <- NUMC[t2[2]] } return (list(K1, K12, K2, K22)) }
 groupPredict <- function(train,test,groups,K=20,alpha=0.5,t=20,method=1){ ###This function is used to predict the subtype of new patients. #train and test have the same number of view and the same number of columns # group is the label for the train data # K, alpha, t are the prameters for SNF. #K is the number of neighbors #alpha is the hyperparameter used in constructing similarity network # t is the number of iterations #method is a indicator of which method to use to predict the label. method = 0 means to use local and global consistency; method = 1 means to use label propagation. Wi= vector("list", length=length(train)); for (i in 1:length(train)){ view= standardNormalization(rbind(train[[i]],test[[i]])); Dist1 = dist2(view, view); Wi[[i]] = affinityMatrix(Dist1, K, alpha); } W = SNF(Wi,K,t); Y0=matrix(0,nrow(view), max(groups)); for (i in 1:length(groups)) Y0[i,groups[i]]=1; Y=.csPrediction(W,Y0,method); newgroups=rep(0,nrow(view)); for (i in 1:nrow(Y)) newgroups[i]=which(Y[i,]==max(Y[i,])); return (newgroups); }
 .csPrediction <- function(W,Y0,method){ ###This function implements the label propagation to predict the label(subtype) for new patients. ### note method is an indicator of which semi-supervised method to use # method == 0 indicates to use the local and global consistency method # method >0 indicates to use label propagation method. alpha=0.9; P= W/rowSums(W) if(method==0){ Y= (1-alpha)* solve( diag(dim(P)[1])- alpha*P)%*%Y0; } else { NLabel=which(rowSums(Y0)==0)[1]-1; Y=Y0; for (i in 1:1000){ Y=P%*%Y; Y[1:NLabel,]=Y0[1:NLabel,]; } } return(Y); } .discretisation <- function(eigenVectors) { normalize <- function(x) x / sqrt(sum(x^2)) eigenVectors = t(apply(eigenVectors,1,normalize)) n = nrow(eigenVectors) k = ncol(eigenVectors) R = matrix(0,k,k) R[,1] = t(eigenVectors[round(n/2),]) mini <- function(x) { i = which(x == min(x)) return(i[1]) } c = matrix(0,n,1) for (j in 2:k) { c = c + abs(eigenVectors %*% matrix(R[,j-1],k,1)) i = mini(c) R[,j] = t(eigenVectors[i,]) } lastObjectiveValue = 0 for (i in 1:20) { eigenDiscrete = .discretisationEigenVectorData(eigenVectors %*% R) svde = svd(t(eigenDiscrete) %*% eigenVectors) U = svde[['u']] V = svde[['v']] S = svde[['d']] NcutValue = 2 * (n-sum(S)) if(abs(NcutValue - lastObjectiveValue) < .Machine\$double.eps) break lastObjectiveValue = NcutValue R = V %*% t(U) } return(list(discrete=eigenDiscrete,continuous =eigenVectors)) } .discretisationEigenVectorData <- function(eigenVector) { Y = matrix(0,nrow(eigenVector),ncol(eigenVector)) maxi <- function(x) { i = which(x == max(x)) return(i[1]) } j = apply(eigenVector,1,maxi) Y[cbind(1:nrow(eigenVector),j)] = 1 return(Y) } .dominateset <- function(xx,KK=20) { ###This function outputs the top KK neighbors. 
zero <- function(x) { s = sort(x, index.return=TRUE) x[s\$ix[1:(length(x)-KK)]] = 0 return(x) } normalize <- function(X) X / rowSums(X) A = matrix(0,nrow(xx),ncol(xx)); for(i in 1:nrow(xx)){ A[i,] = zero(xx[i,]); } return(normalize(A)) } # Calculate the mutual information between vectors x and y. .mutualInformation <- function(x, y) { classx <- unique(x) classy <- unique(y) nx <- length(x) ncx <- length(classx) ncy <- length(classy) probxy <- matrix(NA, ncx, ncy) for (i in 1:ncx) { for (j in 1:ncy) { probxy[i, j] <- sum((x == classx[i]) & (y == classy[j])) / nx } } probx <- matrix(rowSums(probxy), ncx, ncy) proby <- matrix(colSums(probxy), ncx, ncy, byrow=TRUE) result <- sum(probxy * log(probxy / (probx * proby), 2), na.rm=TRUE) return(result) } # Calculate the entropy of vector x. .entropy <- function(x) { class <- unique(x) nx <- length(x) nc <- length(class) prob <- rep.int(NA, nc) for (i in 1:nc) { prob[i] <- sum(x == class[i])/nx } result <- -sum(prob * log(prob, 2)) return(result) } .repmat = function(X,m,n){ ##R equivalent of repmat (matlab) if (is.null(dim(X))) { mx = length(X) nx = 1 } else { mx = dim(X)[1] nx = dim(X)[2] } matrix(t(matrix(X,mx,nx*n)),mx*m,nx*n,byrow=T) } \ No newline at end of file
 ## This code is written by Alessandro Zandona' . ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . # Requires R >= 3.2.3 snf_cv <- function(W, lab, K=5, N=10, clm, infocl){ median_NMI <- list() nSamp <- dim(W)[1] SNFNMI_all <- list() set.seed(N) cv_folds <- generateCVRuns(lab, ntimes = N, nfold = K, stratified = TRUE) for (nfold in 1:N){ #cv_folds <- cvFolds(nSamp, K=5, type="random") cv_folds_current <- cv_folds[[nfold]] SNFNMI_K <- list() for(l in 1:K){ # subset of X by indexes selected with k-fold CV idx_k <- cv_folds_current[[l]] W_k <- W[idx_k,idx_k] # extract the respective labels lab_k <- lab[idx_k] if (clm=="spectral"){ if (infocl){ nclust <- length(unique(lab)) # predict clusters for subset k of X group_k <- spectralClustering(W_k, nclust) }else{ nclust <- estimateNumberOfClustersGivenGraph(W_k)[[1]] group_k <- spectralClustering(W_k, nclust) } } else if (clm=="fastgreedy") { W_sc_k <- W_k/max(W_k) g_k <- graph.adjacency(W_sc_k, weighted = TRUE, diag=FALSE, mode='undirected') m_k <- cluster_fast_greedy(g_k) if (infocl){ nclust <- length(unique(lab)) group_k <- cutree(as.hclust(m_k), nclust) } else { group_k <- m_k\$membership } } # NMI score SNFNMI_K[l] <- calNMI(group_k, lab_k) } # median of NMI score for current CV repetition median_NMI[nfold] <- median(unlist(SNFNMI_K)) } # median of NMI score over all CV repetitions median_NMI_allrep <- median(unlist(median_NMI)) return(med_NMI=median_NMI_allrep) } 
snf_tuning <- function(dist1, dist2, lab, clm, infocl) {
  # Grid-search over the SNF hyperparameters K (number of neighbours) and
  # alpha (kernel bandwidth), scoring each pair by the median
  # cross-validated NMI from snf_cv.
  #
  # Args:
  #   dist1, dist2: pairwise distance matrices of the two datasets.
  #   lab:          true class labels.
  #   clm, infocl:  forwarded to snf_cv.
  #
  # Returns: list(best_K, best_alpha, tab_median_NMI) where tab_median_NMI
  # holds the NMI for every (K, alpha) combination.
  dist_tab1 <- dist1
  dist_tab2 <- dist2
  # Hyperparameter grids (single source of truth, also used for dimnames).
  K_values <- seq(10, 30, 1)
  alpha_values <- seq(0.3, 0.8, 0.05)
  registerDoParallel(cores = 2)
  # For each (K, alpha) combination, compute the NMI score median over
  # repeated K-fold CV.
  NMI_tun <- foreach(K = K_values) %dopar% {
    foreach(alpha = alpha_values) %dopar% {
      W1_tun <- affinityMatrix(dist_tab1, K = K, alpha)
      W2_tun <- affinityMatrix(dist_tab2, K = K, alpha)
      W_K <- SNF(list(W1_tun, W2_tun), K = K)
      snf_cv(W_K, lab, clm = clm, infocl = infocl)
    }
  }
  nK <- length(K_values)
  nalpha <- length(alpha_values)
  # Result table labelled K_<k> x alpha_<a>.
  tab_median_NMI <- matrix(, nrow = nK, ncol = nalpha,
                           dimnames = list(paste('K', K_values, sep = '_'),
                                           paste('alpha', alpha_values, sep = '_')))
  # For each fixed K, the max NMI over all alpha values.
  max_nmi_fk <- numeric(nK)
  for (elk in seq_len(nK)) {
    max_nmi_fk[elk] <- max(unlist(NMI_tun[[elk]]))
    tab_median_NMI[elk, ] <- unlist(NMI_tun[[elk]])
  }
  # K giving the overall maximum NMI.
  best_K_idx <- which.max(max_nmi_fk)[1]
  best_K <- K_values[best_K_idx]
  # Alpha giving the max NMI at that K. (unlist fixes the original call of
  # which.max on a list, which is an error.)
  best_alpha_idx <- which.max(unlist(NMI_tun[[best_K_idx]]))
  best_alpha <- alpha_values[best_alpha_idx]
  return(list(best_K = best_K, best_alpha = best_alpha,
              tab_median_NMI = tab_median_NMI))
}
 spectralClustering <- function(affinity, K, type=3) { ###This function implements the famous spectral clustering algorithms. There are three variants. The default one is the third type. ###THe inputs are as follows: #affinity: the similarity matrix; #K: the number of clusters # type: indicators of variants of spectral clustering d = rowSums(affinity) d[d == 0] = .Machine\$double.eps D = diag(d)