Commit 800ce5a2 authored by Marco Chierici

Initial commit
# Mac stuff
.DS_Store*
# Byte-compiled
__pycache__/
*.py[cod]
# Jupyter Notebook
.ipynb_checkpoints
SNF <- function(Wall, K=20, t=20) {
  ### This function is the main function of our software. The inputs are as follows:
  # Wall : list of affinity matrices
  # K : number of neighbors
  # t : number of iterations for the fusion
  ### The output is a unified similarity graph. It contains both complementary
  ### information and common structures from all individual networks.
  ### You can run various applications on this graph, such as clustering
  ### (subtyping), classification, and prediction.
  LW = length(Wall)
  normalize <- function(X) X / rowSums(X)
  newW <- vector("list", LW)
  nextW <- vector("list", LW)
  ### First, normalize the different networks to avoid scale problems.
  for (i in 1:LW) {
    Wall[[i]] = normalize(Wall[[i]])
    Wall[[i]] = (Wall[[i]] + t(Wall[[i]])) / 2
  }
  ### Calculate the local transition matrix S: .dominateset() zeroes all
  ### entries other than the largest K in each row.
  for (i in 1:LW) {
    newW[[i]] = .dominateset(Wall[[i]], K)
  }
  # Perform the diffusion for t iterations: each network is updated with the
  # average of the other networks, projected through its own local
  # transition matrix.
  for (i in 1:t) {
    for (j in 1:LW) {
      sumWJ = matrix(0, dim(Wall[[j]])[1], dim(Wall[[j]])[2])
      for (k in 1:LW) {
        if (k != j) {
          sumWJ = sumWJ + Wall[[k]]
        }
      }
      nextW[[j]] = newW[[j]] %*% (sumWJ / (LW - 1)) %*% t(newW[[j]])
    }
    ### Normalize each newly obtained network.
    for (j in 1:LW) {
      Wall[[j]] = nextW[[j]] + diag(nrow(Wall[[j]]))
      Wall[[j]] = (Wall[[j]] + t(Wall[[j]])) / 2
    }
  }
  # Construct the combined affinity matrix by averaging the diffused matrices.
  W = matrix(0, nrow(Wall[[1]]), ncol(Wall[[1]]))
  for (i in 1:LW) {
    W = W + Wall[[i]]
  }
  W = W / LW
  W = normalize(W)
  # Ensure the affinity matrix is symmetric.
  W = (W + t(W) + diag(nrow(W))) / 2
  return(W)
}
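## A minimal usage sketch (not part of the original package): fuse two
## hypothetical data views of the same 100 patients. dist2(), affinityMatrix()
## and spectralClustering() are defined elsewhere in this file. The block is
## wrapped in `if (FALSE)` so that sourcing the file does not execute it.
if (FALSE) {
  set.seed(1)
  X1 <- matrix(rnorm(100 * 30), nrow=100)  # view 1: 100 patients x 30 features
  X2 <- matrix(rnorm(100 * 50), nrow=100)  # view 2: 100 patients x 50 features
  W1 <- affinityMatrix(dist2(X1, X1), K=20, sigma=0.5)
  W2 <- affinityMatrix(dist2(X2, X2), K=20, sigma=0.5)
  W <- SNF(list(W1, W2), K=20, t=20)       # fused 100 x 100 similarity graph
  groups <- spectralClustering(W, 3)       # e.g. cluster into 3 subtypes
}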
affinityMatrix <- function(Diff, K=20, sigma=0.5) {
  ### This function constructs a similarity network from a distance matrix.
  # Diff : pairwise distance matrix
  # K : number of neighbors used to estimate the local scale
  # sigma : hyperparameter controlling the width of the scaled Gaussian kernel
  N = nrow(Diff)
  Diff = (Diff + t(Diff)) / 2
  diag(Diff) = 0
  sortedColumns = as.matrix(t(apply(Diff, 2, sort)))
  finiteMean <- function(x) { mean(x[is.finite(x)]) }
  # mean distance to the K nearest neighbors; columns 2:(K+1) skip the zero
  # self-distance in column 1
  means = apply(sortedColumns[, 2:(K + 1)], 1, finiteMean) + .Machine$double.eps
  avg <- function(x, y) ((x + y) / 2)
  # local scale: a weighted mix of the two neighborhood sizes and the distance
  Sig = outer(means, means, avg) / 3 * 2 + Diff / 3 + .Machine$double.eps
  Sig[Sig <= .Machine$double.eps] = .Machine$double.eps
  densities = dnorm(Diff, 0, sigma * Sig, log=FALSE)
  W = (densities + t(densities)) / 2
  return(W)
}
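## For reference (added note): with the local scale Sig[i, j] estimated above,
## the affinity is the Gaussian density
##   dnorm(d, 0, sigma * Sig) = exp(-d^2 / (2 * (sigma * Sig)^2)) / (sqrt(2 * pi) * sigma * Sig)
## so smaller distances and denser neighborhoods yield larger affinities.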
calNMI <- function(x, y) {
  ## This function calculates the normalized mutual information (NMI)
  ## between two clusterings x and y.
  x = as.vector(x)
  y = as.vector(y)
  return(max(0, .mutualInformation(x, y) / sqrt(.entropy(x) * .entropy(y)), na.rm=TRUE))
}
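## Quick sanity check (illustrative sketch, not in the original source):
## identical labelings give NMI = 1, unrelated ones give values near 0.
if (FALSE) {
  calNMI(c(1, 1, 2, 2), c(1, 1, 2, 2))  # 1
  calNMI(c(1, 1, 2, 2), c(2, 2, 1, 1))  # 1 (NMI ignores label names)
  calNMI(c(1, 2, 1, 2), c(1, 1, 2, 2))  # 0
}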
chiDist2 <- function(A, B) {
  ### This function implements the Chi-square distance between the rows of A
  ### and the rows of B (assumed non-negative, e.g. histograms):
  ###   d(a, b) = (1/2) * sum_i (a_i - b_i)^2 / (a_i + b_i)
  n = nrow(A)
  m = nrow(B)
  d = ncol(A)
  stopifnot(d == ncol(B))
  res = matrix(nrow=n, ncol=m)
  for (a_num in 1:n) {
    for (b_num in 1:m) {
      # the squared difference must be taken per coordinate before summing
      res[a_num, b_num] = sum((A[a_num, ] - B[b_num, ])^2 / (A[a_num, ] + B[b_num, ]))
    }
  }
  res = res / 2
  return(res)
}
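## Illustrative check with made-up histograms: identical rows are at distance
## zero, and the result is an n x m matrix of pairwise distances.
if (FALSE) {
  H <- rbind(c(0.2, 0.3, 0.5),
             c(0.5, 0.3, 0.2))
  chiDist2(H, H)  # 2 x 2 matrix with zeros on the diagonal
}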
concordanceNetworkNMI = function(Wall, C) {
  # Given a list of affinity matrices Wall and the number of clusters C,
  # return a matrix containing the NMIs between the cluster assignments
  # obtained with spectral clustering on each network.
  # For example, if the input Wall contains the fused network plus two
  # individual networks, the output is a 3x3 matrix of NMIs. NMI measures the
  # agreement of two clustering results: NMI = 1 indicates that two
  # clusterings are identical; NMI = 0 indicates that they are totally
  # different, i.e. independent.
  # The output looks like this:
  #                 fused network   network1   network2
  # fused network       1             0.7        0.55
  # network1            0.7           1          0.67
  # network2            0.55          0.67       1
  # The output above means that the fused network is more similar to network1
  # than to network2, and that network1 and network2 have a similarity of 0.67.
  # Sometimes, if the input networks have low pairwise similarities, i.e. they
  # contradict each other, fusing them may not be a good idea.
  LW = length(Wall)
  # Get the cluster labels for each of the networks
  labels = lapply(Wall, function(x) spectralClustering(x, C))
  # Calculate the NMI between each pair of clusterings
  NMIs = matrix(NA, LW, LW)
  for (i in 1:LW) {
    for (j in 1:LW) {
      NMIs[i, j] = calNMI(labels[[i]], labels[[j]])
    }
  }
  return(NMIs)
}
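## Usage sketch, reusing W, W1 and W2 from the SNF sketch above: if the
## off-diagonal NMIs are low, the individual networks disagree and fusing
## them may be questionable.
if (FALSE) {
  concordanceNetworkNMI(list(W, W1, W2), C=3)  # 3 x 3 matrix of NMIs
}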
displayClusters <- function(W, group) {
  # Plot the similarity matrix W as a grayscale heatmap, with patients
  # reordered by their cluster assignment in `group`.
  normalize <- function(X) X / rowSums(X)
  ind = sort(as.vector(group), index.return=TRUE)
  ind = ind$ix
  diag(W) = 0
  W = normalize(W)
  W = W + t(W)
  image(1:ncol(W), 1:nrow(W), W[ind, ind], col=grey(100:0 / 100),
        xlab='Patients', ylab='Patients')
}
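## Usage sketch: visualize the block structure of the fused network under a
## 3-cluster assignment.
if (FALSE) {
  displayClusters(W, spectralClustering(W, 3))
}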
dist2 <- function(X, C) {
  # Compute the matrix of squared Euclidean distances between the rows of X
  # and the rows of C, using ||x - c||^2 = ||x||^2 + ||c||^2 - 2 <x, c>.
  ndata = nrow(X)
  ncentres = nrow(C)
  sumsqX = rowSums(X^2)
  sumsqC = rowSums(C^2)
  XC = 2 * (X %*% t(C))
  res = matrix(rep(sumsqX, times=ncentres), ndata, ncentres) +
    t(matrix(rep(sumsqC, times=ndata), ncentres, ndata)) - XC
  # clip small negative values due to floating-point round-off
  res[res < 0] = 0
  return(res)
}
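## Quick check (illustrative): dist2() returns squared distances, not
## Euclidean distances.
if (FALSE) {
  dist2(matrix(c(0, 0), 1), matrix(c(3, 4), 1))  # 25, i.e. 5^2
}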
estimateNumberOfClustersGivenGraph <- function(W, NUMC=2:5) {
  # This function estimates the number of clusters using the two heuristics
  # given in the supplementary materials of our Nature Methods paper.
  # W is the similarity graph
  # NUMC is a vector containing the candidate numbers of clusters.
  #
  # K1 is the estimated best number of clusters according to eigen-gaps
  # K12 is the estimated SECOND best number of clusters according to eigen-gaps
  #
  # K2 is the estimated best number of clusters according to rotation cost
  # K22 is the estimated SECOND best number of clusters according to rotation cost
  #
  # An example call is:
  #   estimateNumberOfClustersGivenGraph(W, 2:5)
  #
  # Note that this function can only give an estimate of the number of
  # clusters; how to determine the "optimal" number of clusters is still an
  # open question.
  if (min(NUMC) == 1) {
    warning('Note that we always assume there is more than one cluster.')
    NUMC = NUMC[NUMC > 1]
  }
  W = (W + t(W)) / 2
  diag(W) = 0
  if (length(NUMC) > 0) {
    degs = rowSums(W)
    # compute the normalized Laplacian L = D^(-1/2) (D - W) D^(-1/2)
    degs[degs == 0] = .Machine$double.eps
    D = diag(degs)
    L = D - W
    Di = diag(1 / sqrt(degs))
    L = Di %*% L %*% Di
    # compute the eigenvectors corresponding to the smallest eigenvalues
    eigs = eigen(L)
    eigs_order = sort(eigs$values, index.return=TRUE)$ix
    eigs$values = eigs$values[eigs_order]
    eigs$vectors = eigs$vectors[, eigs_order]
    eigengap = abs(diff(eigs$values))
    eigengap = eigengap * (1 - eigs$values[1:(length(eigs$values) - 1)]) /
      (1 - eigs$values[2:length(eigs$values)])
    quality = list()
    for (c_index in 1:length(NUMC)) {
      ck = NUMC[c_index]
      UU = eigs$vectors[, 1:ck]
      # MC edit: check for all-zero rows and substitute small values, so that
      # the row normalization inside .discretisation() does not divide by zero
      tmpidx <- rowSums(UU == 0) == ncol(UU)
      UU[tmpidx, ] <- .Machine$double.eps
      # end MC edit
      EigenvectorsDiscrete <- .discretisation(UU)[[1]]
      EigenVectors = EigenvectorsDiscrete^2
      # MATLAB equivalent: sort(EigenVectors, 2, 'descend')
      temp1 <- EigenVectors[do.call(order, lapply(1:ncol(EigenVectors), function(i) EigenVectors[, i])), ]
      temp1 <- t(apply(temp1, 1, sort, TRUE))
      quality[[c_index]] = (1 - eigs$values[ck + 1]) / (1 - eigs$values[ck]) *
        sum(diag(1 / (temp1[, 1] + .Machine$double.eps)) %*% temp1[, 1:max(2, ck - 1)])
    }
    # best and second-best candidates by eigen-gap (largest gaps)
    t1 <- sort(eigengap[NUMC], decreasing=TRUE, index.return=TRUE)$ix
    K1 = NUMC[t1[1]]
    K12 = NUMC[t1[2]]
    # best and second-best candidates by rotation cost (smallest costs)
    t2 <- sort(unlist(quality), index.return=TRUE)$ix
    K2 <- NUMC[t2[1]]
    K22 <- NUMC[t2[2]]
  }
  return(list(K1=K1, K12=K12, K2=K2, K22=K22))
}
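## Usage sketch: estimate the number of clusters on the fused network W from
## the SNF sketch above, then cluster with the eigen-gap estimate.
if (FALSE) {
  est <- estimateNumberOfClustersGivenGraph(W, NUMC=2:8)
  C <- est$K1  # best number of clusters according to the eigen-gap heuristic
  groups <- spectralClustering(W, C)
}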
groupPredict <- function(train, test, groups, K=20, alpha=0.5, t=20, method=1) {
  ### This function is used to predict the subtype of new patients.
  # train and test are lists with the same number of views and, within each
  # view, the same number of columns (features).
  # groups are the labels of the training data.
  # K, alpha and t are the parameters for SNF:
  #   K is the number of neighbors
  #   alpha is the hyperparameter used in constructing the similarity networks
  #   t is the number of fusion iterations
  # method is an indicator of which method to use to predict the labels:
  # method = 0 uses local and global consistency; method = 1 uses label
  # propagation.
  Wi = vector("list", length=length(train))
  for (i in 1:length(train)) {
    # stack train and test samples, then build the affinity matrix per view
    view = standardNormalization(rbind(train[[i]], test[[i]]))
    Dist1 = dist2(view, view)
    Wi[[i]] = affinityMatrix(Dist1, K, alpha)
  }
  W = SNF(Wi, K, t)
  # one-hot label matrix: train rows get their group, test rows stay zero
  Y0 = matrix(0, nrow(view), max(groups))
  for (i in 1:length(groups)) Y0[i, groups[i]] = 1
  Y = .csPrediction(W, Y0, method)
  newgroups = rep(0, nrow(view))
  for (i in 1:nrow(Y)) newgroups[i] = which.max(Y[i, ])
  return(newgroups)
}
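## Usage sketch, continuing the hypothetical views X1 and X2 from the SNF
## sketch above: train on the first 80 patients, predict the remaining 20.
if (FALSE) {
  train <- list(X1[1:80, ], X2[1:80, ])
  test <- list(X1[81:100, ], X2[81:100, ])
  labels <- sample(1:3, 80, replace=TRUE)  # made-up training labels
  pred <- groupPredict(train, test, labels, K=20, alpha=0.5, t=20, method=1)
  pred[81:100]  # predicted subtypes of the 20 test patients
}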
.csPrediction <- function(W, Y0, method) {
  ### This function implements semi-supervised prediction of the labels
  ### (subtypes) for new patients.
  ### method is an indicator of which semi-supervised method to use:
  # method == 0 uses the local and global consistency method, i.e. the
  #             closed form Y = (1 - alpha) * (I - alpha * P)^(-1) %*% Y0
  # method > 0 uses the label propagation method, iterating Y <- P %*% Y
  #            while clamping the labeled rows to their known labels
  alpha = 0.9
  P = W / rowSums(W)
  if (method == 0) {
    Y = (1 - alpha) * solve(diag(dim(P)[1]) - alpha * P) %*% Y0
  } else {
    # labeled rows are assumed to come first; the first all-zero row of Y0
    # marks the beginning of the unlabeled block
    NLabel = which(rowSums(Y0) == 0)[1] - 1
    Y = Y0
    for (i in 1:1000) {
      Y = P %*% Y
      Y[1:NLabel, ] = Y0[1:NLabel, ]
    }
  }
  return(Y)
}
.discretisation <- function(eigenVectors) {
  # Find the discrete partition matrix closest to the continuous spectral
  # embedding, in the style of Yu & Shi's multiclass spectral clustering
  # discretisation (ICCV 2003).
  normalize <- function(x) x / sqrt(sum(x^2))
  eigenVectors = t(apply(eigenVectors, 1, normalize))
  n = nrow(eigenVectors)
  k = ncol(eigenVectors)
  # initialize the rotation R from k maximally dissimilar rows
  R = matrix(0, k, k)
  R[, 1] = t(eigenVectors[round(n / 2), ])
  mini <- function(x) {
    i = which(x == min(x))
    return(i[1])
  }
  c = matrix(0, n, 1)
  for (j in 2:k) {
    c = c + abs(eigenVectors %*% matrix(R[, j - 1], k, 1))
    i = mini(c)
    R[, j] = t(eigenVectors[i, ])
  }
  # alternate between discretizing the rotated embedding and refitting the
  # rotation via SVD, until the Ncut objective stops improving
  lastObjectiveValue = 0
  for (i in 1:20) {
    eigenDiscrete = .discretisationEigenVectorData(eigenVectors %*% R)
    svde = svd(t(eigenDiscrete) %*% eigenVectors)
    U = svde[['u']]
    V = svde[['v']]
    S = svde[['d']]
    NcutValue = 2 * (n - sum(S))
    if (abs(NcutValue - lastObjectiveValue) < .Machine$double.eps)
      break
    lastObjectiveValue = NcutValue
    R = V %*% t(U)
  }
  return(list(discrete=eigenDiscrete, continuous=eigenVectors))
}
.discretisationEigenVectorData <- function(eigenVector) {
  # Discretize a continuous embedding: each row gets a 1 in the column of its
  # largest entry and 0 elsewhere.
  Y = matrix(0, nrow(eigenVector), ncol(eigenVector))
  maxi <- function(x) {
    i = which(x == max(x))
    return(i[1])
  }
  j = apply(eigenVector, 1, maxi)
  Y[cbind(1:nrow(eigenVector), j)] = 1
  return(Y)
}
.dominateset <- function(xx, KK=20) {
  ### This function keeps, in each row, only the KK largest entries (the top
  ### KK neighbors), zeroes out the rest, and renormalizes the rows.
  zero <- function(x) {
    s = sort(x, index.return=TRUE)
    x[s$ix[1:(length(x) - KK)]] = 0
    return(x)
  }
  normalize <- function(X) X / rowSums(X)
  A = matrix(0, nrow(xx), ncol(xx))
  for (i in 1:nrow(xx)) {
    A[i, ] = zero(xx[i, ])
  }
  return(normalize(A))
}
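## Illustrative check: with KK = 2, only the two largest entries per row
## survive, and each row is renormalized to sum to 1.
if (FALSE) {
  .dominateset(matrix(c(0.1, 0.5, 0.4,
                        0.6, 0.1, 0.3,
                        0.2, 0.2, 0.6), 3, byrow=TRUE), KK=2)
}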
# Calculate the mutual information (in bits) between vectors x and y:
#   I(x, y) = sum_{i,j} p(i, j) * log2( p(i, j) / (p(i) * p(j)) )
.mutualInformation <- function(x, y) {
  classx <- unique(x)
  classy <- unique(y)
  nx <- length(x)
  ncx <- length(classx)
  ncy <- length(classy)
  # joint probability table of the two labelings
  probxy <- matrix(NA, ncx, ncy)
  for (i in 1:ncx) {
    for (j in 1:ncy) {
      probxy[i, j] <- sum((x == classx[i]) & (y == classy[j])) / nx
    }
  }
  # marginals, expanded to the shape of the joint table
  probx <- matrix(rowSums(probxy), ncx, ncy)
  proby <- matrix(colSums(probxy), ncx, ncy, byrow=TRUE)
  result <- sum(probxy * log(probxy / (probx * proby), 2), na.rm=TRUE)
  return(result)
}
# Calculate the entropy (in bits) of vector x:
#   H(x) = -sum_i p(i) * log2(p(i))
.entropy <- function(x) {
  class <- unique(x)
  nx <- length(x)
  nc <- length(class)
  prob <- rep.int(NA, nc)
  for (i in 1:nc) {
    prob[i] <- sum(x == class[i]) / nx
  }
  result <- -sum(prob * log(prob, 2))
  return(result)
}
.repmat = function(X, m, n) {
  ## R equivalent of repmat (MATLAB)
  if (is.null(dim(X))) {
    mx = length(X)
    nx = 1
  } else {
    mx = dim(X)[1]
    nx = dim(X)[2]
  }
  matrix(t(matrix(X, mx, nx * n)), mx * m, nx * n, byrow=TRUE)
}
## This code is written by Alessandro Zandona' <zandona@fbk.eu>.
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
# Requires R >= 3.2.3
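# Packages assumed to be loaded by the caller (they are not attached here):
# TunePareto (generateCVRuns), igraph (graph.adjacency, cluster_fast_greedy),
# and foreach with doParallel (%dopar%, registerDoParallel).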
snf_cv <- function(W, lab, K=5, N=10, clm, infocl){
  # Estimate clustering stability by cross-validation: repeatedly subset the
  # fused network W, recluster each subset, and score it against the labels
  # lab with NMI.
  # K : number of CV folds
  # N : number of CV repetitions
  # clm : clustering method, either "spectral" or "fastgreedy"
  # infocl : if TRUE, use the number of classes in lab as the number of
  #          clusters; if FALSE, estimate it from the data
  median_NMI <- list()
  nSamp <- dim(W)[1]
  set.seed(N)
  cv_folds <- generateCVRuns(lab, ntimes=N, nfold=K, stratified=TRUE)
  for (nfold in 1:N) {
    cv_folds_current <- cv_folds[[nfold]]
    SNFNMI_K <- list()
    for (l in 1:K) {
      # subset of W by the indexes selected for fold l
      idx_k <- cv_folds_current[[l]]
      W_k <- W[idx_k, idx_k]
      # extract the corresponding labels
      lab_k <- lab[idx_k]
      if (clm == "spectral") {
        if (infocl) {
          nclust <- length(unique(lab))
        } else {
          nclust <- estimateNumberOfClustersGivenGraph(W_k)[[1]]
        }
        # predict clusters for the subset of W
        group_k <- spectralClustering(W_k, nclust)
      } else if (clm == "fastgreedy") {
        # fast greedy modularity optimization on the scaled graph
        W_sc_k <- W_k / max(W_k)
        g_k <- graph.adjacency(W_sc_k, weighted=TRUE, diag=FALSE, mode='undirected')
        m_k <- cluster_fast_greedy(g_k)
        if (infocl) {
          nclust <- length(unique(lab))
          group_k <- cutree(as.hclust(m_k), nclust)
        } else {
          group_k <- m_k$membership
        }
      }
      # NMI score for this fold
      SNFNMI_K[l] <- calNMI(group_k, lab_k)
    }
    # median NMI score for the current CV repetition
    median_NMI[nfold] <- median(unlist(SNFNMI_K))
  }
  # median NMI score over all CV repetitions
  median_NMI_allrep <- median(unlist(median_NMI))
  return(median_NMI_allrep)
}
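## Usage sketch: median NMI over 10 repetitions of 5-fold CV for a spectral
## clustering of the fused network W, given hypothetical ground-truth labels.
if (FALSE) {
  lab <- sample(1:3, nrow(W), replace=TRUE)  # made-up labels
  snf_cv(W, lab, K=5, N=10, clm="spectral", infocl=TRUE)
}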
snf_tuning <- function(dist1, dist2, lab, clm, infocl){
  # Grid search over the SNF hyperparameters K (number of neighbors) and
  # alpha: for each combination, build the affinity matrices from the two
  # distance matrices, fuse them, and score the fused network with snf_cv().
  # Pairwise distances between samples of the two datasets
  dist_tab1 <- dist1
  dist_tab2 <- dist2
  # K grid
  minK <- 10
  maxK <- 30
  stepK <- 1
  K_values <- seq(minK, maxK, stepK)
  # alpha grid
  min_alpha <- 0.3
  max_alpha <- 0.8
  step_alpha <- 0.05
  alpha_values <- seq(min_alpha, max_alpha, step_alpha)
  registerDoParallel(cores=2)
  # for each combination of K and alpha, compute the NMI median over 10x5-fold CV
  NMI_tun <- foreach(K=K_values) %dopar% {
    foreach(alpha=alpha_values) %dopar% {
      W1_tun <- affinityMatrix(dist_tab1, K=K, alpha)
      W2_tun <- affinityMatrix(dist_tab2, K=K, alpha)
      W_K <- SNF(list(W1_tun, W2_tun), K=K)
      snf_cv(W_K, lab, clm=clm, infocl=infocl)
    }
  }
  nK <- length(K_values)
  nalpha <- length(alpha_values)
  max_nmi_fk <- list()
  # grid of median NMIs, labeled by the K and alpha values
  tab_median_NMI <- matrix(, nrow=nK, ncol=nalpha)
  rownames(tab_median_NMI) <- paste('K', K_values, sep='_')
  colnames(tab_median_NMI) <- paste('alpha', alpha_values, sep='_')
  for (elk in 1:nK) {
    # with K fixed, find the max NMI over all alpha values
    max_nmi_fk[elk] <- max(unlist(NMI_tun[[elk]]))
    tab_median_NMI[elk, ] <- unlist(NMI_tun[[elk]])
  }
  # Find the K corresponding to the max NMI
  best_K_idx <- which.max(unlist(max_nmi_fk))[1]
  best_K <- K_values[best_K_idx]
  # Find the alpha corresponding to the max NMI (given the K found above)
  best_alpha_idx <- which.max(unlist(NMI_tun[[best_K_idx]]))
  best_alpha <- alpha_values[best_alpha_idx]
  return(list(best_K=best_K, best_alpha=best_alpha, tab_median_NMI=tab_median_NMI))
}
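## Usage sketch, reusing the hypothetical views X1 and X2 from the sketches
## above together with made-up labels: tune K and alpha on the two distance
## matrices.
if (FALSE) {
  lab <- sample(1:3, nrow(X1), replace=TRUE)
  tuned <- snf_tuning(dist2(X1, X1), dist2(X2, X2), lab, clm="spectral", infocl=TRUE)
  tuned$best_K
  tuned$best_alpha
  tuned$tab_median_NMI  # full grid of median NMIs (K rows x alpha columns)
}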
spectralClustering <- function(affinity, K, type=3) {
  ### This function implements the well-known spectral clustering algorithms.
  ### There are three variants; the default is the third type.
  ### The inputs are as follows:
  # affinity: the similarity matrix
  # K: the number of clusters
  # type: indicator of the variant of spectral clustering to use
  d = rowSums(affinity)
  d[d == 0] = .Machine$double.eps
  D = diag(d)