Commit 9163c31d authored by Diego Fioravanti's avatar Diego Fioravanti
Browse files

Initial commit with the original files from the thesis plus datasets and requirement.txt

parents
# Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# R
# History files
.Rhistory
.Rapp.history
# Session Data files
.RData
# Example code in package build process
*-Ex.R
# Output files from R CMD build
/*.tar.gz
# Output files from R CMD check
/*.Rcheck/
# RStudio files
.Rproj.user/
# produced vignettes
vignettes/*.html
vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# knitr and R markdown default cache directories
/*_cache/
/cache/
# Temporary files created by R markdown
*.utf8.md
*.knit.md
# Various
.idea/
library(phytools)
require(geiger)
require(maps)
library(ape)
library(matrixcalc)
library(igraph)
library(ade4)
library(rgl)
library(vegan)
library(ggplot2)
library(smacof)
#------------------load---------------------------------#
# The leaves are 307 because the variable unassigned was deleted
tree <- read.newick(file="~/Desktop/preprocessing308/RAxML_bestTree.raxml_ott.tre" )
is.rooted(tree)
#Add taxonomy
table_correspondence_id_tax <- read.delim("~/Desktop/ThesisData/progettoR/table_correspondence_id_tax.txt")
hs <- read.delim("~/Desktop/ThesisData/progettoR/FBK_thesis/16S/otu_table_16S_L6_commsamp_HS.txt")
names <- as.data.frame(colnames(hs[-c(1,2)]))
table_correspondence_id_tax <- cbind(table_correspondence_id_tax,names)
tree$tip.label
index <- match(as.factor(tree$tip.label),table_correspondence_id_tax[,1])
tree$tip.label <- table_correspondence_id_tax[index,4]
write.tree(tree, file = "raxMl_label.tre", append = FALSE)
#----------Patristic Distance between leaves------------#
D <- cophenetic(tree)
is.positive.semi.definite(D) #FALSE
det(D)
#---------MDS PROCEDURE: MARK LAYER AND JOHN A.RHODES----#
#----------Convertion in Euclidean matrix----------------#
D_eu <- sqrt(D)
is.positive.semi.definite(D_eu)
is.euclid(as.dist(D_eu), plot = FALSE, print = FALSE, tol = 1e-07)
isSymmetric(D_eu)
# F <- diag(dim(D)[1])-(1/dim(D)[1])*(matrix(1,dim(D)[1],dim(D)[1]))
# H <- -0.5*F%*%(D)^2%*%(F)
# isSymmetric(H)
# is.positive.semi.definite(round(H,3))
#
#
# # We have to find a matrix X of dimesion (n-1) x n s.t.
# # H=X^tX that is the Cholesky decomposition
# # It is convinient to compute the eigenvalues
#
# P <- eigen(H)
# R <- P$vectors
# Q <- diag(P$values)
#
#
#
# #configuration of points in the a euclidean space of dimension n-1 by col
# X <- (Q[-dim(Q)[1],])^(0.5)%*%t(R) #leave out the last eigenvector that correspond to the eigevalue zero
# h <- t(X)%*%X
#
# colnames(X) <- as.factor(row.names(D))
#
#
# for( i in 1:dim(X)[2]){
# if(colnames(X)[i]%in%annotation[,1]){
# colnames(X)[i] <- as.character(annotation[which(annotation[,1]==colnames(X)[i]),2])
# }
# }
# coordinate <- X
# row.names(X) <- (1:dim(X)[1])
# write.table(coordinate,file="coordinateR307.txt", quote = FALSE, sep="\t" ,row.names=T)
# plot(t(X[1:2,]))
#
#
# plot3d(X[1,], X[2,],X[3,],pch=30,size=5, col= 4)
#
# # --------------------- R package for the MDS---------------------------------#
# library("bios2mds")
# S <- mmds(sqrt(D_ucf),pc=250)
# coordinate_bio <- S$coord # scale factor sqrt(308)*X
# scree.plot(S$eigen, lab = FALSE, title = NULL, xlim = NULL,
# ylim = NULL, new.plot = TRUE, pdf.file = NULL)
#----------------------------------MDS------------------------------------------
X <- torgerson(D_eu, p=306) #307x306
#order variables as in the dataset
index_2 <- match( table_correspondence_id_tax[,4],as.data.frame(row.names(X))[,1])
X <- X[index_2,]
colnames(X) <- 1:306
write.table(X,file="coordinates_307x306.txt", quote = FALSE, sep="\t" ,row.names=T)
#--------------------- Coordinates for each dataset-------------------------#
#UCf
ucf <- read.delim("~/Desktop/ThesisData/progettoR/HS_UCf/Sokol_16S_taxa_HS_UCflare_commsamp_training.txt", stringsAsFactors=F)[,-1]
names_ucf <- as.data.frame(colnames(ucf))
coord_names <- as.data.frame(row.names(X))
coord_ucf <- which(names_ucf[,1]%in%coord_names[,1])
#delete the coordinates that are not in the dataset
row_del <-which(coord_names[,1]%in%names_ucf[,1]==F)
X_ucf <- X[-row_del,]
PCs <- 1:dim(X_ucf)[2]
X_ucf <- cbind(PCs, t(X_ucf))
write.table(X_ucf,file="coordinates_ucf.txt", quote = FALSE, sep="\t" ,row.names=F)
#---------------------------------UCr--------------------------------------------#
ucr <- read.delim("~/Desktop/ThesisData/progettoR/HS_UCr/Sokol_16S_taxa_HS_UCr_commsamp_training.txt")[-1]
names_ucr <- as.data.frame(colnames(ucr))
index <- which(names_ucr[,1]%in%coord_names[,1])
#delete the coordinates that are not in the dataset
row_del <-which(coord_names[,1]%in%names_ucr[,1]==F)
X_ucr <- X[-row_del,]
PCs <- 1:dim(X_ucr)[2]
X_ucr <- cbind(PCs, t(X_ucr))
write.table(X_ucr,file="coordinates_ucr.txt", quote = FALSE, sep="\t" ,row.names=F)
#----------------------------------CDf----------------------------------------#
cdf <- read.delim("~/Desktop/ThesisData/progettoR/HS_CDf/Sokol_16S_taxa_HS_CDflare_commsamp_training.txt")[-1]
names_cdf <- as.data.frame(colnames(cdf))
index <- which(names_cdf[,1]%in%coord_names[,1])
#delete the coordinates that are not in the dataset
row_del <-which(coord_names[,1]%in%names_cdf[,1]==F)
X_cdf <- X[-row_del,]
PCs <- 1:dim(X_cdf)[2]
X_cdf <- cbind(PCs, t(X_cdf))
write.table(X_cdf,file="coordinates_cdf.txt", quote = FALSE, sep="\t" ,row.names=F)
#---------------------------------CDr---------------------------------------------------#
cdr <- read.delim("~/Desktop/ThesisData/progettoR/HS_CDr/Sokol_16S_taxa_HS_CDr_commsamp_training.txt")[-1]
names_cdr <- as.data.frame(colnames(cdr))
index <- which(names_cdr[,1]%in%coord_names[,1])
#delete the coordinates that are not in the dataset
row_del <-which(coord_names[,1]%in%names_cdr[,1]==F)
X_cdr <- X[-row_del,]
PCs <- 1:dim(X_cdr)[2]
X_cdr <- cbind(PCs, t(X_cdr))
write.table(X_cdr,file="coordinates_cdr.txt", quote = FALSE, sep="\t" ,row.names=F)
#----------------------------------iCDf----------------------------------------#
icdf <- read.delim("~/Desktop/ThesisData/progettoR/HS_iCDf/Sokol_16S_taxa_HS_iCDflare_commsamp_training.txt")[-1]
names_icdf <- as.data.frame(colnames(icdf))
index <- which(names_icdf[,1]%in%coord_names[,1])
#delete the coordinates that are not in the dataset
row_del <-which(coord_names[,1]%in%names_icdf[,1]==F)
X_icdf <- X[-row_del,]
PCs <- 1:dim(X_icdf)[2]
X_icdf <- cbind(PCs, t(X_icdf))
write.table(X_icdf,file="coordinates_icdf.txt", quote = FALSE, sep="\t" ,row.names=F)
#---------------------------------iCDr---------------------------------------------------#
icdr <- read.delim("~/Desktop/ThesisData/progettoR/HS_iCDr/Sokol_16S_taxa_HS_iCDr_commsamp_training.txt")[-1]
names_icdr <- as.data.frame(colnames(icdr))
index <- which(names_icdr[,1]%in%coord_names[,1])
#delete the coordinates that are not in the dataset
row_del <-which(coord_names[,1]%in%names_icdr[,1]==F)
X_icdr <- X[-row_del,]
PCs <- 1:dim(X_icdr)[2]
X_icdr<- cbind(PCs, t(X_icdr))
write.table(X_icdr,file="coordinates_icdr.txt", quote = FALSE, sep="\t" ,row.names=F)
\ No newline at end of file
This diff is collapsed.
################################################################################
# Aims : creation of synthetic data set to validate the sPCA-rSVD method
# fix the first 2 eigenvector, then construction of the covariance matrix. Compare the eigenvector computed
# using sPCA-rSVD with the true one
# selection of the degree of sparsity using the paramenter tuning
#-------------------------------------------------------------------------------
library("pracma")
library("cvTools")
library("circular")
library("TunePareto")
library("ggplot2")
sPCA_rSVD <- function(decomp, X, j_par, type=c("SCAD", "soft_thr", "hard_thr"), tol=1e-13, max.iter=100) {
type <- match.arg(type)
#decomp <- svd(X)
v_old <- decomp$v[,1]
u_old <-decomp$d[1]*decomp$u[,1]
#cat("#### v_old ", v_old, " ####\n")
if (j_par==0) {
lambda_par <- 0
} else {
Xvold <- sort(abs(X%*%v_old))
lambda_par <- (Xvold[j_par]+Xvold[(j_par+1)])/2
}
h_lambda <- switch(type,
soft_thr = mcia:::h_lambda_soft,
hard_thr = mcia:::h_lambda_hard,
SCAD = mcia:::h_lambda_SCAD)
#convergence in norm
norm_diffv <- norm_diffu <- tol+1
iter <- 0
while ((norm_diffv >= tol || norm_diffu >=tol) && iter < max.iter) {
iter <- iter + 1
u_new <- h_lambda(X%*%v_old, lambda=lambda_par)
v_new <- drop(t(X)%*%u_new)
v_new <- v_new/sqrt(crossprod(v_new,v_new))
norm_diffv <- sqrt(crossprod((v_new-v_old),(v_new-v_old)))
norm_diffu <- sqrt(crossprod((u_new-u_old),(u_new-u_old)))
v_old <- v_new
u_old <- u_new
}
u <- u_new/sqrt(crossprod(u_new,u_new))
res <- list(v=drop(v_new), u=drop(u), convergence=iter <= max.iter)
return(res)
}
#-------------------------------------------------------------------------------
#tuning_j_par K-fold CV Tuning j_par parameter selection
# input:
# X ------ matrix to be decomposed using SVD
# ntimes ------ ntimes number of repetition on CV
# nfold ------ nfold cross validation
# opt_penalty ------ type of penalty to select sparsity
#
# output: type numeric
# j_opt ------ optimal degree of sparsity
# scores ------ matrix of CV scores (nrow=p,ncol=N)
#-------------------------------------------------------------------------------
tuning_j_par <- function(X, ntimes, nfold, type, fun=median, verbose=FALSE, ...) {
nr <- nrow(X) #n
nc <- ncol(X) #p
#CV_score_fold <- rep(0, ntimes)
j_opt_cv <- rep(0, ntimes)
#matrix saving CV_score for each j in each fold
CV_score <- c()#matrix(0, nrow=nr, ncol=ntimes)
set.seed(1234)
cv_folds <- generateCVRuns(1:nc, ntimes=ntimes, nfold=nfold, stratified=TRUE)
CV_temp <- rep(0, ntimes)
for (i in 1:ntimes) {
if (verbose)
cat("Repetion: ", i, "\n", sep = "")
cv_folds_current <- cv_folds[[i]]
#array with CV score_j for each l-fold
CV_score_j <- matrix(0, nrow=nr, ncol=nfold)
for (l in 1:nfold) {
X_whitout_l <- X[,-cv_folds_current[[l]],drop=FALSE]
X_l <- X[,cv_folds_current[[l]],drop=FALSE]
ncj <- NCOL(X_l)
decomp <- svd(X_whitout_l)
for (j in 0:(nr-1)) {
if (verbose)
cat("Number of variables", j, "\n")
sPCA <- sPCA_rSVD(decomp, X=X_whitout_l, type=type, j_par=j, ...)
u_j <- sPCA$u
v_j <- drop(t(X_l)%*%u_j)
#compute single CV score,depend on j and the l-fold
CV_score_j[j+1, l] <- (1/(ncj*nr))*sum((X_l- outer(u_j,v_j))^2)
#cat("#### CV_score_j ",CV_score_j , " ####\n")
}
}
CV_score<-cbind(CV_score, apply(CV_score_j[,1:nfold], 1, sum))
#cat("#### CV_score ",CV_score , " ####\n")
}
# save j responsible for min of CV score
goals <- apply(CV_score, 1, fun)
j_opt_cv <- which.min(goals) - 1
if(j_opt_cv==nr)
cat("the optimal degree is exactly the length of the vector")
res <- list(j=j_opt_cv, scores=CV_score)
return(res)
}
# function to create data X from eigenvector and eigenvalues
create_X <- function(v1, v2, C, n=100, tol=1e-13, m = 1){
p <- length(v1)
# normalize eigenvector
v1 <- v1/ sqrt(sum(v1 * v1))
v2 <- v2/ sqrt(sum(v2 * v2))
V <- matrix(0,ncol=p,nrow=p)
V[,1:2]<-c(v1,v2)
# creation of the other 8 eigenvector
for (j in 3:ncol(V)){
set.seed(j+m) #fissare la V?
V[,j] <- runif(p)
}
# if V is not full rank, recompute it
if (det(V) < tol){
for (j in 3:ncol(V)){
set.seed(j+1234+m)
V[,j] <- runif(p)
}
}
GS_orth <- gramSchmidt(V, tol = tol)
# V = Q %*% R
V <- GS_orth$Q #orthogonal matrix
# Z ~ N(0,I_P)
# set.seed(m)
# Z <- matrix(rnorm(n*p), nrow = p, ncol = n)
Z <- matrix(0, nrow = p, ncol = n)
for (j in 1:n){
set.seed(j+m)
Z[,j] <- rnorm(p)
}
# generation of the data
X <- V %*% diag(sqrt(C)) %*% Z #p x n
return(X)
}
# test sPCA with known degree of sparsity
# generation of l = 100 datasets
l <- 100
opt_penalty="SCAD"
sPCA_oracle_method <- function(v1, v2, C, l=100, opt_penalty){
p <- length(v1)
j1_known <- length(which(v1==0))
j2_known <- length(which(v2==0))
v1_spca <- list()
v2_spca <- list()
for (m in 1:l){
X_temp <- create_X(v1, v2, C, m = m) #each matrix is p x n
#component 1
decomp <- svd(X_temp)
sPCA_1 <- sPCA_rSVD(decomp,X = X_temp, j_par = j1_known, type = opt_penalty)
v1_spca[[m]] <- sPCA_1$u
X_1 <- X_temp%*%tcrossprod(sPCA_1$v,sPCA_1$v)
#cumulative percentage of variance (CPEV)
cum_perc_1 <- sum(diag(tcrossprod(X_1,X_1)))/sum(diag(tcrossprod(X_temp,X_temp)))
lambda_1 <- cum_perc_1
#component 2
X_new_temp <- X_temp - sqrt(lambda_1 * sum(diag(t(X_temp)%*%X_temp))) * sPCA_1$u %*% t(sPCA_1$v)
decomp <- svd(X_new_temp)
sPCA_2 <- sPCA_1 <- sPCA_rSVD(decomp, X = X_new_temp, j_par = j2_known, type = opt_penalty)
v2_spca[[m]] <- sPCA_2$u
cat(m,"\n")
}
return(list(v1=v1_spca,v2=v2_spca))
}
sPCA_CV_method <- function(v1, v2, C, l=100, opt_penalty,N){
p <- length(v1)
v1_spca <- list()
v2_spca <- list()
for (m in 1:l){
X_temp <- create_X(v1, v2, C, m = m) #each matrix is p x n
#solution order 1
j1_opt<-tuning_j_par(X_temp, ntimes=5, nfold=5, type=opt_penalty, fun=median, verbose=FALSE)$j
decomp <- svd(X_temp)
sPCA_1 <- sPCA_rSVD(decomp,X = X_temp, j_par = j1_opt, type = opt_penalty)
v1_spca[[m]] <- sPCA_1$u
X_1 <- X_temp%*%tcrossprod(sPCA_1$v,sPCA_1$v)
#cumulative percentage of variance (CPEV)
cum_perc_1 <- sum(diag(tcrossprod(X_1,X_1)))/sum(diag(tcrossprod(X_temp,X_temp)))
lambda_1 <- cum_perc_1
#solution order 2
X_new_temp <- X_temp - sqrt(lambda_1 * sum(diag(t(X_temp)%*%X_temp))) * sPCA_1$u %*% t(sPCA_1$v)
j2_opt<-tuning_j_par(X_new_temp, ntimes=10, nfold=5, type=opt_penalty, fun=median, verbose=FALSE)$j
decomp <- svd(X_new_temp)
sPCA_2 <- sPCA_rSVD(decomp, X = X_new_temp, j_par = j2_opt, type = opt_penalty)
v2_spca[[m]] <- sPCA_2$u
cat(m,"\n")
}
return(list(v1=v1_spca,j1=j1_opt,v2=v2_spca,j2=j2_opt))
}
#compute the median angle
# v_true: vector of the true eigenvector
# v_spca: list of vector, one vector for each simulation (100)
compare_eig <- function(v_true,v_spca){
v_true <- v_true/ sqrt(sum(v_true^2))
M <- length(v_spca)
# sgn_true <- sign(v_true)
# create list v_spca with signs according to v_true sign
# v_spca_sgn <- list()
# for (h in 1:M){
# sgn_h <- sign(v_spca[[h]])
# if (any(abs(sgn_true-sgn_h)==2)){
# v_spca_sgn[[h]] <- v_spca[[h]]*(-1)
# }
# else v_spca_sgn[[h]] <- v_spca[[h]]
# }
#
# # angle between the 2 vectors
# angle <- vector(length = M)
# for (h in 1:M)
# angle[h] <- acos(sum(v_true * t(v_spca_sgn[[h]])) / (sqrt(sum(v_true^2)) * sqrt(sum(v_spca_sgn[[h]]^2))))
# angle between the 2 vectors
angle <- vector(length = M)
for (h in 1:M){
true_angle <- acos(sum(v_true * t(v_spca[[h]])) / (sqrt(sum(v_true^2)) * sqrt(sum(v_spca[[h]]^2))))
angle[h] <- min(abs(pi-true_angle),abs(true_angle))
}
median_angle <- median(circular(x = angle))
#percentage of correctly identified zero loadings
correct <- vector(length = M)
correct_perc <- vector()
#percentage of incorrectly identified zero loadings
incorrect <- vector(length = M)
incorrect_perc <- vector()
#find zero loadings in v_true
zero_true <- which(v_true==0)
n_zero_true <- length(zero_true)
not_zero_true <- which(v_true!=0)
n_not_zero_true <- length(not_zero_true)
for (h in 1:M) {
zero_spca_temp <- which(v_spca[[h]]==0)
not_zero_spca_temp <- which(v_spca[[h]]!=0)
#number of zero loadings in v_true found also in v_spca
correct[h] <- length(intersect(zero_true,zero_spca_temp))/n_zero_true
correct_perc[h] <- 100 * correct[h]
#number of loadings set to zero in v_spca but not zero in v_true