Commit 5da0d54c authored by Nicole Bussola
Browse files

Merge branch 'master' of https://gitlab.fbk.eu/MPBA/inf_revamped

parents 44955a48 c1315d92
## Author: MC
if(!require(pacman))install.packages("pacman")
pacman::p_load(tidyverse,
plyr,
caret,
mlr,
doParallel)
# ---- Configuration ----
# Root of the (Google Drive) INF working folder; adjust to the local setup.
ROOT <- "~/Google Drive/work/INF"
# Directory holding the TCGA input tables (one sub-folder per tumor).
DATADIR <- file.path(ROOT, "TCGA_data")
# Outputs are written next to the inputs.
OUTROOT <- DATADIR
# Tumor sub-folder to process.
tumor <- "Breast"
# Classification tasks: task name -> column of the merged table used as target.
targets <- list(
  ER = "clin:breast_carcinoma_estrogen_receptor_status",
  subtypes = "clin:PAM50Call"
)
# Task to generate the splits for; looked up by name in `targets`.
task <- "ER"
# Task-independent step: build the table that merges the clinical annotations
# with every omics layer, or reload it if it was already generated.
mergedFile <- file.path(OUTROOT, tumor, "merged.txt")
if (!file.exists(mergedFile)) {
  # raw inputs of the selected tumor
  tumor.dir <- file.path(DATADIR, tumor)
  samples <- read_tsv(file.path(tumor.dir, "clinical.csv"))
  expr <- read_tsv(file.path(tumor.dir, "gene_expression.csv"))
  cnv <- read_tsv(file.path(tumor.dir, "copy_number.csv"))
  prot <- read_tsv(file.path(tumor.dir, "protein_abundance.csv"))

  # clinical layer: tag every column with "clin:", keep the join key as is
  names(samples) <- paste0("clin:", names(samples))
  samples <- dplyr::rename(samples, Sample = `clin:Sample`)
  # discard clinical columns in which every value is NA
  all.na <- vapply(samples, function(col) all(is.na(col)), logical(1))
  samples <- samples[, !all.na]
  # rename the PAM50 calls (for backwards-compatibility w/ previous dataset)
  pam50.old <- c("Basal", "Her2", "LumA", "LumB")
  pam50.new <- c("Basal-like", "HER2-enriched", "LuminalA", "LuminalB")
  for (k in seq_along(pam50.old)) {
    samples$`clin:PAM50Call`[samples$`clin:PAM50Call` == pam50.old[k]] <- pam50.new[k]
  }

  # gene expression layer: tag columns, then median-impute missing values
  names(expr) <- paste0("gene:", names(expr))
  expr <- dplyr::rename(expr, Sample = `gene:Sample`)
  tmp <- mlr::impute(expr, classes = list(numeric = mlr::imputeMedian()))
  # put the tagged column names back on the imputed table
  # (presumably mlr::impute alters them -- TODO confirm)
  colnames(tmp$data) <- colnames(expr)
  expr.imputed <- as_tibble(tmp$data)

  # copy number layer
  names(cnv) <- paste0("cnv:", names(cnv))
  cnv <- dplyr::rename(cnv, Sample = `cnv:Sample`)

  # protein abundance layer
  names(prot) <- paste0("prot:", names(prot))
  prot <- dplyr::rename(prot, Sample = `prot:Sample`)

  # keep only the samples present in all four tables
  mrg <- plyr::join_all(
    list(samples, expr.imputed, cnv, prot),
    by = "Sample",
    type = "inner"
  )
  # drop near-zero variance features
  # NOTE(review): the parallel backend is registered only further down in this
  # script, so foreach=TRUE presumably runs sequentially here -- confirm
  nzv <- caret::nearZeroVar(mrg, saveMetrics = TRUE, foreach = TRUE)
  mrg <- mrg[, !nzv$nzv]

  # cache the merged table for the next run
  write_tsv(mrg, mergedFile)
} else {
  mrg <- read_tsv(mergedFile)
}
# Task-dependent step: pick the target column and keep only the samples whose
# target value is one of the classes of the chosen task.
tgt <- targets[[task]] # column name of the target variable
# classes retained for each supported task; anything else is rejected up front
# instead of failing later with "object 'mrg.sub' not found"
tgt.classes <- switch(task,
  ER = c("Negative", "Positive"),
  subtypes = c("Basal-like", "HER2-enriched", "LuminalA", "LuminalB"),
  stop(sprintf("unsupported task '%s'", task), call. = FALSE)
)
if (!tgt %in% names(mrg)) {
  stop(sprintf("target column '%s' not found in the merged table", tgt),
       call. = FALSE)
}
mrg.sub <- mrg[mrg[[tgt]] %in% tgt.classes, ]
if (nrow(mrg.sub) == 0) {
  stop(sprintf("no sample has a usable '%s' value", tgt), call. = FALSE)
}
# class labels used to stratify the splits
y <- factor(mrg.sub[[tgt]])

# prepare multicore: 4 worker processes used by the %dopar% loop below.
# Created only after the checks above so a configuration error does not leave
# idle workers behind.
cl <- makeCluster(4)
registerDoParallel(cl)
# Generate the ten stratified 70/30 train/test splits in parallel. Every split
# is written to <OUTROOT>/<tumor>/INF/<task folder>/<split.id> and consists of
# the label files plus one table per omics layer / pair of layers.
# The ER task uses the folder name "breast_ER"; other tasks use their own name.
task.dir <- if (task == "ER") "breast_ER" else task
invisible(tryCatch(
  foreach(split.id = seq(0, 9), .verbose = FALSE) %dopar% {
    outdir <- file.path(OUTROOT, tumor, "INF", task.dir, split.id)
    if (!dir.exists(outdir)) {
      dir.create(outdir, recursive = TRUE)
    }

    # make it so that the 1st split is for set.seed(78), which was the one
    # created previously
    set.seed(78 + split.id)
    # createDataPartition(list = FALSE) returns a one-column matrix; take that
    # column so rows are indexed with a plain vector (matrix row subscripts are
    # deprecated for tibbles)
    train.idx <- caret::createDataPartition(y = y, p = 0.7, list = FALSE)[, 1]
    parts <- list(
      tr = list(data = mrg.sub[train.idx, ], lab = y[train.idx]),
      ts = list(data = mrg.sub[-train.idx, ], lab = y[-train.idx])
    )

    # output tables: file stem -> column prefixes, in output column order
    layer.sets <- list(
      gene_cnv = c("gene:", "cnv:"),   # juxtaposed datasets
      gene_prot = c("gene:", "prot:"),
      cnv_prot = c("cnv:", "prot:"),
      gene = "gene:",                  # single layers + clinical info
      cnv = "cnv:",
      prot = "prot:",
      clin = "clin:"
    )
    # Sample first, then the columns of each requested layer; the match is
    # case-insensitive like the default of dplyr::starts_with()
    layer.columns <- function(dat, prefixes) {
      cols <- names(dat)
      hits <- lapply(prefixes, function(pfx) cols[startsWith(tolower(cols), pfx)])
      c("Sample", unlist(hits))
    }

    for (part in names(parts)) {
      dat <- parts[[part]]$data
      # labels: one class name per line, no header
      write.table(parts[[part]]$lab,
                  file = file.path(outdir, sprintf("labels_%s_%s.txt", task, part)),
                  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
      for (stem in names(layer.sets)) {
        keep <- layer.columns(dat, layer.sets[[stem]])
        data.table::fwrite(dat[, keep, drop = FALSE],
                           file.path(outdir, sprintf("%s_%s.txt", stem, part)),
                           sep = "\t")
      }
    }
    NULL
  },
  # always release the worker processes, even when a split fails
  finally = parallel::stopCluster(cl)
))
## Author: MC
library(tidyverse)
library(plyr)
library(caret)
# ---- Configuration ----
# Root of the (Google Drive) INF working folder; adjust to the local setup.
ROOT <- "~/Google Drive/work/INF"
# Directory holding the TCGA input tables (one sub-folder per tumor).
DATADIR <- file.path(ROOT, "TCGA_data")
# ACGT data directory (for clinical info).
ACGTDIR <- file.path(ROOT, "ACGT_data")
# Outputs are written next to the TCGA inputs.
OUTROOT <- DATADIR
# Tumor sub-folder to process.
tumor <- "Breast"
# Read the clinical table and the three omics layers of the selected tumor.
tumor.dir <- file.path(DATADIR, tumor)
samples <- read_tsv(file.path(tumor.dir, "clinical.csv"))
expr <- read_tsv(file.path(tumor.dir, "gene_expression.csv"))
cnv <- read_tsv(file.path(tumor.dir, "copy_number.csv"))
prot <- read_tsv(file.path(tumor.dir, "protein_abundance.csv"))

# Tag every column with the name of its layer; the join key stays "Sample".
names(samples) <- paste0("clin:", names(samples))
samples <- dplyr::rename(samples, Sample = `clin:Sample`)
# discard clinical columns in which every value is NA
all.na <- vapply(samples, function(col) all(is.na(col)), logical(1))
samples <- samples[, !all.na]

names(expr) <- paste0("gene:", names(expr))
expr <- dplyr::rename(expr, Sample = `gene:Sample`)

names(cnv) <- paste0("cnv:", names(cnv))
cnv <- dplyr::rename(cnv, Sample = `cnv:Sample`)

names(prot) <- paste0("prot:", names(prot))
prot <- dplyr::rename(prot, Sample = `prot:Sample`)

# Chain the four tables on "Sample" with right joins, then store the result.
mrg <- plyr::join_all(
  list(samples, expr, cnv, prot),
  by = "Sample",
  type = "right"
)
write_tsv(mrg, file.path(OUTROOT, tumor, "merged.txt"))
# Split tr/ts for the ER task: ten stratified 70/30 splits, each written to
# <OUTROOT>/<tumor>/INF/breast_ER/<split.id> as 0/1 label files plus one table
# per omics layer / pair of layers.
tgt <- "clin:patient.breast_carcinoma_estrogen_receptor_status"
if (!tgt %in% names(mrg)) {
  stop(sprintf("target column '%s' not found in the merged table", tgt),
       call. = FALSE)
}
# keep the samples with a definite ER status; [[ ]] returns a plain vector for
# both data.frames and tibbles
mrg.sub <- mrg[mrg[[tgt]] %in% c("negative", "positive"), ]
y <- mrg.sub[[tgt]]
# remove the target from the feature table: drop the column whose name is
# stored in `tgt` (not a column literally called "tgt")
mrg.sub <- mrg.sub[, names(mrg.sub) != tgt, drop = FALSE]

# output tables: file stem -> column prefixes, in output column order
layer.sets <- list(
  gene_cnv = c("gene:", "cnv:"),   # juxtaposed datasets
  gene_prot = c("gene:", "prot:"),
  cnv_prot = c("cnv:", "prot:"),
  gene = "gene:",                  # single layer datasets
  cnv = "cnv:",
  prot = "prot:"
)
# Sample first, then the columns of each requested layer; the match is
# case-insensitive like the default of starts_with()
layer.columns <- function(dat, prefixes) {
  cols <- names(dat)
  hits <- lapply(prefixes, function(pfx) cols[startsWith(tolower(cols), pfx)])
  c("Sample", unlist(hits))
}

for (split.id in seq(0, 9)) {
  outdir <- file.path(OUTROOT, tumor, "INF", "breast_ER", split.id)
  if (!dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE)
  }

  # make it so that the 1st split is for set.seed(78), which was the one
  # created previously
  set.seed(78 + split.id)
  # createDataPartition(list = FALSE) returns a one-column matrix; take that
  # column so rows are indexed with a plain vector
  train.idx <- caret::createDataPartition(y = y, p = 0.7, list = FALSE)[, 1]
  parts <- list(
    tr = list(data = mrg.sub[train.idx, ], lab = y[train.idx]),
    ts = list(data = mrg.sub[-train.idx, ], lab = y[-train.idx])
  )

  for (part in names(parts)) {
    dat <- parts[[part]]$data
    # labels: 1 for "positive", 0 for "negative", one per line, no header
    lab <- ifelse(parts[[part]]$lab == "positive", 1, 0)
    write.table(lab,
                file = file.path(outdir, sprintf("labels_ER_%s.txt", part)),
                sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
    for (stem in names(layer.sets)) {
      keep <- layer.columns(dat, layer.sets[[stem]])
      write_tsv(dat[, keep, drop = FALSE],
                file.path(outdir, sprintf("%s_%s.txt", stem, part)))
    }
  }
}
## Author: MC
library(tidyverse)
library(plyr)
library(caret)
# ---- Configuration ----
# Root of the (Google Drive) INF working folder; adjust to the local setup.
ROOT <- "~/Google Drive/work/INF"
# Directory holding the TCGA input tables (one sub-folder per tumor).
DATADIR <- file.path(ROOT, "TCGA_data")
# ACGT data directory, read below for the PAM50 clinical annotation.
ACGTDIR <- file.path(ROOT, "ACGT_data")
# Outputs are written next to the TCGA inputs.
OUTROOT <- DATADIR
# Tumor sub-folder to process.
tumor <- "Breast"
# Read the clinical table and the three omics layers of the selected tumor.
tumor.dir <- file.path(DATADIR, tumor)
samples <- read_tsv(file.path(tumor.dir, "clinical.csv"))
expr <- read_tsv(file.path(tumor.dir, "gene_expression.csv"))
cnv <- read_tsv(file.path(tumor.dir, "copy_number.csv"))
prot <- read_tsv(file.path(tumor.dir, "protein_abundance.csv"))

# Tag every clinical column with "clin:"; the join key stays "Sample".
names(samples) <- paste0("clin:", names(samples))
samples <- dplyr::rename(samples, Sample = `clin:Sample`)
# discard clinical columns in which every value is NA
all.na <- vapply(samples, function(col) all(is.na(col)), logical(1))
samples <- samples[, !all.na]

# PAM50 calls come from the ACGT clinical file of the same tumor
samples.acgt <- read_tsv(file.path(ACGTDIR, "original/clinical", str_to_lower(tumor)))
samples.acgt <- dplyr::select(
  samples.acgt,
  Sample = sampleID,
  "clin:PAM50" = PAM50_mRNA_nature2012
)
# sample ids: replace "-" with "." before joining on Sample
samples.acgt$Sample <- gsub("-", ".", samples.acgt$Sample)
# class names: strip blanks
samples.acgt$`clin:PAM50` <- gsub(" ", "", samples.acgt$`clin:PAM50`)
samples <- dplyr::left_join(samples, samples.acgt, by = "Sample")

# Tag the omics layers in the same way as the clinical table.
names(expr) <- paste0("gene:", names(expr))
expr <- dplyr::rename(expr, Sample = `gene:Sample`)

names(cnv) <- paste0("cnv:", names(cnv))
cnv <- dplyr::rename(cnv, Sample = `cnv:Sample`)

names(prot) <- paste0("prot:", names(prot))
prot <- dplyr::rename(prot, Sample = `prot:Sample`)

# Chain the four tables on "Sample" with right joins, then store the result.
mrg <- plyr::join_all(
  list(samples, expr, cnv, prot),
  by = "Sample",
  type = "right"
)
write_tsv(mrg, file.path(OUTROOT, tumor, "merged.txt"))
# Split tr/ts for the subtypes task: ten stratified 70/30 splits, each written
# to <OUTROOT>/<tumor>/INF/subtypes/<split.id> as label files plus one table
# per omics layer / pair of layers.
tgt <- "clin:PAM50"
if (!tgt %in% names(mrg)) {
  stop(sprintf("target column '%s' not found in the merged table", tgt),
       call. = FALSE)
}
# keep the samples assigned to one of the four retained classes; [[ ]] returns
# a plain vector for both data.frames and tibbles
mrg.sub <- mrg[mrg[[tgt]] %in% c("Basal-like", "HER2-enriched", "LuminalA", "LuminalB"), ]
y <- factor(mrg.sub[[tgt]])
# remove the target from the feature table: drop the column whose name is
# stored in `tgt` (not a column literally called "tgt")
mrg.sub <- mrg.sub[, names(mrg.sub) != tgt, drop = FALSE]

# output tables: file stem -> column prefixes, in output column order
layer.sets <- list(
  gene_cnv = c("gene:", "cnv:"),   # juxtaposed datasets
  gene_prot = c("gene:", "prot:"),
  cnv_prot = c("cnv:", "prot:"),
  gene = "gene:",                  # single layer datasets
  cnv = "cnv:",
  prot = "prot:"
)
# Sample first, then the columns of each requested layer; the match is
# case-insensitive like the default of starts_with()
layer.columns <- function(dat, prefixes) {
  cols <- names(dat)
  hits <- lapply(prefixes, function(pfx) cols[startsWith(tolower(cols), pfx)])
  c("Sample", unlist(hits))
}

for (split.id in seq(0, 9)) {
  outdir <- file.path(OUTROOT, tumor, "INF", "subtypes", split.id)
  if (!dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE)
  }

  # make it so that the 1st split is for set.seed(78), which was the one
  # created previously
  set.seed(78 + split.id)
  # createDataPartition(list = FALSE) returns a one-column matrix; take that
  # column so rows are indexed with a plain vector
  train.idx <- caret::createDataPartition(y = y, p = 0.7, list = FALSE)[, 1]
  parts <- list(
    tr = list(data = mrg.sub[train.idx, ], lab = y[train.idx]),
    ts = list(data = mrg.sub[-train.idx, ], lab = y[-train.idx])
  )

  for (part in names(parts)) {
    dat <- parts[[part]]$data
    # labels: one class name per line, no header
    write.table(parts[[part]]$lab,
                file = file.path(outdir, sprintf("labels_subtypes_%s.txt", part)),
                sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
    for (stem in names(layer.sets)) {
      keep <- layer.columns(dat, layer.sets[[stem]])
      write_tsv(dat[, keep, drop = FALSE],
                file.path(outdir, sprintf("%s_%s.txt", stem, part)))
    }
  }
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment