Commit c65381bb authored by Marco Chierici's avatar Marco Chierici
Browse files

R data preprocessing

parent 7b7916f1
library(argparse)
library(tidyverse)
library(dplyr)
library(doParallel)
### functions
read_tcga <- function(datafile, sample_col="ID", suffix="01", filter=TRUE) {
df <- as.data.frame(readr::read_delim(datafile, " ", progress=FALSE)) %>%
tibble::column_to_rownames(sample_col) %>%
dplyr::select(ends_with(suffix))
if(filter) {
# variance filter
vars <- apply(df, 1, IQR)
quant <- quantile(vars, probs=0.5)
selected <- !is.na(vars) & vars > quant
dfout <- as.data.frame(t(df[selected, ])) %>% tibble::rownames_to_column(var="Sample")
} else
dfout <- as.data.frame(t(df)) %>% tibble::rownames_to_column(var="Sample")
dfout$Sample <- gsub("-", ".", dfout$Sample)
dfout
}
### main
parser <- ArgumentParser(description="Prepare TCGA data for INF pipeline.")
parser$add_argument("--tumor", type="character", help="Tumor type [e.g. GBM]")
parser$add_argument("--suffix", type="character", default="01", nargs="+", help="TCGA tumor suffix(es) to keep [e.g. 01] (default: %default)")
parser$add_argument("--datadir", type="character", help="Path to ShamirLab's ACGT data directory")
parser$add_argument("--outdir", type="character", help="Output data directory")
args <- parser$parse_args()
tumor <- args$tumor
tcga.suffix <- unlist(args$suffix)
ACGTDIR <- args$datadir
OUTROOT <- args$outdir
### PARAMETERS ###
# target variable
target.column <- "clin:_OS_IND"
target.name <- "OS"
# omics layers
all.layers <- c("gene", "meth", "mirna")
# base seed for tr/ts splits generation
base.seed <- 78
##################
if(str_to_lower(tumor) == "kidney") {
# this needs special care
samples <- readr::read_tsv(file.path(ACGTDIR, str_to_lower(tumor), "survival")) %>%
dplyr::rename(sampleID=PatientID, "_OS_IND"=Death)
samples <- samples[!is.na(samples$`_OS_IND`), ]
} else {
samples <- readr::read_tsv(file.path(ACGTDIR, "clinical", str_to_lower(tumor)))
}
expr <- read_tcga(file.path(ACGTDIR, str_to_lower(tumor), "exp"), suffix=tcga.suffix, filter=TRUE)
meth <- read_tcga(file.path(ACGTDIR, str_to_lower(tumor), "methy"), suffix=tcga.suffix, filter=TRUE)
mirna <- read_tcga(file.path(ACGTDIR, str_to_lower(tumor), "mirna"), suffix=tcga.suffix, filter=TRUE)
samples <- samples %>% setNames(paste0("clin:", names(.))) %>%
dplyr::rename(Sample=`clin:sampleID`)
# throw away columns with all NAs
samples <- samples[, colSums(is.na(samples)) != nrow(samples)]
samples <- samples[grepl(sprintf("%s$", tcga.suffix), samples$Sample), ]
samples$Sample <- gsub("-", ".", samples$Sample)
expr <- expr %>% setNames(paste0("gene:", names(.))) %>%
dplyr::rename(Sample=`gene:Sample`)
meth <- meth %>% setNames(paste0("meth:", names(.))) %>%
dplyr::rename(Sample=`meth:Sample`)
mirna <- mirna %>% setNames(paste0("mirna:", names(.))) %>%
dplyr::rename(Sample=`mirna:Sample`)
# get common samples
omics <- list(expr=expr$Sample, meth=meth$Sample, mirna=mirna$Sample)
sampleIntersect <- Reduce(intersect, omics)
samples <- samples[samples$Sample %in% sampleIntersect, ]
# join all dataframes
mrg <- plyr::join_all(list(samples, expr, meth, mirna), by="Sample", type="left")
# remove near-zero variance features
nzv <- caret::nearZeroVar(mrg, saveMetrics=TRUE, foreach=TRUE)
mrg <- mrg[, !nzv$nzv]
# save
if(!dir.exists(OUTROOT))
dir.create(OUTROOT, recursive=TRUE)
write_tsv(mrg, file.path(OUTROOT, "merged.txt"))
# split tr/ts
tgt <- target.column
mrg.sub <- mrg[!is.na(mrg[, tgt]), ]
y <- factor(mrg.sub[, tgt])
mrg.sub <- dplyr::select(mrg.sub, -all_of(tgt))
# prepare multicore
cl <- makeCluster(4)
registerDoParallel(cl)
foreach(split.id=seq(0, 9), .verbose=FALSE) %dopar% {
outdir <- file.path(OUTROOT, target.name, split.id)
if(!dir.exists(outdir))
dir.create(outdir, recursive=TRUE)
# make it so that the 1st split is for set.seed(78), which was the one created previously
set.seed(base.seed+split.id)
train.idx <- caret::createDataPartition(y=y, p=0.7, list=FALSE)
train.data <- mrg.sub[train.idx,]
train.lab <- y[train.idx]
test.data <- mrg.sub[-train.idx,]
test.lab <- y[-train.idx]
# write labels
write.table(train.lab, file=file.path(outdir, sprintf("labels_%s_tr.txt", target.name)), sep="\t", quote=FALSE, row.names=F, col.names=F)
write.table(test.lab, file=file.path(outdir, sprintf("labels_%s_ts.txt", target.name)), sep="\t", quote=FALSE, row.names=F, col.names=F)
datasets <- list(tr=train.data, ts=test.data)
# write single layer datasets
for(omic in all.layers) {
for(partition in names(datasets)) {
data.table::fwrite(dplyr::select(datasets[[partition]], Sample, dplyr::starts_with(sprintf("%s:", omic))), file.path(outdir, sprintf("%s_%s.txt", omic, partition)), sep="\t")
}
}
}
stopCluster(cl)
## Author: MC
if(!require(pacman))install.packages("pacman")
pacman::p_load(tidyverse,
plyr,
caret,
mlr,
doParallel)
library(argparse)
library(tidyverse)
library(dplyr)
library(doParallel)
# GDrive INF folder: edit this
ROOT <- "~/Google Drive/work/INF"
parser <- ArgumentParser(description="Prepare TCGA data for INF pipeline.")
parser$add_argument("--task", type="character", help="Task (ER or subtypes)")
parser$add_argument("--datadir", type="character", help="ACGT data directory")
parser$add_argument("--outdir", type="character", help="Output data directory")
args <- parser$parse_args()
task <- args$task
# TCGA data directory
DATADIR <- file.path(ROOT, "TCGA_data")
DATADIR <- args$datadir
# output directory
OUTROOT <- DATADIR
OUTROOT <- args$outdir
### PARAMETERS ###
tumor <- "Breast"
targets <- list(ER="clin:breast_carcinoma_estrogen_receptor_status", subtypes="clin:PAM50Call")
task <- "ER"
base.seed <- 78 # use 78 to (re)generate the "main" set of splits
# base seed for tr/ts splits generation
base.seed <- 78
if(!dir.exists(OUTROOT))
dir.create(OUTROOT, recursive=TRUE)
# task-independent: generate a merged file from available omics layers
mergedFile <- file.path(OUTROOT, tumor, "merged.txt")
mergedFile <- file.path(OUTROOT, "merged.txt")
if(file.exists(mergedFile)) {
mrg <- read_tsv(mergedFile)
} else {
......@@ -53,12 +59,12 @@ if(file.exists(mergedFile)) {
dplyr::rename(Sample=`prot:Sample`)
# inner-join all dataframes
mrg <- join_all(list(samples, expr.imputed, cnv, prot), by="Sample", type="inner")
mrg <- plyr::join_all(list(samples, expr.imputed, cnv, prot), by="Sample", type="inner")
# remove near-zero variance features
nzv <- nearZeroVar(mrg, saveMetrics=TRUE, foreach=TRUE)
nzv <- caret::nearZeroVar(mrg, saveMetrics=TRUE, foreach=TRUE)
mrg <- mrg[, !nzv$nzv]
# save
write_tsv(mrg, file.path(OUTROOT, tumor, "merged.txt"))
write_tsv(mrg, mergedFile)
}
# prepare multicore
......@@ -75,7 +81,7 @@ if(task=="ER") {
y <- factor(mrg.sub[[tgt]])
foreach(split.id=seq(0, 9), .verbose=FALSE) %dopar% {
outdir <- file.path(OUTROOT, tumor, "INF", ifelse(task=="ER", "breast_ER", task), split.id)
outdir <- file.path(OUTROOT, task, sprintf("split_seed_%d", base.seed), split.id)
if(!dir.exists(outdir))
dir.create(outdir, recursive=TRUE)
set.seed(base.seed+split.id)
......@@ -89,26 +95,13 @@ foreach(split.id=seq(0, 9), .verbose=FALSE) %dopar% {
write.table(train.lab, file=file.path(outdir, sprintf("labels_%s_tr.txt", task)), sep="\t", quote=FALSE, row.names=F, col.names=F)
write.table(test.lab, file=file.path(outdir, sprintf("labels_%s_ts.txt", task)), sep="\t", quote=FALSE, row.names=F, col.names=F)
# write juxtaposed datasets
# Gene+CNV
data.table::fwrite(dplyr::select(train.data, Sample, dplyr::starts_with("gene:"), dplyr::starts_with("cnv:")),
file.path(outdir, "gene_cnv_tr.txt"), sep="\t")
data.table::fwrite(dplyr::select(test.data, Sample, dplyr::starts_with("gene:"), dplyr::starts_with("cnv:")),
file.path(outdir, "gene_cnv_ts.txt"), sep="\t")
# Gene+Prot
data.table::fwrite(dplyr::select(train.data, Sample, dplyr::starts_with("gene:"), dplyr::starts_with("prot:")),
file.path(outdir, "gene_prot_tr.txt"), sep="\t")
data.table::fwrite(dplyr::select(test.data, Sample, dplyr::starts_with("gene:"), dplyr::starts_with("prot:")),
file.path(outdir, "gene_prot_ts.txt"), sep="\t")
# CNV+Prot
data.table::fwrite(dplyr::select(train.data, Sample, dplyr::starts_with("cnv:"), dplyr::starts_with("prot:")),
file.path(outdir, "cnv_prot_tr.txt"), sep="\t")
data.table::fwrite(dplyr::select(test.data, Sample, dplyr::starts_with("cnv:"), dplyr::starts_with("prot:")),
file.path(outdir, "cnv_prot_ts.txt"), sep="\t")
datasets <- list(tr=train.data, ts=test.data)
# write single layer datasets + clinical info
for(omic in c("gene", "cnv", "prot", "clin")) {
data.table::fwrite(dplyr::select(train.data, Sample, dplyr::starts_with(sprintf("%s:", omic))), file.path(outdir, sprintf("%s_tr.txt", omic)), sep="\t")
data.table::fwrite(dplyr::select(test.data, Sample, dplyr::starts_with(sprintf("%s:", omic))), file.path(outdir, sprintf("%s_ts.txt", omic)), sep="\t")
for(partition in names(datasets)) {
data.table::fwrite(dplyr::select(datasets[[partition]], Sample, dplyr::starts_with(sprintf("%s:", omic))), file.path(outdir, sprintf("%s_%s.txt", omic, partition)), sep="\t")
}
}
}
stopCluster(cl)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment