# Commit 736968b3, authored by Marco Chierici
#
# Create prepare_AML.R
#
# Parent commit: 83317576
## Author: MC
library(tidyverse)
library(plyr)
library(caret)
# ---- Paths and run configuration ----
# Tumor type handled by this run.
tumor <- "AML"
# Local path of the shared Google Drive "INF" folder: adjust for your machine.
ROOT <- "~/Google Drive/work/INF"
# ACGT data directory (the clinical information lives here).
ACGTDIR <- file.path(ROOT, "ACGT_data")
# TCGA data directory.
DATADIR <- file.path(ROOT, "TCGA_data")
# Everything this script writes goes under the TCGA data directory.
OUTROOT <- DATADIR
# Read one omics table and return its most variable rows, transposed.
#
# The file is parsed as space-delimited text. The column named `sample_col`
# is moved into the row names and only the columns whose names end with
# `suffix` are kept. Each remaining row is scored by its inter-quartile range
# (IQR) across the kept columns; rows whose IQR is strictly greater than the
# median IQR are retained. Rows containing missing values get an NA score and
# are dropped.
#
# Args:
#   datafile:   path to the space-delimited input file.
#   sample_col: name of the column that supplies the row names. After the
#               final transposition its values become the column names of
#               the result.
#   suffix:     only columns whose names end with this string are kept.
#
# Returns:
#   A data.frame with one row per kept input column. The first column,
#   "Sample", holds the original column names; the remaining columns are the
#   rows that passed the IQR filter.
read_tcga <- function(datafile, sample_col = "ID", suffix = "01") {
  df <- as.data.frame(read_delim(datafile, " ", progress = FALSE)) %>%
    tibble::column_to_rownames(sample_col) %>%
    dplyr::select(ends_with(suffix))

  # IQR() stops on missing values, so score incomplete rows as NA explicitly;
  # they are then removed by the is.na() guard below.
  row_iqr <- apply(df, 1, function(x) if (anyNA(x)) NA_real_ else IQR(x))

  # quantile() also stops on NA unless told to skip them.
  cutoff <- quantile(row_iqr, probs = 0.5, na.rm = TRUE)
  selected <- !is.na(row_iqr) & row_iqr > cutoff

  # drop = FALSE keeps a data.frame (and its dimnames) even when only a
  # single column matched `suffix`.
  as.data.frame(t(df[selected, , drop = FALSE])) %>%
    tibble::rownames_to_column(var = "Sample")
}
# Sample-ID suffix to keep for each tumor type.
suffixes <- list(AML = "03", Colon = "01", Breast = "01")

# Input locations are keyed by the lower-cased tumor name.
tumor_key <- str_to_lower(tumor)
omics_dir <- file.path(ACGTDIR, "original", tumor_key)
keep_suffix <- suffixes[[tumor]]

# Clinical table first, then the three omics layers through read_tcga().
samples <- read_tsv(file.path(ACGTDIR, "original/clinical", tumor_key))
expr <- read_tcga(file.path(omics_dir, "exp"), suffix = keep_suffix)
meth <- read_tcga(file.path(omics_dir, "methy"), suffix = keep_suffix)
mirna <- read_tcga(file.path(omics_dir, "mirna"), suffix = keep_suffix)
# Tag every clinical column with "clin:", then restore the join key's name.
names(samples) <- paste0("clin:", names(samples))
samples <- dplyr::rename(samples, Sample = `clin:sampleID`)

# Keep only the clinical columns that contain at least one non-missing value.
has_data <- colSums(!is.na(samples)) > 0
samples <- samples[, has_data]

# Keep the rows whose sample ID ends with the tumor-specific suffix.
suffix_pattern <- sprintf("%s$", suffixes[[tumor]])
samples <- samples[grepl(suffix_pattern, samples$Sample), ]

# Replace dashes with dots in the IDs -- presumably to match the ID format of
# the omics tables these samples are compared against later (TODO confirm).
samples$Sample <- gsub("-", ".", samples$Sample)

# Tag each omics layer's columns with its own prefix; "Sample" stays the key.
names(expr) <- paste0("gene:", names(expr))
expr <- dplyr::rename(expr, Sample = `gene:Sample`)

names(meth) <- paste0("meth:", names(meth))
meth <- dplyr::rename(meth, Sample = `meth:Sample`)

names(mirna) <- paste0("mirna:", names(mirna))
mirna <- dplyr::rename(mirna, Sample = `mirna:Sample`)
# Restrict the clinical table to samples present in all three omics layers.
omics <- list(expr = expr$Sample, meth = meth$Sample, mirna = mirna$Sample)
sampleIntersect <- Reduce(intersect, omics)
samples <- samples[samples$Sample %in% sampleIntersect, ]

# Left-join the omics layers onto the clinical table, keyed by "Sample".
mrg <- join_all(list(samples, expr, meth, mirna), by = "Sample", type = "left")

# Remove near-zero variance features, as flagged by caret::nearZeroVar().
nzv <- nearZeroVar(mrg, saveMetrics = TRUE, foreach = TRUE)
# drop = FALSE keeps a data frame even if a single column survives.
mrg <- mrg[, !nzv$nzv, drop = FALSE]

# Save the merged table. The per-tumor output directory may not exist yet
# (the split directories are only created further down), so create it first.
merged_dir <- file.path(OUTROOT, tumor)
if (!dir.exists(merged_dir)) {
  dir.create(merged_dir, recursive = TRUE)
}
write_tsv(mrg, file.path(merged_dir, "merged.txt"))
# split tr/ts
# Name of the clinical column used as the classification target.
tgt <- "clin:_OS_IND"
# Fail loudly if the target is missing (e.g. removed by an earlier filter);
# `[[` would otherwise return NULL and silently yield an empty table.
stopifnot(tgt %in% names(mrg))

# Keep only the samples with a known target value.
has_target <- !is.na(mrg[[tgt]])
mrg.sub <- mrg[has_target, ]
y <- factor(mrg.sub[[tgt]])

# Drop the target column from the feature table. Selecting by name avoids
# the ambiguity of passing the variable `tgt` bare to dplyr::select(), which
# would refer to a column literally called "tgt" if one existed.
mrg.sub <- mrg.sub[, names(mrg.sub) != tgt, drop = FALSE]

table(y) # 0 = alive, 1 = deceased
# Class counts recorded when the script was written:
# 0 1
# 56 101
# Write ten train/test splits (ids 0..9) of the merged data. Each split gets
# its own directory holding the label files, the three pairwise juxtaposed
# datasets and the three single-layer datasets.
for (split.id in seq(0, 9)) {
  outdir <- file.path(OUTROOT, tumor, "INF", "OS", split.id)
  if (!dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE)
  }

  # make it so that the 1st split is for set.seed(78), which was the one
  # created previously
  set.seed(78 + split.id)

  # 70% of the samples go to training, the remainder to test.
  train.idx <- createDataPartition(y = y, p = 0.7, list = FALSE)
  train.data <- mrg.sub[train.idx, ]
  train.lab <- y[train.idx]
  test.data <- mrg.sub[-train.idx, ]
  test.lab <- y[-train.idx]

  # write labels (one per line, no header, no row names)
  write.table(train.lab, file = file.path(outdir, "labels_OS_tr.txt"),
              sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
  write.table(test.lab, file = file.path(outdir, "labels_OS_ts.txt"),
              sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)

  # write juxtaposed datasets: Gene+Meth, Gene+mirna, Meth+mirna
  for (pair in list(c("gene", "meth"), c("gene", "mirna"), c("meth", "mirna"))) {
    stem <- paste(pair, collapse = "_")
    write_tsv(
      dplyr::select(train.data, Sample,
                    starts_with(sprintf("%s:", pair[1])),
                    starts_with(sprintf("%s:", pair[2]))),
      file.path(outdir, sprintf("%s_tr.txt", stem))
    )
    write_tsv(
      dplyr::select(test.data, Sample,
                    starts_with(sprintf("%s:", pair[1])),
                    starts_with(sprintf("%s:", pair[2]))),
      file.path(outdir, sprintf("%s_ts.txt", stem))
    )
  }

  # write single layer datasets
  for (omic in c("gene", "meth", "mirna")) {
    write_tsv(
      dplyr::select(train.data, Sample, starts_with(sprintf("%s:", omic))),
      file.path(outdir, sprintf("%s_tr.txt", omic))
    )
    write_tsv(
      dplyr::select(test.data, Sample, starts_with(sprintf("%s:", omic))),
      file.path(outdir, sprintf("%s_ts.txt", omic))
    )
  }
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment