Commit b04198e6 authored by Marco Chierici
Browse files

Added split generation script for Breast-subtypes

parent 34e0f0e8
## Author: MC
library(tidyverse)
library(plyr)
library(caret)
# ---- Configuration ----
# Root of the Google Drive "INF" working folder. This path is machine-specific:
# edit it before running the script.
ROOT <- "~/Google Drive/work/INF"
# TCGA data directory; the per-tumor input tables are read from DATADIR/<tumor>/
DATADIR <- file.path(ROOT, "TCGA_data")
# ACGT data directory; only the clinical table (source of the PAM50 labels) is
# read from here
ACGTDIR <- file.path(ROOT, "ACGT_data")
# Output root. Results are written next to the inputs, under OUTROOT/<tumor>/
OUTROOT <- DATADIR
# Tumor to process. Used verbatim as the sub-directory name under DATADIR and
# OUTROOT, and lower-cased as the file name of the ACGT clinical table.
tumor <- "Breast"
# ---- Load the per-tumor TCGA tables ----
# All four inputs live in DATADIR/<tumor>/ and are parsed with read_tsv().
in.dir <- file.path(DATADIR, tumor)
samples <- read_tsv(file.path(in.dir, "clinical.csv"))
expr <- read_tsv(file.path(in.dir, "gene_expression.csv"))
cnv <- read_tsv(file.path(in.dir, "copy_number.csv"))
prot <- read_tsv(file.path(in.dir, "protein_abundance.csv"))

# ---- Clinical annotation ----
# Tag every clinical column with "clin:", then restore the plain join key.
names(samples) <- paste0("clin:", names(samples))
samples <- dplyr::rename(samples, Sample = `clin:Sample`)
# Keep only the columns that hold at least one non-NA value.
n.missing <- colSums(is.na(samples))
samples <- samples[, n.missing != nrow(samples)]

# ---- PAM50 subtype from the ACGT clinical table ----
acgt.file <- file.path(ACGTDIR, "original/clinical", str_to_lower(tumor))
samples.acgt <- read_tsv(acgt.file)
samples.acgt <- dplyr::select(
  samples.acgt,
  Sample = sampleID,
  "clin:PAM50" = PAM50_mRNA_nature2012
)
# ACGT sample IDs use "-" where the join key uses "."; PAM50 labels lose their
# blanks (e.g. "Luminal A" becomes "LuminalA").
samples.acgt[["Sample"]] <- gsub("-", ".", samples.acgt[["Sample"]])
samples.acgt[["clin:PAM50"]] <- gsub(" ", "", samples.acgt[["clin:PAM50"]])
samples <- dplyr::left_join(samples, samples.acgt, by = "Sample")

# ---- Omics layers ----
# Same scheme as for the clinical table: prefix all columns with the layer
# name, then rename the prefixed key back to "Sample".
names(expr) <- paste0("gene:", names(expr))
expr <- dplyr::rename(expr, Sample = `gene:Sample`)

names(cnv) <- paste0("cnv:", names(cnv))
cnv <- dplyr::rename(cnv, Sample = `cnv:Sample`)

names(prot) <- paste0("prot:", names(prot))
prot <- dplyr::rename(prot, Sample = `prot:Sample`)

# ---- Merge and save ----
# plyr::join_all joins the tables in list order on "Sample" (right joins).
layers <- list(samples, expr, cnv, prot)
mrg <- join_all(layers, by = "Sample", type = "right")
write_tsv(mrg, file.path(OUTROOT, tumor, "merged.txt"))
# ---- Build the classification target ----
# Name of the column holding the PAM50 subtype label.
tgt <- "clin:PAM50"
# Fail early with a clear message instead of silently producing an empty
# subset when the label column is missing from the merged table.
stopifnot(tgt %in% names(mrg))

# Keep only the samples annotated with one of the four subtypes of interest.
# Rows with any other label, or with a missing label, are discarded (%in%
# never returns NA). `[[` is used so that a plain vector is extracted whether
# `mrg` is a data.frame or a tibble.
subtypes <- c("Basal-like", "HER2-enriched", "LuminalA", "LuminalB")
mrg.sub <- mrg[mrg[[tgt]] %in% subtypes, ]

# Labels as a factor, in the same row order as mrg.sub.
y <- factor(mrg.sub[[tgt]])

# Remove the label column from the feature table. The column is dropped by
# comparing names against the *value* of `tgt`; passing the bare symbol `tgt`
# to dplyr::select() is ambiguous (a column literally named "tgt" would win)
# and is deprecated by tidyselect.
mrg.sub <- mrg.sub[, names(mrg.sub) != tgt, drop = FALSE]
# ---- Generate 10 train/test splits ----
# Feature sets written for every split: output file stem -> column prefixes.
# The list order is the order in which the files are written, and the prefix
# order is the column order inside each file.
layer.sets <- list(
  gene_cnv = c("gene:", "cnv:"),
  gene_prot = c("gene:", "prot:"),
  cnv_prot = c("cnv:", "prot:"),
  gene = "gene:",
  cnv = "cnv:",
  prot = "prot:"
)

# Return `data` reduced to the Sample column followed by every column whose
# name starts with one of `prefixes` (prefixes are taken in the given order).
select_layers <- function(data, prefixes) {
  stopifnot("Sample" %in% names(data))
  hits <- lapply(prefixes, function(p) which(startsWith(names(data), p)))
  data[, c(match("Sample", names(data)), unlist(hits)), drop = FALSE]
}

for (split.id in 0:9) {
  # One output directory per split: OUTROOT/<tumor>/INF/subtypes/<split.id>
  outdir <- file.path(OUTROOT, tumor, "INF", "subtypes", split.id)
  if (!dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE)
  }

  # Seeds run from 78 to 87, so split 0 uses set.seed(78), the seed of the
  # split that had been created previously.
  set.seed(78 + split.id)
  # 70% of the samples go to training; caret draws the partition based on y.
  train.idx <- createDataPartition(y = y, p = 0.7, list = FALSE)
  train.data <- mrg.sub[train.idx, ]
  train.lab <- y[train.idx]
  test.data <- mrg.sub[-train.idx, ]
  test.lab <- y[-train.idx]

  # Labels: one per line, no header and no row names.
  write.table(train.lab,
    file = file.path(outdir, "labels_subtypes_tr.txt"),
    sep = "\t", row.names = FALSE, col.names = FALSE
  )
  write.table(test.lab,
    file = file.path(outdir, "labels_subtypes_ts.txt"),
    sep = "\t", row.names = FALSE, col.names = FALSE
  )

  # Juxtaposed (two-layer) and single-layer datasets: <stem>_tr.txt holds the
  # training rows, <stem>_ts.txt the test rows.
  for (stem in names(layer.sets)) {
    prefixes <- layer.sets[[stem]]
    write_tsv(
      select_layers(train.data, prefixes),
      file.path(outdir, paste0(stem, "_tr.txt"))
    )
    write_tsv(
      select_layers(test.data, prefixes),
      file.path(outdir, paste0(stem, "_ts.txt"))
    )
  }
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment