Commit f997db01 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Merge branch 'master' of gitlab.fbk.eu:MPBA/inf_revamped

parents 69275c80 02a4ef98
#%%
import os
import numpy as np
import pandas as pd
from mlpy import borda_count
from input_output import load_data

#%%
# Aggregate per-split feature rankings into a single Borda consensus ranking
# for the TCGA breast subtypes task.
DATA_PATH = 'data/tcga_breast/subtypes'
PATH = 'results/tcga_breast/subtypes'

mode = 'rSNF'
assert mode in ['juxt', 'rSNF', 'single']

N_LAYERS = 3
if N_LAYERS == 1:
    # single-omics run: only one layer name is needed
    assert mode == 'single'
    single_layer = 'prot'
else:
    layer1 = 'gene'
    layer2 = 'cnv'
    layer3 = 'prot'

# build the "<layer>_<layer>..." tag used in all data/result file names
if N_LAYERS == 3:
    layers = f'{layer1}_{layer2}_{layer3}'
    assert len(layers.split('_')) == 3
elif N_LAYERS == 2:
    layers = f'{layer1}_{layer2}'
    assert len(layers.split('_')) == 2
else:
    layers = f'{single_layer}'
    assert len(layers.split('_')) == 1

N_SPLIT = 10
CV_K = 5
CV_N = 10

# feature names are identical across splits, so split 0 is enough
_, var_names, _ = load_data(os.path.join(DATA_PATH, f'0/{layers}_tr.txt'))

rankings = []
#%%
# stack the CV_K * CV_N ranking rows of every split into one matrix
for i in range(N_SPLIT):
    # rSNF results use a different ranking file name than juxt/single
    if mode == 'rSNF':
        file_ranking = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_rankList_ranking.csv.gz')
    else:
        file_ranking = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_ranking.csv.gz')
    rank = pd.read_csv(file_ranking, header=None, sep='\t').values
    rankings.append(rank)

rankings = np.vstack(rankings)
# %%
BORDA_ID, _, BORDA_POS = borda_count(rankings)
len(rankings), BORDA_ID
# %%
# Build the consensus table directly instead of appending row-by-row:
# DataFrame.append was O(n^2) and has been removed in pandas >= 2.0.
borda_df = pd.DataFrame({
    'FEATURE_ID': BORDA_ID,
    'FEATURE_NAME': [var_names[i] for i in BORDA_ID],
    'MEAN_POS': [pos + 1 for pos in BORDA_POS],  # 1-based mean position
})
# NOTE(review): 'Spilts' is a typo for 'Splits' but is kept as-is —
# downstream scripts may look up this exact file name; confirm before renaming.
borda_df.to_csv(f"{PATH}/Borda_allSpilts_{mode}_{layers}.txt", sep='\t', index=False, float_format="%.3f")
# %%
#%%
import os
import numpy as np
import pandas as pd
import operator
from input_output import load_data

#%%
# Average, over all splits and CV iterations, the rank position of every
# feature ever selected by rSNFi on the TCGA breast ER task, then save the
# features sorted by mean position (best first).
DATA_PATH = 'data/tcga_breast/ER/'
PATH = 'results/tcga_breast/ER/'

mode = 'rSNFi'
N_LAYERS = 3
layer1 = 'gene'
layer2 = 'cnv'
layer3 = 'prot'

N_SPLITS = 10
CV_K = 5
CV_N = 10

# build the "<layer>_<layer>..." tag used in all result file names
if N_LAYERS == 3:
    layers = f'{layer1}_{layer2}_{layer3}'
    assert len(layers.split('_')) == 3
elif N_LAYERS == 2:
    layers = f'{layer1}_{layer2}'
    assert len(layers.split('_')) == 2
else:
    print('number of layers must be > 1')

#%%
# union of feature names selected in any split
all_feats = []
for i in range(N_SPLITS):
    file_featureList = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_featurelist.txt')
    feats = pd.read_csv(file_featureList, sep='\t')
    all_feats.extend(list(feats.FEATURE_NAME))
all_feats = list(set(all_feats))

#%%
positions = dict()
means = dict()
# default position for a feature absent from a split: worst rank (len-1),
# one slot per (split, CV iteration)
x = ((len(all_feats) - 1) * np.ones((1, N_SPLITS * CV_K * CV_N)))

#%%
for i in all_feats:
    # .tolist() makes a fresh list per feature, so entries are independent
    positions[i] = x.tolist()[0]
    means[i] = 0.0

#%%
for i in range(N_SPLITS):
    file_featureList = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_featurelist.txt')
    feats = pd.read_csv(file_featureList, sep='\t')
    # z maps this split's local FEATURE_ID -> FEATURE_NAME
    z = [None] * len(feats)
    for k in range(len(feats)):
        z[feats.FEATURE_ID[k]] = feats.FEATURE_NAME[k]
    file_ranking = os.path.join(PATH, f'{i}/{mode}/{layers}_tr_RandomForest_KBest_ranking.csv.gz')
    rankings = pd.read_csv(file_ranking, header=None, sep='\t')
    # row j = one CV iteration; column k = rank position of feature id
    # rankings.iloc[j][k]; record position k for that feature
    for j in range(CV_K * CV_N):
        for k in range(rankings.shape[1]):
            positions[z[rankings.iloc[j][k]]][i * (CV_K * CV_N) + j] = 1.0 * k

#%%
for i in all_feats:
    means[i] = np.mean(positions[i])

#%%
# sort features by mean position, best (lowest) first
sorted_means = sorted(means.items(), key=operator.itemgetter(1))

#%%
borda_df = pd.DataFrame(sorted_means, columns=['FEATURE_NAME', 'MEAN_POS'])
# NOTE(review): 'Spilts' typo kept for compatibility with the sibling script's
# output naming; confirm before renaming.
borda_df.to_csv(f"{PATH}/Borda_allSpilts_{mode}_{layers}.txt", sep='\t', index=False, float_format="%.3f")
# %%
## Author: MC
## Merge TCGA omics layers (expression / methylation / miRNA) with clinical
## data and write per-split train/test files for the INF pipeline.
# NOTE: plyr is loaded after tidyverse, so it masks several dplyr verbs —
# that is why dplyr:: is used explicitly throughout this script.
library(tidyverse)
library(plyr)
library(caret)
# GDrive INF folder: edit this
ROOT <- "~/Google Drive/work/INF"
# TCGA data directory
DATADIR <- file.path(ROOT, "TCGA_data")
# ACGT data dir (for clinical info)
ACGTDIR <- file.path(ROOT, "ACGT_data")
# output directory
OUTROOT <- DATADIR
# which tumor to process; must be a key of `suffixes` defined below
tumor <- "AML"
# Read one TCGA omics matrix (features in rows, samples in columns) from a
# space-delimited file, keep only sample columns ending in `suffix`, drop the
# lower half of features by IQR, and return a samples-x-features data frame
# with sample IDs in a "Sample" column.
read_tcga <- function(datafile, sample_col="ID", suffix="01") {
  df <- as.data.frame(read_delim(datafile, " ", progress=FALSE)) %>%
    tibble::column_to_rownames(sample_col) %>%
    dplyr::select(ends_with(suffix))
  # variance filter — actually computed as per-feature IQR across samples;
  # keep features strictly above the median IQR (NA IQRs are dropped)
  vars <- apply(df, 1, IQR)
  quant <- quantile(vars, probs=0.5)
  selected <- !is.na(vars) & vars > quant
  # transpose to samples x features; rownames become the "Sample" column
  as.data.frame(t(df[selected, ])) %>% tibble::rownames_to_column(var="Sample")
}
# list of ID suffixes to keep (tumor-dependent TCGA barcode sample-type codes)
suffixes <- list()
suffixes[["AML"]] <- "03"
suffixes[["Colon"]] <- "01"
suffixes[["Breast"]] <- "01"

# clinical table plus the three omics layers, filtered to the tumor's suffix
samples <- read_tsv(file.path(ACGTDIR, "original/clinical", str_to_lower(tumor)))
expr <- read_tcga(file.path(ACGTDIR, "original", str_to_lower(tumor), "exp"), suffix=suffixes[[tumor]])
meth <- read_tcga(file.path(ACGTDIR, "original", str_to_lower(tumor), "methy"), suffix=suffixes[[tumor]])
mirna <- read_tcga(file.path(ACGTDIR, "original", str_to_lower(tumor), "mirna"), suffix=suffixes[[tumor]])

# prefix every column with its layer name, keeping "Sample" as the join key
samples <- samples %>% setNames(paste0("clin:", names(.))) %>%
  dplyr::rename(Sample=`clin:sampleID`)
# throw away columns with all NAs
samples <- samples[, colSums(is.na(samples)) != nrow(samples)]
samples <- samples[grepl(sprintf("%s$", suffixes[[tumor]]), samples$Sample), ]
# "-" -> "." presumably to match the mangled omics sample IDs — verify
samples$Sample <- gsub("-", ".", samples$Sample)
expr <- expr %>% setNames(paste0("gene:", names(.))) %>%
  dplyr::rename(Sample=`gene:Sample`)
meth <- meth %>% setNames(paste0("meth:", names(.))) %>%
  dplyr::rename(Sample=`meth:Sample`)
mirna <- mirna %>% setNames(paste0("mirna:", names(.))) %>%
  dplyr::rename(Sample=`mirna:Sample`)
# get common samples
omics <- list(expr=expr$Sample, meth=meth$Sample, mirna=mirna$Sample)
sampleIntersect <- Reduce(intersect, omics)
samples <- samples[samples$Sample %in% sampleIntersect, ]
# join all dataframes
mrg <- join_all(list(samples, expr, meth, mirna), by="Sample", type="left")
# remove near-zero variance features
nzv <- nearZeroVar(mrg, saveMetrics=TRUE, foreach=TRUE)
mrg <- mrg[, !nzv$nzv]
# save
write_tsv(mrg, file.path(OUTROOT, tumor, "merged.txt"))
# split tr/ts on overall-survival status; drop rows with missing label
tgt <- "clin:_OS_IND"
mrg.sub <- mrg[!is.na(mrg[, tgt]), ]
y <- factor(mrg.sub[, tgt])
# FIX: -`tgt` selected a (nonexistent) column literally named "tgt";
# all_of(tgt) drops the column whose name is stored in the variable `tgt`.
mrg.sub <- dplyr::select(mrg.sub, -dplyr::all_of(tgt))
table(y) # 0 = alive, 1 = deceased
# 0 1
# 56 101
# Write 10 stratified 70/30 train/test splits. Do NOT restructure this loop:
# the seed is set immediately before createDataPartition, so any reordering
# changes the generated splits and breaks reproducibility.
for(split.id in seq(0, 9)) {
outdir <- file.path(OUTROOT, tumor, "INF", "OS", split.id)
if(!dir.exists(outdir))
dir.create(outdir, recursive=TRUE)
# make it so that the 1st split is for set.seed(78), which was the one created previously
set.seed(78+split.id)
# stratified partition: 70% train, preserving class proportions of y
train.idx <- createDataPartition(y=y, p=0.7, list=FALSE)
train.data <- mrg.sub[train.idx,]
train.lab <- y[train.idx]
test.data <- mrg.sub[-train.idx,]
test.lab <- y[-train.idx]
# write labels (one per line, no header)
write.table(train.lab, file=file.path(outdir, "labels_OS_tr.txt"), sep="\t", quote=FALSE, row.names=F, col.names=F)
write.table(test.lab, file=file.path(outdir, "labels_OS_ts.txt"), sep="\t", quote=FALSE, row.names=F, col.names=F)
# write juxtaposed datasets (pairwise layer combinations, selected by the
# "layer:" column-name prefixes added earlier)
# Gene+Meth
write_tsv(dplyr::select(train.data, Sample, starts_with("gene:"), starts_with("meth:")),
file.path(outdir, "gene_meth_tr.txt"))
write_tsv(dplyr::select(test.data, Sample, starts_with("gene:"), starts_with("meth:")),
file.path(outdir, "gene_meth_ts.txt"))
# Gene+mirna
write_tsv(dplyr::select(train.data, Sample, starts_with("gene:"), starts_with("mirna:")),
file.path(outdir, "gene_mirna_tr.txt"))
write_tsv(dplyr::select(test.data, Sample, starts_with("gene:"), starts_with("mirna:")),
file.path(outdir, "gene_mirna_ts.txt"))
# Meth+mirna
write_tsv(dplyr::select(train.data, Sample, starts_with("meth:"), starts_with("mirna:")),
file.path(outdir, "meth_mirna_tr.txt"))
write_tsv(dplyr::select(test.data, Sample, starts_with("meth:"), starts_with("mirna:")),
file.path(outdir, "meth_mirna_ts.txt"))
# write single layer datasets
for(omic in c("gene", "meth", "mirna")) {
write_tsv(select(train.data, Sample, starts_with(sprintf("%s:", omic))), file.path(outdir, sprintf("%s_tr.txt", omic)))
write_tsv(select(test.data, Sample, starts_with(sprintf("%s:", omic))), file.path(outdir, sprintf("%s_ts.txt", omic)))
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment