Commit 92addcf3 authored by Nicole Bussola
Browse files

minor fix

parent 1c24e8a0
%% Cell type:markdown id: tags:
# UMAP projection
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from pathlib import Path
import numpy as np
from bokeh.plotting import figure, output_file, show, save
from bokeh.io import output_notebook, export_png
from bokeh.palettes import colorblind
from bokeh.models import CategoricalColorMapper, ColumnDataSource, LassoSelectTool, WheelZoomTool, ZoomInTool, BoxZoomTool, ResetTool
from bokeh.layouts import gridplot
from bokeh.resources import CDN
from bokeh.embed import file_html
import numpy as np
import umap
```
%% Cell type:code id: tags:
``` python
# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()
```
%%%% Output: display_data
%%%% Output: display_data
%% Cell type:markdown id: tags:
## Load Features datasets
%% Cell type:code id: tags:
``` python
# Experiment configuration; these values are only used to build the input
# (PATH) and results (PATH_RESULTS) directories below.
TASK = 'subtypes'
DATASET = 'tcga_breast'
MODEL = 'randomForest'
# Prefix of the feature-table file names (e.g. '<layers>_tr.txt').
layers = 'gene_cnv_prot'
PATH = Path('data') / DATASET / TASK
PATH_RESULTS = Path('results') / DATASET / TASK / MODEL
# Split whose train, test and test2 files are used for the main plot;
# the value is an arbitrary pick among the available splits.
SPLIT = 2
```
%% Cell type:code id: tags:
``` python
# Tab-separated feature tables of the selected split. The training table is
# used to fit UMAP; the two test tables are projected with the fitted model.
file_tr = f'{PATH}/{SPLIT}/{layers}_tr.txt'
file_test = f'{PATH}/{SPLIT}/{layers}_ts.txt'
file_test2 = f'{PATH}/{SPLIT}/{layers}_ts2.txt'

# First row is the header, first column is the sample index.
features_train, features_test, features_test2 = [
    pd.read_csv(table_path, sep='\t', header=0, index_col=0)
    for table_path in (file_tr, file_test, file_test2)
]
```
%% Cell type:code id: tags:
``` python
# Flag: restrict the features to the INF signature.
BEST = False

# Feature names listed in the FEATURE_NAME column of the KBest feature list.
signature_table = pd.read_csv(f'{PATH_RESULTS}/{SPLIT}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt', sep='\t')
INF_feats = signature_table['FEATURE_NAME'].values.tolist()

# Column subsets of the three tables limited to the signature features.
best_train, best_test, best_test2 = [
    table[INF_feats]
    for table in (features_train, features_test, features_test2)
]
```
%% Cell type:code id: tags:
``` python
# Swap in the signature-restricted tables when the BEST flag is set.
if BEST:
    features_train, features_test, features_test2 = best_train, best_test, best_test2
```
%% Cell type:code id: tags:
``` python
def _attach_labels(frame, labels_file):
    """Append the labels read from *labels_file* to *frame* as a 'labels' column.

    The labels file is tab-separated without a header; its first column is
    used. Returns a tuple ``(sample index, labels column, feature matrix)``
    where the feature matrix holds every column of *frame* except the last
    one (the appended labels).
    """
    sample_ids = frame.index
    frame['labels'] = pd.read_csv(labels_file, sep='\t', header=None)[0].tolist()
    return sample_ids, frame['labels'], frame[frame.columns[:-1]].values


samples_tr, labels_tr, features_tr = _attach_labels(
    features_train, f'{PATH}/{SPLIT}/labels_{TASK}_tr.txt')
samples_test, labels_test, features_ts = _attach_labels(
    features_test, f'{PATH}/{SPLIT}/labels_{TASK}_ts.txt')
samples_test2, labels_test2, features_ts2 = _attach_labels(
    features_test2, f'{PATH}/{SPLIT}/labels_{TASK}_ts2.txt')
```
%% Cell type:code id: tags:
``` python
# Aliases for the feature matrices that are handed to UMAP.
train_data, test_data, test2_data = features_tr, features_ts, features_ts2
```
%% Cell type:code id: tags:
``` python
# Check
print(len(features_tr),len(samples_tr),len(labels_tr))
print(len(features_ts),len(samples_test),len(labels_test))
print(len(features_ts2),len(samples_test2),len(labels_test2))
```
%% Cell type:markdown id: tags:
## Fit on the training data and transform the two test sets into the learned space
%% Cell type:code id: tags:
``` python
# Learn the 2-D embedding on the training matrix only ...
umap_model = umap.UMAP(
    n_neighbors=40,
    min_dist=0.01,
    n_components=2,
    metric='euclidean',
)
mapper = umap_model.fit(train_data)

# ... then project both test matrices with the fitted model.
test_embedding, test2_embedding = [
    mapper.transform(held_out) for held_out in (test_data, test2_data)
]
```
%% Cell type:code id: tags:
``` python
# Check
len(mapper.embedding_), len(test_embedding),len(test2_embedding)
```
%% Cell type:markdown id: tags:
## Plot UMAP 2D projection
%% Cell type:code id: tags:
``` python
# Hover tooltip: point index, data coordinates and the sample name stored in
# the "desc" column of each ColumnDataSource.
TOOLTIPS = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("desc", "@desc"),
]

# NOTE: the palette holds 4 colours and zip() below stops at the shorter
# sequence, so at most 4 classes are drawn.
mycols = colorblind['Colorblind'][4]
myclasses = pd.unique(labels_tr).tolist()
size = 12


def _class_source(embedding, samples, labels, theclass):
    """Build the ColumnDataSource for the points of one class.

    Parameters:
        embedding: 2-D array with one embedded point per row (x, y columns).
        samples: sample names, in the same row order as *embedding*.
        labels: class labels, in the same row order as *embedding*.
        theclass: the label value to select.

    Returns:
        A ColumnDataSource with columns ``x``, ``y`` and ``desc`` (sample name)
        restricted to the rows whose label equals *theclass*.
    """
    mask = np.asarray(labels) == theclass
    return ColumnDataSource(data=dict(
        x=embedding[mask, 0],
        y=embedding[mask, 1],
        desc=np.asarray(samples)[mask].tolist(),
    ))


p = figure(plot_width=1200, plot_height=1200, tooltips=TOOLTIPS, tools='save', toolbar_location="left")
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"

for col, theclass in zip(mycols, myclasses):
    # Training points: circles. Only these carry a legend entry, so every
    # class appears once in the legend.
    source_tr = _class_source(mapper.embedding_, samples_tr, labels_tr, theclass)
    p.circle(x='x', y='y', size=size, source=source_tr, color=col, alpha=0.8,
             legend_label=str(theclass))

    # First test set: triangles.
    source_ts = _class_source(test_embedding, samples_test, labels_test, theclass)
    p.triangle(x='x', y='y', size=size, source=source_ts, color=col, alpha=0.8)

    # Second test set: diamonds.
    source_ts2 = _class_source(test2_embedding, samples_test2, labels_test2, theclass)
    p.diamond(x='x', y='y', size=size, source=source_ts2, color=col, alpha=0.8)

# Interactive tools, added in the same order as before.
p.add_tools(LassoSelectTool(), WheelZoomTool(), ZoomInTool(), ResetTool(), BoxZoomTool())

p.legend.label_text_font_size = "20pt"
p.legend.location = "top_left"
p.legend.click_policy = 'hide'
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.major_label_text_font_size = "15pt"

# Save the plot; the file name records whether the INF signature was used.
if BEST:
    export_png(p, filename=f"subtypes_INF_split{SPLIT}.png")
else:
    export_png(p, filename=f"subtypes_juXT_split{SPLIT}.png")
show(p)
```
%% Cell type:markdown id: tags:
## Grid plot for all other splits
%% Cell type:code id: tags:
``` python
def range_with_ignore(start, stop, ignore):
    """Return the integers in ``[start, stop)`` with ``ignore`` left out.

    Parameters:
        start: first value of the range (inclusive).
        stop: end of the range (exclusive).
        ignore: value to drop from the range.

    Returns:
        A 1-D NumPy array of the remaining values in ascending order. When
        ``ignore`` is not inside ``[start, stop)`` the full range is returned
        unchanged, so the result never contains values outside the range.
    """
    values = np.arange(start, stop)
    # Filtering (instead of stitching two aranges around `ignore`) keeps the
    # result inside [start, stop) even when `ignore` lies outside of it.
    return values[values != ignore]
```
%% Cell type:code id: tags:
``` python
# When True, restrict every split to its INF signature features.
BEST = False
# NOTE: the palette holds 4 colours and zip() below stops at the shorter
# sequence, so at most 4 classes are drawn per plot.
mycols = colorblind['Colorblind'][4]
plots = []
size = 12


def _read_features(table_path):
    """Read a tab-separated feature table (header row, sample index column)."""
    return pd.read_csv(table_path, sep='\t', header=0, index_col=0)


def _read_labels(labels_path):
    """Read the first column of a header-less, tab-separated labels file."""
    return pd.read_csv(labels_path, sep='\t', header=None)[0].values


def _class_source(embedding, samples, labels, theclass):
    """Build the ColumnDataSource (x, y, desc) for the points of one class.

    *embedding* has one embedded point per row; *samples* and *labels* follow
    the same row order. Only rows whose label equals *theclass* are kept.
    """
    mask = np.asarray(labels) == theclass
    return ColumnDataSource(data=dict(
        x=embedding[mask, 0],
        y=embedding[mask, 1],
        desc=np.asarray(samples)[mask].tolist(),
    ))


# One plot per split other than SPLIT. A single loop over 0..9 is required:
# the grid built from `plots` afterwards indexes plots[0]..plots[8].
for split in range_with_ignore(0, 10, SPLIT).tolist():
    features_train = _read_features(f'{PATH}/{split}/{layers}_tr.txt')   # fit UMAP
    features_test = _read_features(f'{PATH}/{split}/{layers}_ts.txt')    # test UMAP
    features_test2 = _read_features(f'{PATH}/{split}/{layers}_ts2.txt')  # test UMAP

    if BEST:
        # The signature file is only needed when the restriction is applied.
        INF_feats = pd.read_csv(
            f'{PATH_RESULTS}/{split}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt',
            sep='\t')['FEATURE_NAME'].values.tolist()
        features_train = features_train[INF_feats]
        features_test = features_test[INF_feats]
        features_test2 = features_test2[INF_feats]

    samples_tr = features_train.index
    samples_test = features_test.index
    samples_test2 = features_test2.index
    labels_tr = _read_labels(f'{PATH}/{split}/labels_{TASK}_tr.txt')
    labels_test = _read_labels(f'{PATH}/{split}/labels_{TASK}_ts.txt')
    labels_test2 = _read_labels(f'{PATH}/{split}/labels_{TASK}_ts2.txt')

    # Fit on the training features, then project both test sets.
    mapper = umap.UMAP(n_neighbors=40, min_dist=0.01, n_components=2,
                       metric='euclidean').fit(features_train.values)
    test_embedding = mapper.transform(features_test.values)
    test2_embedding = mapper.transform(features_test2.values)

    myclasses = pd.unique(labels_tr).tolist()

    p = figure(title=f'split {split}')
    p.title.align = "center"
    p.title.text_color = "black"
    p.title.text_font_size = "25px"

    for col, theclass in zip(mycols, myclasses):
        # Training points: circles (the only glyphs with a legend entry).
        source_tr = _class_source(mapper.embedding_, samples_tr, labels_tr, theclass)
        p.circle(x='x', y='y', size=size, source=source_tr, color=col, alpha=0.8,
                 legend_label=str(theclass))

        # First test set: triangles.
        source_ts = _class_source(test_embedding, samples_test, labels_test, theclass)
        p.triangle(x='x', y='y', size=size, source=source_ts, color=col, alpha=0.8)

        # Second test set: diamonds.
        source_ts2 = _class_source(test2_embedding, samples_test2, labels_test2, theclass)
        p.diamond(x='x', y='y', size=size, source=source_ts2, color=col, alpha=0.8)

    p.legend.label_text_font_size = "20pt"
    p.yaxis.major_label_text_font_size = "15pt"
    p.xaxis.major_label_text_font_size = "15pt"
    plots.append(p)
```
%%%% Output: stream
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
%% Cell type:code id: tags:
``` python
# Arrange the first nine per-split plots row by row in a 3 x 3 grid.
grid_rows = [[plots[3 * row + col] for col in range(3)] for row in range(3)]
grid = gridplot(grid_rows, plot_width=1200, plot_height=1200)

# The file name records whether the INF signature was used.
suppl_name = "subtypes_INF_suppl.png" if BEST else "subtypes_juXT_suppl.png"
export_png(grid, filename=suppl_name)
show(grid)
```
%% Cell type:code id: tags:
``` python
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment