Commit 1c24e8a0 authored by Nicole Bussola's avatar Nicole Bussola
Browse files

add function in the grid plot

parent ff38b502
%% Cell type:markdown id: tags:
# UMAP projection
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from pathlib import Path
import numpy as np
from bokeh.plotting import figure, output_file, show, save
from bokeh.io import output_notebook, export_png
from bokeh.palettes import colorblind
from bokeh.models import CategoricalColorMapper, ColumnDataSource, LassoSelectTool, WheelZoomTool, ZoomInTool, BoxZoomTool, ResetTool
from bokeh.layouts import gridplot
from bokeh.resources import CDN
from bokeh.embed import file_html
import numpy as np
import umap
```
%% Cell type:code id: tags:
``` python
# Route Bokeh output to the notebook so that show() renders figures inline.
output_notebook()
```
%% Output
%% Cell type:markdown id: tags:
## Load Features datasets
%% Cell type:code id: tags:
``` python
# Experiment configuration: these names are interpolated into every input
# and output path used in the rest of the notebook.
TASK = 'subtypes'
DATASET = 'tcga_breast'
MODEL = 'randomForest'
layers = 'gene_cnv_prot'

# Input tables live under data/<DATASET>/<TASK>; results additionally nest
# by model name.
PATH = Path('data', DATASET, TASK)
PATH_RESULTS = Path('results', DATASET, TASK, MODEL)

# Choose a random split for the train, test, and test2 files.
SPLIT = 2
```
%% Cell type:code id: tags:
``` python
# Choose a random split for the train, test, and test2 files.
SPLIT = 2

# Tab-separated feature tables of the chosen split; the first column is
# used as the row index and the first row as the header.
split_dir = f'{PATH}/{SPLIT}'
file_tr = f'{split_dir}/{layers}_tr.txt'  # Fit UMAP
file_test = f'{split_dir}/{layers}_ts.txt'  # test UMAP on TS
file_test2 = f'{split_dir}/{layers}_ts2.txt'  # test UMAP on TS2

features_train, features_test, features_test2 = (
    pd.read_csv(table, sep='\t', header=0, index_col=0)
    for table in (file_tr, file_test, file_test2)
)
```
%% Cell type:code id: tags:
``` python
# Set BEST to True to restrict the features to the INF signature; it is
# left False here, so the full feature tables are used downstream.
BEST = False

# Feature names listed in the FEATURE_NAME column of the KBest feature list
# stored with the results of this split.
signature_file = f'{PATH_RESULTS}/{SPLIT}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt'
INF_feats = pd.read_csv(signature_file, sep='\t')['FEATURE_NAME'].values.tolist()

# Signature-only views of the three tables (only used when BEST is True).
best_train, best_test, best_test2 = (
    table[INF_feats] for table in (features_train, features_test, features_test2)
)
```
%% Cell type:code id: tags:
``` python
# When BEST is set, replace the full tables with their signature-only views.
if BEST:
    features_train, features_test, features_test2 = best_train, best_test, best_test2
```
%% Cell type:code id: tags:
``` python
def _attach_labels(table, label_file):
    """Append the labels read from *label_file* to *table* as a 'labels' column.

    The label file is read as a headerless tab-separated file and its first
    column is used. Returns ``(sample index, labels Series, feature matrix)``
    where the feature matrix holds every column of *table* except the last
    one (the 'labels' column that was just written).
    """
    table['labels'] = pd.read_csv(label_file, sep='\t', header=None)[0].tolist()
    feature_columns = table.columns[:-1]
    return table.index, table['labels'], table[feature_columns].values


samples_tr, labels_tr, features_tr = _attach_labels(
    features_train, f'{PATH}/{SPLIT}/labels_{TASK}_tr.txt')
samples_test, labels_test, features_ts = _attach_labels(
    features_test, f'{PATH}/{SPLIT}/labels_{TASK}_ts.txt')
samples_test2, labels_test2, features_ts2 = _attach_labels(
    features_test2, f'{PATH}/{SPLIT}/labels_{TASK}_ts2.txt')
```
%% Cell type:code id: tags:
``` python
# Aliases for the three feature matrices under the names the UMAP cells use.
train_data, test_data, test2_data = features_tr, features_ts, features_ts2
```
%% Cell type:code id: tags:
``` python
# Check: for each partition, the number of feature rows, sample names and
# labels should agree.
for matrix, names, labs in (
    (features_tr, samples_tr, labels_tr),
    (features_ts, samples_test, labels_test),
    (features_ts2, samples_test2, labels_test2),
):
    print(len(matrix), len(names), len(labs))
```
%% Cell type:markdown id: tags:
## Fit UMAP on the training data and transform both test sets into the learned space
%% Cell type:code id: tags:
``` python
# Learn a 2-D UMAP embedding from the training matrix only.
mapper = umap.UMAP(
    n_neighbors=40,
    min_dist=0.01,
    n_components=2,
    metric='euclidean',
)
mapper.fit(train_data)

# Project the two held-out sets with the already-fitted mapper.
test_embedding = mapper.transform(test_data)
test2_embedding = mapper.transform(test2_data)
```
%% Cell type:code id: tags:
``` python
# Check: number of embedded points for train, test and test2.
tuple(map(len, (mapper.embedding_, test_embedding, test2_embedding)))
```
%% Cell type:markdown id: tags:
## Plot UMAP 2D projection
%% Cell type:code id: tags:
``` python
# Hover tooltip: point index, cursor coordinates and the sample name stored
# in the 'desc' column of each data source.
TOOLTIPS = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("desc", "@desc"),
]

# One colour per class. The palette has 4 entries, so zip() below silently
# ignores any class beyond the fourth.
mycols = colorblind['Colorblind'][4]
myclasses = pd.unique(labels_tr).tolist()

p = figure(plot_width=1200, plot_height=1200, tooltips=TOOLTIPS, tools='save', toolbar_location="left")
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"
size = 12

# (embedding, sample names, labels, glyph method, show in legend?)
# Train points are circles, test points triangles, test2 points diamonds.
# Only the train glyphs carry a legend entry, so each class is listed once.
partitions = (
    (mapper.embedding_, samples_tr, labels_tr, p.circle, True),
    (test_embedding, samples_test, labels_test, p.triangle, False),
    (test2_embedding, samples_test2, labels_test2, p.diamond, False),
)

for col, theclass in zip(mycols, myclasses):
    for embedding, samples, labels, draw, in_legend in partitions:
        # Row positions of the samples belonging to this class.
        idx = np.where(np.array(labels) == theclass)[0].tolist()
        # Keep the coordinates numeric instead of stacking them with the
        # sample names, which would coerce everything to object dtype.
        source = ColumnDataSource(data=dict(
            x=embedding[idx, 0],
            y=embedding[idx, 1],
            desc=np.asarray(samples)[idx].tolist(),
        ))
        # 'legend_label' replaces the deprecated 'legend' keyword.
        legend_kwargs = {'legend_label': str(theclass)} if in_legend else {}
        draw(x='x', y='y', size=size, source=source, color=col, alpha=0.8, **legend_kwargs)

p.add_tools(LassoSelectTool(), WheelZoomTool(), ZoomInTool(), ResetTool(), BoxZoomTool())

p.legend.label_text_font_size = "20pt"
p.legend.location = "top_left"
p.legend.click_policy = 'hide'  # clicking a legend entry hides its glyphs
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.major_label_text_font_size = "15pt"

# Save the plot; the file name records whether the INF signature was used.
feature_tag = "INF" if BEST else "juXT"
export_png(p, filename=f"subtypes_{feature_tag}_split{SPLIT}.png")
show(p)
```
%% Cell type:markdown id: tags:
## Grid plot for all other splits
%% Cell type:code id: tags:
``` python
BEST = True


def range_with_ignore(start, stop, ignore):
    """Return the integers in ``[start, stop)`` with ``ignore`` left out.

    Parameters
    ----------
    start, stop : int
        Bounds of the half-open range, as for ``np.arange``.
    ignore : int
        Value to drop. If it lies outside ``[start, stop)`` the full range
        is returned unchanged.

    Returns
    -------
    numpy.ndarray
        1-D array of the remaining values in increasing order.
    """
    values = np.arange(start, stop)
    # Masking (rather than concatenating two aranges split at `ignore`)
    # keeps the result inside [start, stop) even when `ignore` is not.
    return values[values != ignore]
```
%% Cell type:code id: tags:
``` python
# Build one UMAP figure per split, skipping the split already plotted above
# (SPLIT). The figures are collected in `plots` for the grid layout.
BEST = False
mycols = colorblind['Colorblind'][4]  # 4 colours: classes beyond the fourth are not drawn
plots = []
size = 12

for split in range_with_ignore(0, 9, SPLIT).tolist():
    # --- Load the feature tables of this split -------------------------
    file_tr = f'{PATH}/{split}/{layers}_tr.txt'  # Fit UMAP
    file_test = f'{PATH}/{split}/{layers}_ts.txt'  # test UMAP
    file_test2 = f'{PATH}/{split}/{layers}_ts2.txt'  # test UMAP
    features_train = pd.read_csv(file_tr, sep='\t', header=0, index_col=0)
    features_test = pd.read_csv(file_test, sep='\t', header=0, index_col=0)
    features_test2 = pd.read_csv(file_test2, sep='\t', header=0, index_col=0)

    # --- Optionally restrict to the signature features of this split ---
    INF_feats = pd.read_csv(f'{PATH_RESULTS}/{split}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt', sep='\t')['FEATURE_NAME'].values.tolist()
    best_train = features_train[INF_feats]
    best_test = features_test[INF_feats]
    best_test2 = features_test2[INF_feats]
    if BEST:
        features_train = best_train
        features_test = best_test
        features_test2 = best_test2

    # --- Attach labels; the feature matrix is every column but 'labels' -
    samples_tr = features_train.index
    labels_tr = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_tr.txt', sep='\t', header=None)[0].tolist()
    features_train['labels'] = labels_tr
    labels_tr = features_train['labels']
    features_tr = features_train[features_train.columns[:-1]].values

    samples_test = features_test.index
    labels_test = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_ts.txt', sep='\t', header=None)[0].tolist()
    features_test['labels'] = labels_test
    labels_test = features_test['labels']
    features_ts = features_test[features_test.columns[:-1]].values

    samples_test2 = features_test2.index
    labels_test2 = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_ts2.txt', sep='\t', header=None)[0].tolist()
    features_test2['labels'] = labels_test2
    labels_test2 = features_test2['labels']
    features_ts2 = features_test2[features_test2.columns[:-1]].values

    train_data = features_tr
    test_data = features_ts
    test2_data = features_ts2

    # --- Fit on train, project both test sets --------------------------
    mapper = umap.UMAP(n_neighbors=40, min_dist=0.01, n_components=2, metric='euclidean').fit(train_data)
    test_embedding = mapper.transform(test_data)
    test2_embedding = mapper.transform(test2_data)

    # --- Draw the figure for this split --------------------------------
    myclasses = pd.unique(labels_tr).tolist()
    p = figure(title=f'split {split}')
    p.title.align = "center"
    p.title.text_color = "black"
    p.title.text_font_size = "25px"

    # (embedding, sample names, labels, glyph method, show in legend?)
    # Train points are circles, test points triangles, test2 points diamonds.
    partitions = (
        (mapper.embedding_, samples_tr, labels_tr, p.circle, True),
        (test_embedding, samples_test, labels_test, p.triangle, False),
        (test2_embedding, samples_test2, labels_test2, p.diamond, False),
    )
    for col, theclass in zip(mycols, myclasses):
        for embedding, samples, labels, draw, in_legend in partitions:
            # Row positions of the samples belonging to this class.
            idx = np.where(np.array(labels) == theclass)[0].tolist()
            source = ColumnDataSource(data=dict(
                x=embedding[idx, 0],
                y=embedding[idx, 1],
                desc=np.asarray(samples)[idx].tolist(),
            ))
            # 'legend_label' replaces the deprecated 'legend' keyword.
            legend_kwargs = {'legend_label': str(theclass)} if in_legend else {}
            draw(x='x', y='y', size=size, source=source, color=col, alpha=0.8, **legend_kwargs)

    p.legend.label_text_font_size = "20pt"
    p.yaxis.major_label_text_font_size = "15pt"
    p.xaxis.major_label_text_font_size = "15pt"
    plots.append(p)
```
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-13-710083258612> in <module>
31
32 samples_tr = features_train.index
---> 33 labels_tr = pd.read_csv(f'{PATH}/{SPLIT}/labels_{TASK}_tr.txt', sep='\t', header=None)[0].tolist()
34
35 features_train['labels'] = labels_tr
~/anaconda3/envs/inf_env/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
683 )
684
--> 685 return _read(filepath_or_buffer, kwds)
686
687 parser_f.__name__ = name
~/anaconda3/envs/inf_env/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
455
456 # Create the parser.
--> 457 parser = TextFileReader(fp_or_buf, **kwds)
458
459 if chunksize or iterator:
~/anaconda3/envs/inf_env/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
893 self.options["has_index_names"] = kwds["has_index_names"]
894
--> 895 self._make_engine(self.engine)
896
897 def close(self):
~/anaconda3/envs/inf_env/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
1133 def _make_engine(self, engine="c"):
1134 if engine == "c":
-> 1135 self._engine = CParserWrapper(self.f, **self.options)
1136 else:
1137 if engine == "python":
~/anaconda3/envs/inf_env/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
1915 kwds["usecols"] = self.usecols
1916
-> 1917 self._reader = parsers.TextReader(src, **kwds)
1918 self.unnamed_cols = self._reader.unnamed_cols
1919
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] File b'data/tcga_breast/subtypes/0/labels_subtypes_tr.txt' does not exist: b'data/tcga_breast/subtypes/0/labels_subtypes_tr.txt'
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
%% Cell type:code id: tags:
``` python
# Lay the per-split figures out three per row. Passing the flat list with
# ncols (instead of indexing plots[0]..plots[8]) works for any number of
# figures, e.g. when one split is left out of the loop.
grid = gridplot(plots, ncols=3, plot_width=1200, plot_height=1200)

# Save the grid; the file name records whether the INF signature was used.
if BEST:
    export_png(grid, filename="subtypes_INF_suppl.png")
else:
    export_png(grid, filename="subtypes_juXT_suppl.png")
show(grid)
```
%% Cell type:code id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment