Commit 92addcf3 authored by Nicole Bussola's avatar Nicole Bussola
Browse files

minor fix

parent 1c24e8a0
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# UMAP projection # UMAP projection
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import os import os
import pandas as pd import pandas as pd
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
from bokeh.plotting import figure, output_file, show, save from bokeh.plotting import figure, output_file, show, save
from bokeh.io import output_notebook, export_png from bokeh.io import output_notebook, export_png
from bokeh.palettes import colorblind from bokeh.palettes import colorblind
from bokeh.models import CategoricalColorMapper, ColumnDataSource, LassoSelectTool, WheelZoomTool, ZoomInTool, BoxZoomTool, ResetTool from bokeh.models import CategoricalColorMapper, ColumnDataSource, LassoSelectTool, WheelZoomTool, ZoomInTool, BoxZoomTool, ResetTool
from bokeh.layouts import gridplot from bokeh.layouts import gridplot
from bokeh.resources import CDN from bokeh.resources import CDN
from bokeh.embed import file_html from bokeh.embed import file_html
import numpy as np import numpy as np
import umap import umap
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Configure Bokeh so that subsequent show() calls render inside the notebook.
output_notebook()
``` ```
%%%% Output: display_data %%%% Output: display_data
%%%% Output: display_data %%%% Output: display_data
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Load Features datasets ## Load Features datasets
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Run configuration: these names are interpolated into every input and
# result path used by the cells below.
TASK = 'subtypes'
DATASET = 'tcga_breast'
MODEL = 'randomForest'
layers = 'gene_cnv_prot'

# Input data directory and classifier results directory.
PATH = Path('data', DATASET, TASK)
PATH_RESULTS = Path('results', DATASET, TASK, MODEL)

# Choose a random split for the train, test, and test2 files.
SPLIT = 2
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Tab-separated feature tables of the chosen split: the training file is
# used to fit UMAP, the TS and TS2 files to test it.
file_tr, file_test, file_test2 = (
    f'{PATH}/{SPLIT}/{layers}_{suffix}.txt' for suffix in ('tr', 'ts', 'ts2')
)

# First row is the header, first column becomes the index.
features_train, features_test, features_test2 = (
    pd.read_csv(table, sep='\t', header=0, index_col=0)
    for table in (file_tr, file_test, file_test2)
)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# When True, the features are restricted to the INF signature.
BEST = False

# Feature names come from the FEATURE_NAME column of the rSNFi KBest
# feature list of this split.
INF_feats = pd.read_csv(
    f'{PATH_RESULTS}/{SPLIT}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt',
    sep='\t',
)['FEATURE_NAME'].tolist()

# Column-restricted views of the three feature tables.
best_train, best_test, best_test2 = (
    table[INF_feats] for table in (features_train, features_test, features_test2)
)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Swap in the signature-restricted tables when BEST is enabled.
if BEST:
    features_train, features_test, features_test2 = (
        best_train,
        best_test,
        best_test2,
    )
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def _attach_labels(features, labels_file):
    """Append the class labels to *features* and split it into its parts.

    The labels are read from the first column of the header-less,
    tab-separated *labels_file* and stored in a new ``'labels'`` column of
    *features* (the frame is modified in place).

    Returns a tuple ``(samples, labels, values)``: the frame index, the
    ``'labels'`` column, and the array of every column except the last one.
    """
    class_labels = pd.read_csv(labels_file, sep='\t', header=None)[0].tolist()
    sample_ids = features.index
    features['labels'] = class_labels
    # 'labels' was just appended as the last column, so dropping the last
    # column leaves only the feature values.
    feature_values = features[features.columns[:-1]].values
    return sample_ids, features['labels'], feature_values


samples_tr, labels_tr, features_tr = _attach_labels(
    features_train, f'{PATH}/{SPLIT}/labels_{TASK}_tr.txt')
samples_test, labels_test, features_ts = _attach_labels(
    features_test, f'{PATH}/{SPLIT}/labels_{TASK}_ts.txt')
samples_test2, labels_test2, features_ts2 = _attach_labels(
    features_test2, f'{PATH}/{SPLIT}/labels_{TASK}_ts2.txt')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Aliases for the feature matrices passed to UMAP below.
train_data, test_data, test2_data = features_tr, features_ts, features_ts2
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Check: for each set, print the number of feature rows, samples and labels.
for feats, sample_ids, class_labels in (
    (features_tr, samples_tr, labels_tr),
    (features_ts, samples_test, labels_test),
    (features_ts2, samples_test2, labels_test2),
):
    print(len(feats), len(sample_ids), len(class_labels))
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Fit on the training data and transform the test set into the learned space ## Fit on the training data and transform the test set into the learned space
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Fit the 2-component embedding on the training data only.
mapper = umap.UMAP(
    n_neighbors=40,
    min_dist=0.01,
    n_components=2,
    metric='euclidean',
)
mapper.fit(train_data)

# Project both test sets into the space learned from the training data.
test_embedding, test2_embedding = (
    mapper.transform(held_out) for held_out in (test_data, test2_data)
)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Check: number of embedded points for the training, TS and TS2 sets.
tuple(
    len(points)
    for points in (mapper.embedding_, test_embedding, test2_embedding)
)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Plot UMAP 2D projection ## Plot UMAP 2D projection
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Hover tooltip rows; "@desc" reads the 'desc' column of each data source.
TOOLTIPS = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("desc", "@desc"),
]


def _class_source(embedding, samples, labels, theclass):
    """Build the Bokeh data source for the points of a single class.

    Parameters
    ----------
    embedding : 2-column array of embedded coordinates.
    samples : sample names, aligned row by row with *embedding*.
    labels : class labels, aligned row by row with *embedding*.
    theclass : the label value to select.

    Returns
    -------
    ColumnDataSource with columns ``x``, ``y`` and ``desc`` (sample name).
    """
    # A boolean mask keeps the coordinates numeric; stacking them next to
    # the sample names would push everything through an object array.
    in_class = np.asarray(labels) == theclass
    return ColumnDataSource(data=dict(
        x=embedding[in_class, 0],
        y=embedding[in_class, 1],
        desc=np.asarray(samples)[in_class].tolist(),
    ))


mycols = colorblind['Colorblind'][4]
myclasses = pd.unique(labels_tr).tolist()

p = figure(plot_width=1200, plot_height=1200, tooltips=TOOLTIPS,
           tools='save', toolbar_location="left")
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"

size = 12
# NOTE(review): zip() stops at the shorter sequence, so with the 4-entry
# palette any class beyond the fourth would not be drawn -- confirm the
# number of classes in the labels.
for col, theclass in zip(mycols, myclasses):
    # Training samples: circles, the only glyphs that get a legend entry.
    # NOTE(review): newer Bokeh releases spell this keyword `legend_label`;
    # check the installed version before switching.
    source_tr = _class_source(mapper.embedding_, samples_tr, labels_tr, theclass)
    p.circle(x='x', y='y', size=size, source=source_tr, color=col,
             alpha=0.8, legend=str(theclass))

    # TS samples: triangles in the same class colour.
    source_ts = _class_source(test_embedding, samples_test, labels_test, theclass)
    p.triangle(x='x', y='y', size=size, source=source_ts, color=col, alpha=0.8)

    # TS2 samples: diamonds in the same class colour.
    source_ts2 = _class_source(test2_embedding, samples_test2, labels_test2, theclass)
    p.diamond(x='x', y='y', size=size, source=source_ts2, color=col, alpha=0.8)

# Interactive tools, added in the same order as before.
p.add_tools(
    LassoSelectTool(),
    WheelZoomTool(),
    ZoomInTool(),
    ResetTool(),
    BoxZoomTool(),
)

# Axis and legend cosmetics; clicking a legend entry hides its glyphs.
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.major_label_text_font_size = "15pt"
p.legend.label_text_font_size = "20pt"
p.legend.location = "top_left"
p.legend.click_policy = 'hide'

# Save the plot; the file name records whether the INF signature was used.
if BEST:
    export_png(p, filename=f"subtypes_INF_split{SPLIT}.png")
else:
    export_png(p, filename=f"subtypes_juXT_split{SPLIT}.png")

show(p)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Grid plot for all other splits ## Grid plot for all other splits
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def range_with_ignore(start, stop, ignore):
    """Return the integers of ``[start, stop)`` with ``ignore`` left out.

    Parameters
    ----------
    start : int
        First value of the range (inclusive).
    stop : int
        End of the range (exclusive).
    ignore : int
        Value to drop. If it lies outside ``[start, stop)`` the full range
        is returned unchanged.

    Returns
    -------
    numpy.ndarray
        The remaining values in ascending order.
    """
    values = np.arange(start, stop)
    # Masking a single range never yields values outside [start, stop),
    # unlike joining two ranges split at `ignore`.
    return values[values != ignore]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
BEST = False BEST = False
mycols = colorblind['Colorblind'][4] mycols = colorblind['Colorblind'][4]
plots = [] plots = []
size = 12 size = 12
for split in range_with_ignore(0,9,SPLIT).tolist(): for split in range_with_ignore(0,10,SPLIT).tolist():
file_tr = f'{PATH}/{split}/{layers}_tr.txt' # Fit UMAP file_tr = f'{PATH}/{split}/{layers}_tr.txt' # Fit UMAP
file_test = f'{PATH}/{split}/{layers}_ts.txt' # test UMAP file_test = f'{PATH}/{split}/{layers}_ts.txt' # test UMAP
file_test2 = f'{PATH}/{split}/{layers}_ts2.txt' # test UMAP file_test2 = f'{PATH}/{split}/{layers}_ts2.txt' # test UMAP
features_train = pd.read_csv(file_tr, sep='\t', header=0, index_col=0) features_train = pd.read_csv(file_tr, sep='\t', header=0, index_col=0)
features_test = pd.read_csv(file_test, sep='\t', header=0, index_col=0) features_test = pd.read_csv(file_test, sep='\t', header=0, index_col=0)
features_test2 = pd.read_csv(file_test2, sep='\t', header=0, index_col=0) features_test2 = pd.read_csv(file_test2, sep='\t', header=0, index_col=0)
INF_feats = pd.read_csv(f'{PATH_RESULTS}/{split}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt', sep='\t')['FEATURE_NAME'].values.tolist() INF_feats = pd.read_csv(f'{PATH_RESULTS}/{split}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt', sep='\t')['FEATURE_NAME'].values.tolist()
best_train = features_train[INF_feats] best_train = features_train[INF_feats]
best_test = features_test[INF_feats] best_test = features_test[INF_feats]
best_test2 = features_test2[INF_feats] best_test2 = features_test2[INF_feats]
if BEST: if BEST:
features_train = best_train features_train = best_train
features_test = best_test features_test = best_test
features_test2 = best_test2 features_test2 = best_test2
samples_tr = features_train.index samples_tr = features_train.index
labels_tr = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_tr.txt', sep='\t', header=None)[0].tolist() labels_tr = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_tr.txt', sep='\t', header=None)[0].tolist()
features_train['labels'] = labels_tr features_train['labels'] = labels_tr
labels_tr = features_train['labels'] labels_tr = features_train['labels']
features_tr = features_train[features_train.columns[:-1]].values features_tr = features_train[features_train.columns[:-1]].values
samples_test = features_test.index samples_test = features_test.index
labels_test = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_ts.txt', sep='\t', header=None)[0].tolist() labels_test = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_ts.txt', sep='\t', header=None)[0].tolist()
features_test['labels'] = labels_test features_test['labels'] = labels_test
labels_test = features_test['labels'] labels_test = features_test['labels']
features_ts = features_test[features_test.columns[:-1]].values features_ts = features_test[features_test.columns[:-1]].values
samples_test2 = features_test2.index samples_test2 = features_test2.index
labels_test2 = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_ts2.txt', sep='\t', header=None)[0].tolist() labels_test2 = pd.read_csv(f'{PATH}/{split}/labels_{TASK}_ts2.txt', sep='\t', header=None)[0].tolist()
features_test2['labels'] = labels_test2 features_test2['labels'] = labels_test2
labels_test2 = features_test2['labels'] labels_test2 = features_test2['labels']
features_ts2 = features_test2[features_test2.columns[:-1]].values features_ts2 = features_test2[features_test2.columns[:-1]].values
train_data = features_tr train_data = features_tr
test_data = features_ts test_data = features_ts
test2_data = features_ts2 test2_data = features_ts2
mapper = umap.UMAP(n_neighbors=40, min_dist=0.01, n_components=2, metric='euclidean').fit(train_data) mapper = umap.UMAP(n_neighbors=40, min_dist=0.01, n_components=2, metric='euclidean').fit(train_data)
test_embedding = mapper.transform(test_data) test_embedding = mapper.transform(test_data)
test2_embedding = mapper.transform(test2_data) test2_embedding = mapper.transform(test2_data)
myclasses = pd.unique(labels_tr).tolist() myclasses = pd.unique(labels_tr).tolist()
p = figure(title=f'split {split}') p = figure(title=f'split {split}')
p.title.text_font_size = '25pt' p.title.text_font_size = '25pt'
p.title.align = "center" p.title.align = "center"
p.title.text_color = "black" p.title.text_color = "black"
p.title.text_font_size = "25px" p.title.text_font_size = "25px"
for col, theclass in zip(mycols,myclasses): for col, theclass in zip(mycols,myclasses):
idx_tr = np.where(np.array(labels_tr)==theclass)[0].tolist() idx_tr = np.where(np.array(labels_tr)==theclass)[0].tolist()
samples_train = np.expand_dims(samples_tr[idx_tr,], axis=1) samples_train = np.expand_dims(samples_tr[idx_tr,], axis=1)
data_tr = np.hstack((mapper.embedding_[idx_tr,], samples_train))