Commit c6f6b7e3 authored by Alessia Marcolini
Browse files

Black formatting

parent 92addcf3
......@@ -11,70 +11,72 @@
import numpy as np
from bokeh.plotting import figure, output_file, show, save
from bokeh.io import output_notebook, export_png
from bokeh.palettes import colorblind
from bokeh.models import CategoricalColorMapper, ColumnDataSource, LassoSelectTool, WheelZoomTool, ZoomInTool, BoxZoomTool, ResetTool
from bokeh.models import (
CategoricalColorMapper,
ColumnDataSource,
LassoSelectTool,
WheelZoomTool,
ZoomInTool,
BoxZoomTool,
ResetTool,
)
from bokeh.layouts import gridplot
from bokeh.resources import CDN
from bokeh.embed import file_html
import numpy as np
import umap
```
%% Cell type:code id: tags:
``` python
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()
```
%%%% Output: display_data
%%%% Output: display_data
%% Cell type:markdown id: tags:
## Load Features datasets
%% Cell type:code id: tags:
``` python
# Experiment identifiers; they are combined below into the data/result paths.
TASK = "subtypes"
DATASET = "tcga_breast"
MODEL = "randomForest"

# Name of the omics-layer combination; used as the prefix of the feature files.
layers = "gene_cnv_prot"

# Split whose train, test and test2 files are loaded for the main figure.
SPLIT = 2

# Input data and classifier results live under parallel directory trees.
PATH = Path("data") / DATASET / TASK
PATH_RESULTS = Path("results") / DATASET / TASK / MODEL
```
%% Cell type:code id: tags:
``` python
# Feature tables of the selected split. Each file is read exactly once.
file_tr = f"{PATH}/{SPLIT}/{layers}_tr.txt"  # Fit UMAP
file_test = f"{PATH}/{SPLIT}/{layers}_ts.txt"  # test UMAP on TS
file_test2 = f"{PATH}/{SPLIT}/{layers}_ts2.txt"  # test UMAP on TS2

# Tab-separated tables: first row is the header, first column becomes the index
# (the index is later used as the sample identifiers).
features_train = pd.read_csv(file_tr, sep="\t", header=0, index_col=0)
features_test = pd.read_csv(file_test, sep="\t", header=0, index_col=0)
features_test2 = pd.read_csv(file_test2, sep="\t", header=0, index_col=0)
```
%% Cell type:code id: tags:
``` python
BEST = False  # restrict the features to the INF signature

# Feature names listed in the "FEATURE_NAME" column of the rSNFi feature list
# stored with the classifier results of this split (read once).
INF_feats = pd.read_csv(
    f"{PATH_RESULTS}/{SPLIT}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt",
    sep="\t",
)["FEATURE_NAME"].values.tolist()

# Column subsets limited to those features. They are built regardless of BEST;
# NOTE(review): the code that swaps them in when BEST is True is not visible
# in this cell, confirm it follows.
best_train = features_train[INF_feats]
best_test = features_test[INF_feats]
best_test2 = features_test2[INF_feats]
```
......@@ -90,31 +92,37 @@
%% Cell type:code id: tags:
``` python
# For each of the three sets: keep the sample ids, read the labels (one per
# line, no header) and extract the plain feature matrix.
#
# Storing the label list as a "labels" column and reading it back turns it into
# a Series that carries the frame's sample index. The feature matrix is then
# every column except the last one, i.e. without the column just added
# (assumes the frame had no "labels" column beforehand).

# Training set
samples_tr = features_train.index
labels_tr = pd.read_csv(
    f"{PATH}/{SPLIT}/labels_{TASK}_tr.txt", sep="\t", header=None
)[0].tolist()
features_train["labels"] = labels_tr
labels_tr = features_train["labels"]
features_tr = features_train[features_train.columns[:-1]].values

# Test set
samples_test = features_test.index
labels_test = pd.read_csv(
    f"{PATH}/{SPLIT}/labels_{TASK}_ts.txt", sep="\t", header=None
)[0].tolist()
features_test["labels"] = labels_test
labels_test = features_test["labels"]
features_ts = features_test[features_test.columns[:-1]].values

# Second test set
samples_test2 = features_test2.index
labels_test2 = pd.read_csv(
    f"{PATH}/{SPLIT}/labels_{TASK}_ts2.txt", sep="\t", header=None
)[0].tolist()
features_test2["labels"] = labels_test2
labels_test2 = features_test2["labels"]
features_ts2 = features_test2[features_test2.columns[:-1]].values
```
%% Cell type:code id: tags:
......@@ -126,33 +134,34 @@
%% Cell type:code id: tags:
``` python
# Check: within each set the feature rows, sample ids and labels must have the
# same length. Each triple is printed once.
print(len(features_tr), len(samples_tr), len(labels_tr))
print(len(features_ts), len(samples_test), len(labels_test))
print(len(features_ts2), len(samples_test2), len(labels_test2))
```
%% Cell type:markdown id: tags:
## Fit on the training data and transform the test set into the learned space
%% Cell type:code id: tags:
``` python
# Learn the 2-D embedding on the training matrix only (fitted once), then
# project both held-out sets into that learned space.
# NOTE(review): train_data / test_data / test2_data are not assigned in the
# visible cells of this section; presumably they alias features_tr /
# features_ts / features_ts2 as in the per-split loop, confirm.
mapper = umap.UMAP(
    n_neighbors=40, min_dist=0.01, n_components=2, metric="euclidean"
).fit(train_data)
test_embedding = mapper.transform(test_data)
test2_embedding = mapper.transform(test2_data)
```
%% Cell type:code id: tags:
``` python
# Check: number of embedded points for the training, test and test2 sets.
tuple(map(len, (mapper.embedding_, test_embedding, test2_embedding)))
```
%% Cell type:markdown id: tags:
## Plot UMAP 2D projection
......@@ -164,58 +173,69 @@
("index", "$index"),
("(x,y)", "($x, $y)"),
("desc", "@desc"),
]
mycols = colorblind['Colorblind'][4]
mycols = colorblind["Colorblind"][4]
myclasses = pd.unique(labels_tr).tolist()
p = figure(plot_width=1200, plot_height=1200, tooltips = TOOLTIPS, tools='save', toolbar_location="left")
p = figure(
plot_width=1200,
plot_height=1200,
tooltips=TOOLTIPS,
tools="save",
toolbar_location="left",
)
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"
size=12
size = 12
for col, theclass in zip(mycols,myclasses):
for col, theclass in zip(mycols, myclasses):
idx_tr = np.where(np.array(labels_tr)==theclass)[0].tolist()
idx_tr = np.where(np.array(labels_tr) == theclass)[0].tolist()
samples_train = np.expand_dims(samples_tr[idx_tr,], axis=1)
data_tr = np.hstack((mapper.embedding_[idx_tr,], samples_train))
df_tr = pd.DataFrame(data_tr,columns=['x','y','sample'])
data_tr = np.hstack((mapper.embedding_[idx_tr,], samples_train))
df_tr = pd.DataFrame(data_tr, columns=["x", "y", "sample"])
source_tr = ColumnDataSource(data=dict(
x=df_tr['x'],
y=df_tr['y'],
desc=df_tr['sample']))
p.circle(x='x',y='y',size=size,source=source_tr,color=col,alpha=0.8,legend=str(theclass))
idx_ts = np.where(np.array(labels_test)==theclass)[0].tolist()
source_tr = ColumnDataSource(
data=dict(x=df_tr["x"], y=df_tr["y"], desc=df_tr["sample"])
)
p.circle(
x="x",
y="y",
size=size,
source=source_tr,
color=col,
alpha=0.8,
legend=str(theclass),
)
idx_ts = np.where(np.array(labels_test) == theclass)[0].tolist()
samples_ts = np.expand_dims(samples_test[idx_ts,], axis=1)
data_ts = np.hstack((test_embedding[idx_ts,], samples_ts))
df_ts = pd.DataFrame(data_ts,columns=['x','y','sample'])
data_ts = np.hstack((test_embedding[idx_ts,], samples_ts))
df_ts = pd.DataFrame(data_ts, columns=["x", "y", "sample"])
source_ts = ColumnDataSource(data=dict(
x=df_ts['x'],
y=df_ts['y'],
desc=df_ts['sample']))
p.triangle(x='x',y='y',size=size,source=source_ts,color=col,alpha=0.8)
source_ts = ColumnDataSource(
data=dict(x=df_ts["x"], y=df_ts["y"], desc=df_ts["sample"])
)
p.triangle(x="x", y="y", size=size, source=source_ts, color=col, alpha=0.8)
idx_ts2 = np.where(np.array(labels_test2)==theclass)[0].tolist()
idx_ts2 = np.where(np.array(labels_test2) == theclass)[0].tolist()
samples_ts2 = np.expand_dims(samples_test2[idx_ts2,], axis=1)
data_ts2 = np.hstack((test2_embedding[idx_ts2,], samples_ts2))
df_ts2 = pd.DataFrame(data_ts2,columns=['x','y','sample'])
data_ts2 = np.hstack((test2_embedding[idx_ts2,], samples_ts2))
df_ts2 = pd.DataFrame(data_ts2, columns=["x", "y", "sample"])
source_ts2 = ColumnDataSource(
data=dict(x=df_ts2["x"], y=df_ts2["y"], desc=df_ts2["sample"])
)
p.diamond(x="x", y="y", size=size, source=source_ts2, color=col, alpha=0.8)
source_ts2 = ColumnDataSource(data=dict(
x=df_ts2['x'],
y=df_ts2['y'],
desc=df_ts2['sample']))
p.diamond(x='x',y='y',size=size,source=source_ts2,color=col,alpha=0.8)
p.add_tools(LassoSelectTool())
p.add_tools(WheelZoomTool())
p.legend.label_text_font_size = "20pt"
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.major_label_text_font_size = "15pt"
......@@ -223,15 +243,15 @@
p.add_tools(ZoomInTool())
p.add_tools(ResetTool())
p.add_tools(BoxZoomTool())
p.legend.location = "top_left"
p.legend.click_policy='hide'
p.legend.click_policy = "hide"
# p.title()
if BEST:
export_png(p, filename=f"subtypes_INF_split{SPLIT}.png") #save the plot
export_png(p, filename=f"subtypes_INF_split{SPLIT}.png") # save the plot
else:
export_png(p, filename=f"subtypes_juXT_split{SPLIT}.png")
show(p)
```
......@@ -242,152 +262,154 @@
%% Cell type:code id: tags:
``` python
def range_with_ignore(start, stop, ignore):
    """Return the integers in ``[start, stop)`` with ``ignore`` left out.

    Parameters
    ----------
    start, stop : int
        Bounds of the half-open range, as for ``np.arange``.
    ignore : int
        Value to drop. If it lies outside ``[start, stop)`` the full range is
        returned unchanged.

    Returns
    -------
    numpy.ndarray
        Ascending 1-D array of the remaining integers.
    """
    values = np.arange(start, stop)
    # Filtering (rather than concatenating two aranges around `ignore`) keeps
    # the result inside [start, stop) even when `ignore` is out of range.
    return values[values != ignore]
```
%% Cell type:code id: tags:
``` python
BEST = False  # when True, restrict the features to the INF signature

mycols = colorblind["Colorblind"][4]  # one colour per class (4 colours)
plots = []  # one figure per split, in iteration order
size = 12  # marker size shared by all glyphs


def _add_class_glyphs(draw, embedding, samples, labels, theclass, **glyph_kwargs):
    """Plot the points of one class with the glyph method *draw*.

    draw: bound glyph method of the target figure (e.g. ``p.circle``).
    embedding: array with one row per sample; columns 0 and 1 are x and y.
    samples: sample ids, row-aligned with *labels*; stored in the ``desc`` column.
    labels: class label of every row.
    theclass: label value selecting the rows to draw.
    glyph_kwargs: forwarded unchanged to *draw*.
    """
    idx = np.where(np.array(labels) == theclass)[0]
    # Build the source directly from the selected rows: stacking the float
    # coordinates with the string ids first would coerce x/y to object dtype.
    source = ColumnDataSource(
        data=dict(
            x=embedding[idx, 0],
            y=embedding[idx, 1],
            desc=np.asarray(samples)[idx].tolist(),
        )
    )
    draw(x="x", y="y", source=source, **glyph_kwargs)


# Every split except the one already shown in the main figure.
for split in range_with_ignore(0, 10, SPLIT).tolist():

    file_tr = f"{PATH}/{split}/{layers}_tr.txt"  # Fit UMAP
    file_test = f"{PATH}/{split}/{layers}_ts.txt"  # test UMAP
    file_test2 = f"{PATH}/{split}/{layers}_ts2.txt"  # test UMAP
    features_train = pd.read_csv(file_tr, sep="\t", header=0, index_col=0)
    features_test = pd.read_csv(file_test, sep="\t", header=0, index_col=0)
    features_test2 = pd.read_csv(file_test2, sep="\t", header=0, index_col=0)

    # Signature features of this split; the subsets are built unconditionally
    # and only swapped in when BEST is set.
    INF_feats = pd.read_csv(
        f"{PATH_RESULTS}/{split}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt",
        sep="\t",
    )["FEATURE_NAME"].values.tolist()
    best_train = features_train[INF_feats]
    best_test = features_test[INF_feats]
    best_test2 = features_test2[INF_feats]

    if BEST:
        features_train = best_train
        features_test = best_test
        features_test2 = best_test2

    # Labels are stored as a trailing "labels" column and read back as a Series
    # sharing the sample index; the feature matrix is every column but that one.
    samples_tr = features_train.index
    labels_tr = pd.read_csv(
        f"{PATH}/{split}/labels_{TASK}_tr.txt", sep="\t", header=None
    )[0].tolist()
    features_train["labels"] = labels_tr
    labels_tr = features_train["labels"]
    features_tr = features_train[features_train.columns[:-1]].values

    samples_test = features_test.index
    labels_test = pd.read_csv(
        f"{PATH}/{split}/labels_{TASK}_ts.txt", sep="\t", header=None
    )[0].tolist()
    features_test["labels"] = labels_test
    labels_test = features_test["labels"]
    features_ts = features_test[features_test.columns[:-1]].values

    samples_test2 = features_test2.index
    labels_test2 = pd.read_csv(
        f"{PATH}/{split}/labels_{TASK}_ts2.txt", sep="\t", header=None
    )[0].tolist()
    features_test2["labels"] = labels_test2
    labels_test2 = features_test2["labels"]
    features_ts2 = features_test2[features_test2.columns[:-1]].values

    train_data = features_tr
    test_data = features_ts
    test2_data = features_ts2

    # Fit on the training data once, then project both test sets.
    mapper = umap.UMAP(
        n_neighbors=40, min_dist=0.01, n_components=2, metric="euclidean"
    ).fit(train_data)
    test_embedding = mapper.transform(test_data)
    test2_embedding = mapper.transform(test2_data)

    myclasses = pd.unique(labels_tr).tolist()

    p = figure(title=f"split {split}")
    p.title.align = "center"
    p.title.text_color = "black"
    p.title.text_font_size = "25px"

    # zip() stops at the shorter input, so at most len(mycols) classes are drawn.
    for col, theclass in zip(mycols, myclasses):
        shared = dict(size=size, color=col, alpha=0.8)
        # Circles = training set; only these carry a legend entry.
        # NOTE(review): `legend=` is the legacy Bokeh keyword (newer releases
        # use `legend_label=`); kept to match the Bokeh version this notebook
        # targets, confirm before upgrading.
        _add_class_glyphs(
            p.circle,
            mapper.embedding_,
            samples_tr,
            labels_tr,
            theclass,
            legend=str(theclass),
            **shared,
        )
        # Triangles = test set, diamonds = second test set.
        _add_class_glyphs(
            p.triangle, test_embedding, samples_test, labels_test, theclass, **shared
        )
        _add_class_glyphs(
            p.diamond, test2_embedding, samples_test2, labels_test2, theclass, **shared
        )

    p.legend.label_text_font_size = "20pt"
    p.yaxis.major_label_text_font_size = "15pt"
    p.xaxis.major_label_text_font_size = "15pt"
    plots.append(p)
```
%% Cell type:code id: tags:
``` python
# Lay the per-split figures out three per row. Passing the flat list with
# `ncols` avoids hard-coding plots[0]..plots[8] and builds the grid only once.
grid = gridplot(plots, ncols=3, plot_width=1200, plot_height=1200)

# The output name records whether the INF signature (BEST) or the full
# feature set was plotted.
if BEST:
    export_png(grid, filename="subtypes_INF_suppl.png")
else:
    export_png(grid, filename="subtypes_juXT_suppl.png")
show(grid)
```
%% Cell type:code id: tags:
``` python
# Display the grid of per-split figures in its own cell.
show(grid)
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment