{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# UMAP projection" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "from pathlib import Path\n", "import numpy as np\n", "\n", "from bokeh.plotting import figure, output_file, show, save\n", "from bokeh.io import output_notebook, export_png\n", "from bokeh.palettes import colorblind\n", "from bokeh.models import (\n", " CategoricalColorMapper,\n", " ColumnDataSource,\n", " LassoSelectTool,\n", " WheelZoomTool,\n", " ZoomInTool,\n", " BoxZoomTool,\n", " ResetTool,\n", ")\n", "from bokeh.layouts import gridplot\n", "from bokeh.resources import CDN\n", "from bokeh.embed import file_html\n", "\n", "import umap" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load feature datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "TASK = \"subtypes\"\n", "DATASET = \"tcga_breast\"\n", "MODEL = \"randomForest\"\n", "\n", "layers = \"gene_cnv_prot\"\n", "\n", "PATH = Path(\"data\") / DATASET / TASK\n", "PATH_RESULTS = Path(\"results\") / DATASET / TASK / MODEL\n", "\n", "SPLIT = 2 # choose one of the splits (0-9) of the train, test, and test2 files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file_tr = f\"{PATH}/{SPLIT}/{layers}_tr.txt\" # Fit UMAP\n", "file_test = f\"{PATH}/{SPLIT}/{layers}_ts.txt\" # test UMAP on TS\n", "file_test2 = f\"{PATH}/{SPLIT}/{layers}_ts2.txt\" # test UMAP on TS2\n", "\n", "features_train = pd.read_csv(file_tr, sep=\"\\t\", header=0, index_col=0)\n", "features_test = pd.read_csv(file_test, sep=\"\\t\", header=0, index_col=0)\n", "features_test2 = pd.read_csv(file_test2, sep=\"\\t\", header=0, index_col=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "BEST = False # if True, restrict the features to the INF signature\n", "\n", "INF_feats = pd.read_csv(\n", " f\"{PATH_RESULTS}/{SPLIT}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt\",\n", " sep=\"\\t\",\n", ")[\"FEATURE_NAME\"].values.tolist()\n", "\n", "best_train = features_train[INF_feats]\n", "best_test = features_test[INF_feats]\n", "best_test2 = features_test2[INF_feats]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if BEST:\n", " features_train = best_train\n", " features_test = best_test\n", " features_test2 = best_test2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "samples_tr = features_train.index\n", "labels_tr = pd.read_csv(f\"{PATH}/{SPLIT}/labels_{TASK}_tr.txt\", sep=\"\\t\", header=None)[\n", " 0\n", "].tolist()\n", "\n", "features_train[\"labels\"] = labels_tr\n", "labels_tr = features_train[\"labels\"]\n", "features_tr = features_train[features_train.columns[:-1]].values\n", "\n", "\n", "samples_test = features_test.index\n", "labels_test = pd.read_csv(\n", " f\"{PATH}/{SPLIT}/labels_{TASK}_ts.txt\", sep=\"\\t\", header=None\n", ")[0].tolist()\n", "\n", "features_test[\"labels\"] = labels_test\n", "labels_test = features_test[\"labels\"]\n", "features_ts = features_test[features_test.columns[:-1]].values\n", "\n", "\n", "samples_test2 = features_test2.index\n", "labels_test2 = pd.read_csv(\n", " f\"{PATH}/{SPLIT}/labels_{TASK}_ts2.txt\", sep=\"\\t\", header=None\n",
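" # label files are headerless and tab-separated; rows are assumed to follow the sample order of the corresponding feature table\n",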
")[0].tolist()\n", "\n", "\n", "features_test2[\"labels\"] = labels_test2\n", "labels_test2 = features_test2[\"labels\"]\n", "features_ts2 = features_test2[features_test2.columns[:-1]].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_data = features_tr\n", "test_data = features_ts\n", "test2_data = features_ts2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check\n", "print(len(features_tr), len(samples_tr), len(labels_tr))\n", "print(len(features_ts), len(samples_test), len(labels_test))\n", "print(len(features_ts2), len(samples_test2), len(labels_test2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fit on the training data and transform the test set into the learned space" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mapper = umap.UMAP(\n", " n_neighbors=40, min_dist=0.01, n_components=2, metric=\"euclidean\"\n", ").fit(train_data)\n", "test_embedding = mapper.transform(test_data)\n", "test2_embedding = mapper.transform(test2_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check\n", "len(mapper.embedding_), len(test_embedding), len(test2_embedding)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot UMAP 2D projection" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "TOOLTIPS = [\n", " (\"index\", \"$index\"),\n", " (\"(x,y)\", \"($x, $y)\"),\n", " (\"desc\", \"@desc\"),\n", "]\n", "\n", "mycols = colorblind[\"Colorblind\"][4]\n", "myclasses = pd.unique(labels_tr).tolist()\n", "\n", "p = figure(\n", " plot_width=1200,\n", " plot_height=1200,\n", " tooltips=TOOLTIPS,\n", " tools=\"save\",\n", " toolbar_location=\"left\",\n", ")\n", "\n", "\n", "p.title.align = \"center\"\n", "p.title.text_color = \"black\"\n", "p.title.text_font_size = \"25px\"\n", "\n", "size = 12\n", "\n", "for col, theclass in zip(mycols, myclasses):\n", "\n", " idx_tr = np.where(np.array(labels_tr) == theclass)[0].tolist()\n", " samples_train = np.expand_dims(samples_tr[idx_tr,], axis=1)\n", " data_tr = np.hstack((mapper.embedding_[idx_tr,], samples_train))\n", " df_tr = pd.DataFrame(data_tr, columns=[\"x\", \"y\", \"sample\"])\n", "\n", " source_tr = ColumnDataSource(\n", " data=dict(x=df_tr[\"x\"], y=df_tr[\"y\"], desc=df_tr[\"sample\"])\n", " )\n", " p.circle(\n", " x=\"x\",\n", " y=\"y\",\n", " size=size,\n", " source=source_tr,\n", " color=col,\n", " alpha=0.8,\n", " legend=str(theclass),\n", " )\n", "\n", " idx_ts = np.where(np.array(labels_test) == theclass)[0].tolist()\n", " samples_ts = np.expand_dims(samples_test[idx_ts,], axis=1)\n", " data_ts = np.hstack((test_embedding[idx_ts,], samples_ts))\n", " df_ts = pd.DataFrame(data_ts, columns=[\"x\", \"y\", \"sample\"])\n", "\n", " source_ts = ColumnDataSource(\n", " data=dict(x=df_ts[\"x\"], y=df_ts[\"y\"], desc=df_ts[\"sample\"])\n", " )\n", " p.triangle(x=\"x\", y=\"y\", size=size, source=source_ts, color=col, alpha=0.8)\n", "\n", " idx_ts2 = np.where(np.array(labels_test2) == theclass)[0].tolist()\n", " samples_ts2 = np.expand_dims(samples_test2[idx_ts2,], axis=1)\n", " data_ts2 = np.hstack((test2_embedding[idx_ts2,], samples_ts2))\n", " df_ts2 = pd.DataFrame(data_ts2, columns=[\"x\", \"y\", \"sample\"])\n", "\n", " source_ts2 = ColumnDataSource(\n", " data=dict(x=df_ts2[\"x\"], y=df_ts2[\"y\"], desc=df_ts2[\"sample\"])\n", " )\n", " p.diamond(x=\"x\", 
y=\"y\", size=size, source=source_ts2, color=col, alpha=0.8)\n", "\n", "\n", "p.add_tools(LassoSelectTool())\n", "p.add_tools(WheelZoomTool())\n", "p.legend.label_text_font_size = \"20pt\"\n", "p.yaxis.major_label_text_font_size = \"15pt\"\n", "p.xaxis.major_label_text_font_size = \"15pt\"\n", "\n", "\n", "p.add_tools(ZoomInTool())\n", "p.add_tools(ResetTool())\n", "p.add_tools(BoxZoomTool())\n", "p.legend.location = \"top_left\"\n", "p.legend.click_policy = \"hide\"\n", "# p.title()\n", "\n", "if BEST:\n", " export_png(p, filename=f\"subtypes_INF_split{SPLIT}.png\") # save the plot\n", "else:\n", " export_png(p, filename=f\"subtypes_juXT_split{SPLIT}.png\")\n", "\n", "show(p)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grid plot for all other splits" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def range_with_ignore(start, stop, ignore):\n", " return np.concatenate([np.arange(start, ignore), np.arange(ignore + 1, stop)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "BEST = False\n", "\n", "mycols = colorblind[\"Colorblind\"][4]\n", "plots = []\n", "size = 12\n", "\n", "for split in range_with_ignore(0, 10, SPLIT).tolist():\n", "\n", " file_tr = f\"{PATH}/{split}/{layers}_tr.txt\" # Fit UMAP\n", " file_test = f\"{PATH}/{split}/{layers}_ts.txt\" # test UMAP\n", " file_test2 = f\"{PATH}/{split}/{layers}_ts2.txt\" # test UMAP\n", "\n", " features_train = pd.read_csv(file_tr, sep=\"\\t\", header=0, index_col=0)\n", " features_test = pd.read_csv(file_test, sep=\"\\t\", header=0, index_col=0)\n", " features_test2 = pd.read_csv(file_test2, sep=\"\\t\", header=0, index_col=0)\n", "\n", " INF_feats = pd.read_csv(\n", " f\"{PATH_RESULTS}/{split}/rSNFi/{layers}_ts_RandomForest_KBest_featurelist.txt\",\n", " sep=\"\\t\",\n", " )[\"FEATURE_NAME\"].values.tolist()\n", "\n", " best_train = features_train[INF_feats]\n", " best_test = features_test[INF_feats]\n", " best_test2 = features_test2[INF_feats]\n", "\n", " if BEST:\n", " features_train = best_train\n", " features_test = best_test\n", " features_test2 = best_test2\n", "\n", " samples_tr = features_train.index\n", " labels_tr = pd.read_csv(\n", " f\"{PATH}/{split}/labels_{TASK}_tr.txt\", sep=\"\\t\", header=None\n", " )[0].tolist()\n", "\n", " features_train[\"labels\"] = labels_tr\n", " labels_tr = features_train[\"labels\"]\n", " features_tr = features_train[features_train.columns[:-1]].values\n", "\n", " samples_test = features_test.index\n", " labels_test = pd.read_csv(\n", " f\"{PATH}/{split}/labels_{TASK}_ts.txt\", sep=\"\\t\", header=None\n", " )[0].tolist()\n", "\n", " features_test[\"labels\"] = labels_test\n", " labels_test = features_test[\"labels\"]\n", " features_ts = features_test[features_test.columns[:-1]].values\n", "\n", " samples_test2 = features_test2.index\n", " labels_test2 = pd.read_csv(\n", " f\"{PATH}/{split}/labels_{TASK}_ts2.txt\", sep=\"\\t\", header=None\n", " )[0].tolist()\n", "\n", " features_test2[\"labels\"] = labels_test2\n", " labels_test2 = features_test2[\"labels\"]\n", " features_ts2 = features_test2[features_test2.columns[:-1]].values\n", "\n", " train_data = features_tr\n", " test_data = features_ts\n", " test2_data = features_ts2\n", "\n", " mapper = umap.UMAP(\n", " n_neighbors=40, min_dist=0.01, n_components=2, metric=\"euclidean\"\n", " ).fit(train_data)\n", " test_embedding = mapper.transform(test_data)\n", " test2_embedding = mapper.transform(test2_data)\n", "\n", " myclasses = 
pd.unique(labels_tr).tolist()\n", "\n", " p = figure(title=f\"split {split}\")\n", "\n", " p.title.align = \"center\"\n", " p.title.text_color = \"black\"\n", " p.title.text_font_size = \"25px\"\n", "\n", " for col, theclass in zip(mycols, myclasses):\n", "\n", " idx_tr = np.where(np.array(labels_tr) == theclass)[0].tolist()\n", " samples_train = np.expand_dims(samples_tr[idx_tr,], axis=1)\n", " data_tr = np.hstack((mapper.embedding_[idx_tr,], samples_train))\n", " df_tr = pd.DataFrame(data_tr, columns=[\"x\", \"y\", \"sample\"])\n", "\n", " source_tr = ColumnDataSource(\n", " data=dict(x=df_tr[\"x\"], y=df_tr[\"y\"], desc=df_tr[\"sample\"])\n", " )\n", " p.circle(\n", " x=\"x\",\n", " y=\"y\",\n", " size=size,\n", " source=source_tr,\n", " color=col,\n", " alpha=0.8,\n", " legend_label=str(theclass),\n", " )\n", "\n", " idx_ts = np.where(np.array(labels_test) == theclass)[0].tolist()\n", " samples_ts = np.expand_dims(samples_test[idx_ts,], axis=1)\n", " data_ts = np.hstack((test_embedding[idx_ts,], samples_ts))\n", " df_ts = pd.DataFrame(data_ts, columns=[\"x\", \"y\", \"sample\"])\n", "\n", " source_ts = ColumnDataSource(\n", " data=dict(x=df_ts[\"x\"], y=df_ts[\"y\"], desc=df_ts[\"sample\"])\n", " )\n", " p.triangle(x=\"x\", y=\"y\", size=size, source=source_ts, color=col, alpha=0.8)\n", "\n", " idx_ts2 = np.where(np.array(labels_test2) == theclass)[0].tolist()\n", " samples_ts2 = np.expand_dims(samples_test2[idx_ts2,], axis=1)\n", " data_ts2 = np.hstack((test2_embedding[idx_ts2,], samples_ts2))\n", " df_ts2 = pd.DataFrame(data_ts2, columns=[\"x\", \"y\", \"sample\"])\n", "\n", " source_ts2 = ColumnDataSource(\n", " data=dict(x=df_ts2[\"x\"], y=df_ts2[\"y\"], desc=df_ts2[\"sample\"])\n", " )\n", " p.diamond(x=\"x\", y=\"y\", size=size, source=source_ts2, color=col, alpha=0.8)\n", "\n", " # p.legend.location = \"bottom_left\"\n", " p.legend.label_text_font_size = \"20pt\"\n", " p.yaxis.major_label_text_font_size = \"15pt\"\n", " p.xaxis.major_label_text_font_size = \"15pt\"\n", " plots.append(p)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "grid = gridplot(\n", " [\n", " [plots[0], plots[1], plots[2]],\n", " [plots[3], plots[4], plots[5]],\n", " [plots[6], plots[7], plots[8]],\n", " ],\n", " plot_width=1200,\n", " plot_height=1200,\n", ")\n", "\n", "if BEST:\n", " export_png(grid, filename=\"subtypes_INF_suppl.png\")\n", "else:\n", " export_png(grid, filename=\"subtypes_juXT_suppl.png\")\n", "\n", "show(grid)" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3.6.9 64-bit", "language": "python", "name": "python36964bita08c9a394aa84e7d9622460ca3efcae1" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }