Commit 85105e95 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Merge branch 'viz/umap-vendor' into 'master'

Add first umap HN plots

Added the first files produced: work in progress label is because the paths to processed data (the .npy s) has to be defined w.r.t the repository.
Adding first files: notebook for simple scatter plot with bokeh(experiment/) and script for interactive plot(examples/).

See merge request !1
parents f0cb9154 7e770234
This python script take as input a csv with patient information,
data from medical imaging,
plot a 2-dim plot of the raw data reduced using umap.
It also provide a tool for visualizing the actual image,
selecting a point on the scatter plot
RUN: run with bokeh server: 'bokeh serve <script path>,
go to the specified local url
LOCATION: csv a upper level folder callend 'vendor_plot',
data in upper level folder called 'bbox64'
pandas (sort of)
PARAMETER: (important ones)
modality 'PT' /'CT
import os
import numpy as np
import pandas as pd
from holoviews import opts
import holoviews as hv
import holoviews.plotting.bokeh
from holoviews.streams import Selection1D
import umap
from pathlib import Path
renderer = hv.renderer('bokeh')
def scan_explore(datapath_val,
print(f"Subject ID:{subject_id} Slice(x):{slice_id} Modality:{modality}")
slice_id = int(slice_id)
data_ext = ".npy"
if modality == 'PT':
mod_id = 1
elif modality == 'CT':
mod_id = 0
print("Invalid scan modality.")
# extract slice from data
object_path_val = datapath_val/(subject_id + data_ext) # path op
object_path_bz = datapath_bz/(subject_id + data_ext)
img_slice = np.load(object_path_val)[mod_id, slice_id, :, :]
except FileNotFoundError:
img_slice = np.load(object_path_bz)[mod_id, slice_id, :, :]
return hv.Image(img_slice).opts(title=f"id:{subject_id} mod:{modality}",
fontsize={'title': 10})
def scan_wrap(datapath_val, datapath_bz, points_data,
index, slice_id=0, modality='CT'):
print("Index supplied ", index)
# the supplied index is always a list with one element
if (index is None) | (index == []):
pt_index = 0
pt_index = index[0]
subject_id = ([pt_index])['Subject']
res = scan_explore(datapath_val, datapath_bz,
subject_id, slice_id, modality)
return res
def get_data_lbl(path, file_list, path_orig_df, modality):
Extract data and manufacturer label from data folder and info file
path: data folder
file_list: list of files with extension
path_orig_df: our dataset with file info
modality: either PT (for PET) or CT
pandas dataframe with cols:
data: list of 3d-numpy arrays
manufacturer: scan device manufacturer
dataset: dataset id
subject_id: scanned subject id
if modality == "CT":
mod_id = 0
elif modality == "PT":
mod_id = 1
print("Please enter a valid modality parameter")
df_sel_mod = path_orig_df.loc[path_orig_df["Modality"] == modality]
data = []
lbl = []
dataset_id = []
subject_id = []
for elem in file_list:
# subj id removing extension
subj_id = elem.split(".")[0]
ds_split = subj_id.split("-")
if len(ds_split) == 3:
dataset_tmp = subj_id.split("-")[1]
# save manufacturer
vendor = df_sel_mod.loc[df_sel_mod["Subject ID"] == subj_id,
elif len(ds_split) == 1: # bz unique with different format
dataset_tmp = "BZ_0"
vendor = "Philips" # bolzano manufacturer
# load patient data
data_tmp = np.load(path/elem)[mod_id, :, :, :]
# save to lists
return pd.DataFrame(data={'data': data, 'manufacturer': lbl,
'dataset': dataset_id, 'subject_id': subject_id})
def get_scatter(df, group, ds):
keydims = ['x_u', 'y_u']
valdims = ['Manufacturer', 'Dataset', 'Subject']
df_sel = df.loc[(df['manufacturer'] == group) & (df['dataset'] == ds)]
red_x = df_sel['red_x']
red_y = df_sel['red_y']
manufacturer = df_sel['manufacturer']
dataset = df_sel['dataset']
subject_id = df_sel['subject_id']
plot = hv.Scatter((red_x, red_y, manufacturer, dataset, subject_id),
kdims=keydims, vdims=valdims)
return plot
def get_scatter_full(df):
keydims = ['x_u', 'y_u']
valdims = ['Manufacturer', 'Dataset', 'Subject']
df_sel = df
red_x = df_sel['red_x']
red_y = df_sel['red_y']
manufacturer = df_sel['manufacturer']
dataset = df_sel['dataset']
subject_id = df_sel['subject_id']
plot = hv.Scatter((red_x, red_y, manufacturer, dataset, subject_id),
kdims=keydims, vdims=valdims)
return plot
# set script parameters
modality = "CT" # choose a modality for the 2d rim red plot
data_folder = Path().cwd()/'..'/'..'/'data'
info_df_path = data_folder/'HN_val'/'processed'/'path_original_data.csv'
DATAPATH = data_folder/'HN_val'/'processed'/'bbox'/'bbox_64'
DATAPATH_BZ = data_folder/'HN_BZ'/'processed'/'bbox'/'bbox_64'
file_list = os.listdir(DATAPATH)
file_list_bz = os.listdir(DATAPATH_BZ)
path_orig_df = pd.read_csv(info_df_path)
print("Environment set.")
# prepare data and dim red
# actually get the data using pandas dataframes
# collect df from multiple folder for inputs and merge vertically
df_val = get_data_lbl(DATAPATH, file_list, path_orig_df, modality)
df_bz = get_data_lbl(DATAPATH_BZ, file_list_bz, path_orig_df, modality)
df = df_val.append(df_bz, ignore_index=True)
# flatten the 3d array
df['flat_data'] = [x.flatten('C') for x in df['data']]
df.drop(columns=['data'], inplace=True) # remove data col to save memory
umap_euclid = umap.UMAP(metric="euclidean", n_components=2,
n_neighbors=20, min_dist=.2)
reduced_data = umap_euclid.fit_transform(df['flat_data'].tolist())
df['red_x'] = reduced_data[:, 0]
df['red_y'] = reduced_data[:, 1]
df.drop(columns=['flat_data'], inplace=True) # remove flat_data col
print("Dim red completed")
# set up the layered scatterplot
# compute possible combination in the dataset
# use them to generate all the relevant sub-plot to be stacked
# compute unique co occurrences of manufacturer and
# dataset id actually present in the data
comb_vendor_ds = np.unique(df['manufacturer'] + '-' + df['dataset'])
comb_vendor_ds = [x.split('-') for x in comb_vendor_ds]
scatter_dict = {(group, ds): get_scatter(df, group, ds)
for group, ds in comb_vendor_ds}
# overlayed plots with different group, give color/legend can be hovered over
overlay = hv.NdOverlay(scatter_dict, kdims=['Manufacturer', 'dataset'])
overlay.opts(opts.Scatter(alpha=0.75, size=7, width=650,
height=650, tools=['hover']))
# one transparent layer just for point index information with tapping widgets
scatter_title = f"Umap 2d plot HN mod: {modality}"
p_ref = get_scatter_full(df).opts(opts.Scatter(title=scatter_title,
alpha=0.0, size=7,
width=650, height=650,
s = overlay * p_ref
# attach stream to overlayed plot
# stream depend on tap widget, unique selection widget and
# output a list with its index
# in this case it will account for index over the last layered
# scatterplot (in this case the complete one)
sel_stream = Selection1D(source=s, index=[0])
print("overlay plot and stream have been set.")
# define dynamic map taking subj id from stream and points data,
# slice id and modality from widgets
# to use already there the index variable and the kdim from
# sliders explicitly in the plot func
# it required that lambda func otherwise only the func has to be passed.
# prob there is a better way...
scan_dmap = hv.DynamicMap(lambda index, slice_id,
modality: scan_wrap(datapath_val=DATAPATH,
kdims=['slice_id', 'modality'])
# # define value domains for key dimensions not in streams
scan_dmap = scan_dmap.redim.range(slice_id=(0, 64))
scan_dmap = scan_dmap.redim.values(modality=['CT', 'PT'])
# rendering
layout = s + scan_dmap # build layout
doc = renderer.server_doc(layout) # instantiate renderer
doc.title = 'HN Scan explorer' # set title
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment