Commit 619216bd, authored by Alessia Marcolini

Remove all digital-pathology related files

parent 9bb0ecb7
......@@ -16,12 +16,6 @@ install.packages("TunePareto")
To install `mlpy`, follow the instructions [here](https://gitlab.fbk.eu/MPBA/mlpy).
To install `openslide`:
```bash
apt-get install openslide-tools
pip install openslide-python
```
or follow the instructions [here](https://openslide.org/download/).
To install `bootstrapped`:
```bash
......
# Snakemake preprocessing pipeline for the DP (digital pathology) classification
# experiments.  All paths come from DP_classification.config; each rule also
# lists its own script as an input so editing the script retriggers the rule.
from DP_classification.config import EXPERIMENT_DATA_DIR, DATA_PATH, DATASET_SOURCE, TILES_DIR, SVS_DIR


# Runs preprocess_clinical_file.py on the raw clinical.csv + slide/uuid map
# to produce the per-patient metadata table.
rule preprocess_clinical_patients:
    input:
        'DP_classification/preprocessing/preprocess_clinical_file.py',
        DATA_PATH / DATASET_SOURCE / 'clinical.csv',
        DATA_PATH / DATASET_SOURCE / 'slides_filename_uuid.csv'
    output:
        EXPERIMENT_DATA_DIR / 'metadata_patients.csv'
    shell:
        f'mkdir -p {EXPERIMENT_DATA_DIR} && ' \
        'python DP_classification/preprocessing/preprocess_clinical_file.py'


# Runs compute_patients.py: from the per-split gene train/test files
# (10 splits) and the patient metadata, writes per-split patient lists.
rule compute_patients:
    input:
        'DP_classification/preprocessing/compute_patients.py',
        EXPERIMENT_DATA_DIR / 'metadata_patients.csv',
        expand(str(EXPERIMENT_DATA_DIR / '{split_id}' / 'gene_tr.txt'), split_id=[i for i in range(10)]),
        expand(str(EXPERIMENT_DATA_DIR / '{split_id}' / 'gene_ts.txt'), split_id=[i for i in range(10)])
    output:
        expand(str(EXPERIMENT_DATA_DIR / 'patients_split_{split_id}_tr.txt'), split_id=[i for i in range(10)]),
        expand(str(EXPERIMENT_DATA_DIR / 'patients_split_{split_id}_ts.txt'), split_id=[i for i in range(10)])
    shell:
        'python DP_classification/preprocessing/compute_patients.py'


# Runs svs_to_random_tiles.py to extract random tiles from the whole-slide
# images in SVS_DIR into TILES_DIR.
rule svs_to_random_tiles:
    input:
        'DP_classification/preprocessing/svs_to_random_tiles.py',
        EXPERIMENT_DATA_DIR / 'metadata_patients.csv',
        ancient(SVS_DIR), # ancient --> do not consider timestamp
    output:
        directory(TILES_DIR)
    shell:
        'python DP_classification/preprocessing/svs_to_random_tiles.py'


# Runs preprocess_tiles.py to join the extracted tiles with patient metadata
# and split membership into metadata_tiles.csv.
rule preprocess_tiles_metadata:
    input:
        'DP_classification/preprocessing/preprocess_tiles.py',
        TILES_DIR,
        EXPERIMENT_DATA_DIR / 'metadata_patients.csv',
        expand(str(EXPERIMENT_DATA_DIR / 'patients_split_{split_id}_tr.txt'), split_id=[i for i in range(10)]),
        expand(str(EXPERIMENT_DATA_DIR / 'patients_split_{split_id}_ts.txt'), split_id=[i for i in range(10)])
    output:
        EXPERIMENT_DATA_DIR / 'metadata_tiles.csv'
    shell:
        'python DP_classification/preprocessing/preprocess_tiles.py'
#%%
import os
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

# %%
# Download the whole-slide images (SVS) listed in slides_filename_uuid.csv
# from the GDC data endpoint into data/tcga_breast/svs/.
DATA_DIR = Path('data/tcga_breast')
SVS_DIR = DATA_DIR / 'svs'

csv = pd.read_csv(DATA_DIR / 'slides_filename_uuid.csv')
os.makedirs(SVS_DIR, exist_ok=True)

#%%
for i, row in tqdm(csv.iterrows(), total=len(csv)):
    uuid = row['uuid']
    filename = row['filename']
    print(uuid)

    data_endpt = "https://api.gdc.cancer.gov/data/{}".format(uuid)
    output_path = SVS_DIR / filename

    # SVS slides can be multiple GB: stream the response to disk in chunks
    # instead of buffering the whole payload in memory (response.content).
    # raise_for_status() stops us from silently writing an HTTP error body
    # to the .svs file.
    with requests.get(data_endpt, headers={"Content-Type": "application/json"}, stream=True) as response:
        response.raise_for_status()
        with open(output_path, "wb") as output_file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                output_file.write(chunk)
# %%
# %%
import os
from pathlib import Path

import numpy as np
import pandas as pd

from DP_classification.config import DATA_PATH

# %%
# Build slides_filename_uuid.csv: map each TCGA-BRCA sample to its slide
# filename and the GDC uuid needed to download that slide.
clinical = pd.read_csv(DATA_PATH / 'tcga_breast' / 'clinical_complete.csv', sep='\t')
filenames_uuids = pd.read_csv(DATA_PATH / 'tcga_breast' / 'TCGA_BRCA_filename_uuid.tsv', sep='\t')

# %%
patient_column = 'Sample'
filename_column = 'patient.samples.sample.portions.portion.slides.slide.image_file_name'

# .copy() so the in-place rename below operates on an independent frame
# instead of a view of `clinical` (avoids pandas SettingWithCopyWarning and
# the risk of a silent no-op on a copy).
patients_filenames = clinical[[patient_column, filename_column]].copy()
patients_filenames.rename({filename_column: 'filename'}, axis=1, inplace=True)

# %%
# Lower-case key for the merge: the clinical table's filenames are joined
# against this, so they are presumably already lower-case — verify on data.
filenames_uuids['filename_lower'] = filenames_uuids['filename'].str.lower()

# %%
merged = pd.merge(patients_filenames, filenames_uuids, left_on='filename', right_on='filename_lower')

# %%
# Keep the original-case filename from the uuid table ('filename_y').
merged = merged[['Sample', 'filename_y', 'id']]
merged.rename({'filename_y': 'filename', 'id': 'uuid'}, axis=1, inplace=True)

# %%
merged.to_csv(DATA_PATH / 'slides_filename_uuid.csv', index=False)
# %%
......@@ -2,7 +2,6 @@ name: inf
channels:
- defaults
- bioconda
- pytorch
dependencies:
- python=3.6
- gsl
......@@ -21,11 +20,8 @@ dependencies:
- conda
- snakemake
- notebook
- scikit-image
- pytorch
- torchvision
- scikit-image
- black
- tensorboard
- bokeh
- umap-learn
prefix: //anaconda/envs/inf
......
from .augment import *
from .conversions import *
#from .loaders import *
from .tiles import *
#from .visualization import *
from .wsi import *
\ No newline at end of file
import numpy as np
from skimage import transform
from skimage import color
from skimage import draw
def rotate_img(img, masks=None, max_rotate = 90):
    """Rotate ``img`` by a random angle in [-max_rotate, max_rotate] degrees.

    Borders are filled with mode 'wrap' and the value range is preserved.
    When ``masks`` is given, every mask receives the identical rotation and
    (rotated_img, rotated_masks) is returned; otherwise just the image.
    """
    degrees = np.random.uniform(-1, 1) * max_rotate
    tf = transform.AffineTransform(rotation=np.pi * degrees / 180)
    rotated = transform.warp(img, inverse_map=tf, mode='wrap', preserve_range=True)
    if masks is None:
        return rotated
    rotated_masks = [transform.warp(m, inverse_map=tf, mode='wrap', preserve_range=True)
                     for m in masks]
    return rotated, rotated_masks
def shear_img(img, masks=None, max_shear = 10):
    """Shear ``img`` by a random angle in [-max_shear, max_shear] degrees.

    Borders are filled with mode 'wrap' and the value range is preserved.
    When ``masks`` is given, every mask receives the identical shear and
    (sheared_img, sheared_masks) is returned; otherwise just the image.
    """
    degrees = np.random.uniform(-1, 1) * max_shear
    tf = transform.AffineTransform(shear=np.pi * degrees / 180)
    sheared = transform.warp(img, inverse_map=tf, mode='wrap', preserve_range=True)
    if masks is None:
        return sheared
    sheared_masks = [transform.warp(m, inverse_map=tf, mode='wrap', preserve_range=True)
                     for m in masks]
    return sheared, sheared_masks
def rescale_img(img, masks=None, max_scale = [0.5, 2]):
    """Rescale ``img`` by independent random height/width factors.

    The two factors are drawn uniformly from [max_scale[0], max_scale[1]]
    (height first, then width, so results are reproducible under a fixed
    seed).  When ``masks`` is given, every mask is rescaled identically and
    (scaled_img, scaled_masks) is returned; otherwise just the image.
    """
    factor_h = np.random.uniform(max_scale[0], max_scale[1])
    factor_w = np.random.uniform(max_scale[0], max_scale[1])
    scaled = transform.rescale(img, (factor_h, factor_w), mode='wrap', preserve_range=True)
    if masks is None:
        return scaled
    scaled_masks = [transform.rescale(m, (factor_h, factor_w), mode='wrap', preserve_range=True)
                    for m in masks]
    return scaled, scaled_masks
def crop_img(img, masks=None, max_crop = [0.5, 0.5]):
    """Randomly crop ``img`` to a window keeping at least a fraction
    (1 - max_crop) of each dimension.

    The crop size and position are sampled once; when ``masks`` is given the
    identical window is applied to every mask and (cropped_img, cropped_masks)
    is returned, otherwise just the cropped image.
    """
    height, width = img.shape[0], img.shape[1]
    crop_h = int(np.random.uniform(int((1 - max_crop[0]) * height), height))
    crop_w = int(np.random.uniform(int((1 - max_crop[1]) * width), width))
    top = int(np.random.uniform(0, height - crop_h))
    left = int(np.random.uniform(0, width - crop_w))
    window = (slice(top, top + crop_h), slice(left, left + crop_w))
    cropped = np.copy(img)[window]
    if masks is None:
        return cropped
    return cropped, [m[window] for m in masks]
def flip_img(img, masks=None, flip_probs = [0.5, 0.5]):
    """Randomly mirror ``img`` horizontally and/or vertically.

    flip_probs: (P(horizontal flip), P(vertical flip)).  Both decisions are
    sampled once; when ``masks`` is given the identical flips are applied to
    every mask and (flipped_img, flipped_masks) is returned, otherwise just
    the image.
    """
    do_h = np.random.uniform(0, 1) < flip_probs[0]
    do_v = np.random.uniform(0, 1) < flip_probs[1]

    def mirror(arr):
        # Apply the sampled flip decisions to one array.
        if do_h:
            arr = arr[:, ::-1]
        if do_v:
            arr = arr[::-1, :]
        return arr

    flipped = mirror(np.copy(img))
    if masks is None:
        return flipped
    return flipped, [mirror(m) for m in masks]
#%% greyscale augment
def bright_contrast(img, max_scale=0.01):
    """Random brightness/contrast jitter.

    Min-max normalises ``img`` to [0, 1], then compresses it into a random
    sub-interval [shift, shift + scale] of [0, 1].
    NOTE(review): despite its name, ``max_scale`` is the *lower* bound of the
    scale draw.  The two corner pixels are deliberately pinned to 0 and 1 so
    the full dynamic range stays represented.
    """
    lo = np.min(img)
    hi = np.max(img)
    scale = np.random.uniform(max_scale, 1)
    shift = np.random.uniform(0, 1 - scale)
    out = (np.copy(img) - lo) / (hi - lo) * scale + shift
    out[0, 0] = 0
    out[-1, -1] = 1
    return out
def invert(img, inv_prob=0.5):
    """With probability ``inv_prob``, return the image reflected about 1
    (x -> 1 - x); otherwise return an unchanged copy of the input."""
    flip_values = np.random.random() < inv_prob
    out = np.copy(img)
    return 1 - out if flip_values else out
#%% morphological augment
def random_circle(img, max_radius = 0.5, alpha=0.25):
    """Paint a randomly placed, randomly coloured translucent disc onto a copy of ``img``.

    Works in HSV space: pixels inside the disc get a random hue, low
    saturation, and a partially inverted value channel (an artefact-style
    augmentation).

    img: RGB image of shape (H, W, 3) — required by rgb2hsv.
    max_radius: maximum disc radius as a fraction of max(H, W).
    alpha: strength of the value-channel modification inside the disc.
    """
    img_ = np.copy(img)
    h,w = img_.shape[0], img_.shape[1]
    random_col = np.random.random()
    random_r = np.random.randint(0, h)
    random_c = np.random.randint(0, w)
    # NOTE(review): the float lower bound 0.5 passed to np.random.randint is
    # truncated to 0, so a zero radius is possible — a minimum of 1 was
    # probably intended; confirm.
    random_radius = np.random.randint(0.5, int(np.max([h,w])*max_radius))
    # NOTE(review): skimage.draw.circle was removed in scikit-image >= 0.19
    # (replaced by draw.disk); this code pins the project to an older skimage.
    rr,cc = draw.circle(r = random_r, c = random_c, radius = random_radius, shape=(h, w))
    img_h = color.rgb2hsv(img_)
    img_h[rr,cc, 0] = random_col                  # hue
    img_h[rr,cc, 1] = 0.1                         # saturation
    img_h[rr,cc, 2] = 1 - img_h[rr,cc, 2]*alpha   # value
    #
    out = color.hsv2rgb(img_h)
    return(out)
#%% color augment
def generate_3d():
    """Generate a uniformly distributed 3D random rotation matrix
    (Arvo-style construction: z-rotation followed by a negated Householder
    reflection).

    Uses np.ndarray instead of the deprecated np.matrix class; the result is
    numerically identical and the visible caller (rotate_colors) already
    wraps the return value in np.array.

    Returns:
        np.ndarray: a 3x3 orthogonal matrix with determinant +1.
    """
    x1, x2, x3 = np.random.rand(3)
    # Rotation about the z axis by angle 2*pi*x1.
    R = np.array([[np.cos(2 * np.pi * x1), np.sin(2 * np.pi * x1), 0],
                  [-np.sin(2 * np.pi * x1), np.cos(2 * np.pi * x1), 0],
                  [0, 0, 1]])
    # Random unit vector defining the Householder reflection H = I - 2 v v^T.
    v = np.array([np.cos(2 * np.pi * x2) * np.sqrt(x3),
                  np.sin(2 * np.pi * x2) * np.sqrt(x3),
                  np.sqrt(1 - x3)])
    H = np.eye(3) - 2 * np.outer(v, v)
    # Negating the reflection (det -1) times R (det +1) yields det +1.
    M = -H @ R
    return M
def rotate_colors(img):
    """Apply a random 3D rotation to the colour vectors of ``img`` (last axis
    of size 3), then min-max rescale the result back into [0, 1]."""
    rotation = np.array(generate_3d())
    rotated = np.dot(img, rotation)
    lo = np.min(rotated)
    hi = np.max(rotated)
    return (rotated - lo) / (hi - lo)
#%%
def data_augment(img, masks=None,
                 max_rotate=30,
                 max_shear=5,
                 max_scale = [0.8, 1.2],
                 max_crop=[0.2, 0.2],
                 flip_probs=[0.5, 0.5]):
    """Apply the full augmentation chain (rotate, shear, rescale, flip, crop)
    to ``img`` and, with identical random parameters, to every mask in
    ``masks``.

    Returns (augmented_img, augmented_masks); ``augmented_masks`` is None
    when ``masks`` is None.
    """
    # Bug fix: each augmentation function returns only the image when its
    # masks argument is None, so the original tuple unpacking crashed for
    # masks=None.  Thread an empty mask list through the chain instead and
    # restore None at the end.
    no_masks = masks is None
    masks_ = [] if no_masks else masks
    img_, masks_ = rotate_img(img, masks_, max_rotate=max_rotate)
    img_, masks_ = shear_img(img_, masks_, max_shear=max_shear)
    img_, masks_ = rescale_img(img_, masks_, max_scale)
    img_, masks_ = flip_img(img_, masks_, flip_probs=flip_probs)
    img_, masks_ = crop_img(img_, masks_, max_crop=max_crop)
    return (img_, None if no_masks else masks_)
import numpy as np
from PIL import Image
def numpy2PIL(img):
    """Convert a numpy array to a PIL image, min-max rescaling to 0-255.

    Constant images (max == min) are left unscaled before the uint8 cast.
    """
    lo = np.min(img)
    hi = np.max(img)
    if (hi - lo) != 0:
        img = (img - lo) / (hi - lo) * 255
    return Image.fromarray(np.uint8(img))
def PIL2numpy(img):
    """Convert a PIL image (or any array-like) to a new numpy array."""
    return np.array(img)
import numpy as np
import os
import pandas as pd
from PIL import Image
#import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import skimage
from .conversions import numpy2PIL
from .augment import data_augment
# TODO: use torchvision functional api for a unique transform for input (and target if it exists)
#%%
def __transform__(image, phase, size):
    """Default torchvision preprocessing.

    'train': random rotation/crop/flips, then ToTensor + ImageNet
    normalisation; any other phase: center crop, then ToTensor +
    normalisation.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    if phase == 'train':
        steps = [
            transforms.RandomRotation((-180, 180)),
            transforms.RandomCrop(size),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
        ]
    else:
        steps = [transforms.CenterCrop(size)]
    steps += [transforms.ToTensor(), normalize]
    return transforms.Compose(steps)(image)
class CSVLabelsLoader(Dataset):
    '''
    Load images and labels. Use when images are in a unique folder and labels are in a csv file (see specs below).

    image_dir: folder where all the images are stored
    labels_file: csv file with two columns without header.
        The first column should contain the filenames of the images WITH extension.
        The second column should contain the label of each image.
    transform: callable (image, phase, size) -> transformed image; defaults to __transform__.
    size: crop size forwarded to the transform.
    phase: 'train', 'valid' or 'test'.  In 'test' phase no labels file is
        needed: samples are listed from image_dir and every label is None.
    seed: seed for the reproducible shuffle of the samples.
    '''

    def __init__(self, image_dir, labels_file=None, transform=None, size=224, phase='train', seed=0):
        np.random.seed(seed)
        assert phase in ['train', 'valid', 'test'], "Phase should be either 'train' or 'valid' or 'test'."
        self.phase = phase
        self.size = size
        self.image_dir = image_dir

        if self.phase != 'test':
            assert labels_file is not None, "Labels file should be provided"
            label_data = pd.read_csv(labels_file, header=None)
            samples = np.array(label_data.iloc[:, 0]).astype(str)
            labels = np.array(label_data.iloc[:, 1])
        else:
            samples = np.array(os.listdir(self.image_dir))
            labels = np.repeat(None, len(samples))

        # Shuffle samples and labels with the same permutation so they stay aligned.
        idx_shuffle = np.random.permutation(len(samples))
        self.samples = samples[idx_shuffle]
        self.labels = labels[idx_shuffle]

        self.transform = __transform__ if transform is None else transform

    def __len__(self):
        # Bug fix: was len(self.files) — an attribute that never existed.
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        img_path = os.path.join(self.image_dir, sample)
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image, self.phase, self.size)
        # Bug fix: labels used to be looked up by filename via np.where, which
        # is O(n) and returns the wrong label when a filename appears twice;
        # samples and labels share one shuffle, so idx indexes both.
        label = self.labels[idx]
        return {'image': image, 'label': label, 'sample': sample}
class FolderLabelsLoader(Dataset):
    '''
    Load images and labels. Use when images are in separate folders and each folder is a class.

    root_dir: folder containing one sub-folder per class
    classes: selected classes (default: all sub-folders of root_dir, sorted)
    transform: callable (image, phase, size) -> transformed image; defaults to __transform__.
    size: crop size forwarded to the transform.
    phase: 'train', 'valid' or 'test'.
    seed: seed for the reproducible shuffle of the samples.

    Labels are the string indices ('0', '1', ...) of the classes within
    self.classes; the original folder name is kept in self.original_classes.
    '''

    def __init__(self, root_dir, classes=None, transform=None, size=224, phase='train', seed=0):
        self.root_dir = root_dir
        assert phase in ['train', 'valid', 'test'], "Phase should be either 'train' or 'valid' or 'test'."
        self.phase = phase
        self.size = size

        if classes is None:
            self.classes = np.array(sorted(os.listdir(root_dir)))
        else:
            self.classes = classes

        self.transform = __transform__ if transform is None else transform

        np.random.seed(seed)
        files = []
        labels = []
        original_classes = []
        for i, CLASS in enumerate(self.classes):
            # Bug fix: the original os.chdir'd into root_dir/CLASS, mutating
            # the process-wide working directory (and breaking with a relative
            # root_dir after the first iteration).  List the directory directly.
            files_class = np.array(os.listdir(os.path.join(self.root_dir, CLASS)))
            files.append(['{}/{}'.format(CLASS, f) for f in files_class])
            labels.append(np.repeat(str(i), len(files_class)))
            original_classes.append(np.repeat(CLASS, len(files_class)))

        files = np.concatenate(files)
        labels = np.concatenate(labels)
        original_classes = np.concatenate(original_classes)

        # Shuffle everything with the same permutation so arrays stay aligned.
        idx_shuffle = np.random.permutation(len(files))
        self.samples = files[idx_shuffle]
        self.labels = labels[idx_shuffle]
        self.original_classes = original_classes[idx_shuffle]

    def __len__(self):
        # Bug fix: was self.dataset.shape[0] — an attribute that never existed.
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        label = self.labels[idx]
        original_class = self.original_classes[idx]
        img_path = os.path.join(self.root_dir, sample)
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image, self.phase, self.size)
        return {'image': image, 'label': label, 'sample': sample, 'class': original_class}
class MaskLabelsLoader(Dataset):
    """Load image/mask pairs laid out as root_dir/<SAMPLE>/images/<SAMPLE>.png
    and root_dir/<SAMPLE>/labels/<SAMPLE>.png.

    Bug fix: this was declared with ``def`` instead of ``class``, so calling
    MaskLabelsLoader(...) built and discarded its methods and returned None.

    root_dir: directory with one sub-directory per sample.
        NOTE(review): paths are built by string concatenation, so root_dir is
        expected to end with a path separator — confirm with callers.
    size: images are resized to (size, size).
    phase: 'train' (augment + binarise mask), 'valid' (image + mask) or
        'test' (image only, alpha channel stripped).
    seed: numpy random seed.
    """

    def __init__(self, root_dir, size = 256, phase = 'train', seed=0):
        self.ROOT = root_dir
        self.size = size
        self.phase = phase
        np.random.seed(seed)
        self.samples = os.listdir(root_dir)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        SAMPLE = self.samples[idx]
        SAMPLE_PATH = self.ROOT + SAMPLE
        img = skimage.io.imread(SAMPLE_PATH + '/images/' + SAMPLE + '.png')
        if self.phase == 'test':
            img = img[:, :, :3]  # drop the alpha channel
            img = numpy2PIL(img)
            img, size = self._to_tensor(img)
            sample = {'image': img, 'sample': SAMPLE, 'size': size}
        elif self.phase == 'valid':
            labels = skimage.io.imread(SAMPLE_PATH + '/labels/' + SAMPLE + '.png')
            img = numpy2PIL(img)
            labels = numpy2PIL(labels)
            img, size, labels = self._to_tensor(img, labels)
            sample = {'image': img, 'mask': labels, 'sample': SAMPLE, 'size': size}
        elif self.phase == 'train':
            labels = skimage.io.imread(SAMPLE_PATH + '/labels/' + SAMPLE + '.png')  # 0-255
            img, labels = data_augment(img, [labels],
                                       max_rotate=10,
                                       max_shear=0,
                                       max_scale = [0.8, 1.2],
                                       max_crop=[0.1, 0.1],
                                       flip_probs=[0.5, 0.5])
            labels = labels[0]
            # Binarise the interpolated mask: below 90% of its max is background.
            th = np.max(labels) * 0.9  # threshold_otsu(labels)
            labels[labels < th] = 0
            labels[labels >= th] = 1
            img = numpy2PIL(img)
            labels = numpy2PIL(labels)
            img, size, labels = self._to_tensor(img, labels)
            sample = {'image': img, 'mask': labels, 'sample': SAMPLE, 'size': size}
        else:
            print('Phase not implemented')
            sample = {'image': None, 'mask': None, 'sample': None, 'size': None}
        return sample

    def _to_tensor(self, image, labels=None):
        """Resize to (size, size) with nearest-neighbour interpolation and
        convert to tensor; also returns the original (h, w)."""
        w, h = image.size
        image = transforms.Compose([transforms.Resize((self.size, self.size), interpolation = Image.NEAREST),
                                    transforms.ToTensor()])(image)
        # Bug fix: compared with `!= None`; identity check is the correct test.
        if labels is not None:
            labels = transforms.ToTensor()(labels)
            return image, (h, w), labels
        return image, (h, w)
#%%
# Ad-hoc interactive smoke test for FolderLabelsLoader — not production code:
# the path below is a placeholder ('...') and must point to a real
# class-per-folder image directory before this cell can run.
import matplotlib.pyplot as plt
loader = FolderLabelsLoader('.../GTEx_images_tissues/', phase='train')
sample = loader[4]
# Show one channel of the tensor returned by the transform as a grayscale image.
plt.imshow(sample['image'].numpy()[2,:,:], 'gray')
print(sample['label'])
print(sample['sample'])
print(sample['class'])
import numpy as np
import skimage.morphology as morph
from skimage.filters import threshold_otsu
from skimage import color
from scipy import ndimage, linalg
# import colorsys
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# %% general utilities
def has_enough_tissue(image, threshold=0.8, near_zero_var_threshold=0.1):