Commit 0bd7b229 authored by Alessia Marcolini's avatar Alessia Marcolini
Browse files

Remove unused files

parent 8a2ddba5
%% Cell type:code id: tags:
``` python
import os
PATH = os.getcwd()
import sys
import numpy as np
import pandas as pd
import SimpleITK as sitk
from dicom_utils import augmentation as aug, processing as dup
from dataset import NumpyCSVDataset
#%%
```
%% Cell type:code id: tags:
``` python
# Expose only GPUs 0 and 1 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
```
%% Cell type:code id: tags:
``` python
# Size argument forwarded to NumpyCSVDataset below.
SIZE = 64
# Directory holding the original .npy training samples.
DATASETDIR = f"{PATH}/data/processed/bbox_fixed2_64_TRAIN"
# FIRST SOLUTION (kept for reference, disabled)
#OUTPUT_DIR = f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned'
#os.makedirs( f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned' , exist_ok=False)
# SECOND SOLUTION (active): destination directory for the cloned samples
OUTPUT_DIR = f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_buildingclones'
#os.makedirs( f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_buildingclones' , exist_ok=False)
# Dataset over DATASETDIR, labelled from the "Locoregional" column of labels.csv.
dataset = NumpyCSVDataset(DATASETDIR , f"{PATH}/data/labels.csv" , "Locoregional", SIZE , mode="test")
```
%% Cell type:markdown id: tags:
### Soluzione
%% Cell type:code id: tags:
``` python
# Goal: rebalance the classes by oversampling on disk.
# Negative samples are copied once; positive samples are copied K times,
# where K is the imbalance ratio n_NEGATIVE / n_POSITIVE (rounded down).
# Each copy gets a numeric suffix appended to the subject name: without it,
# every copy of the same subject would overwrite the previous one.
#
# Terminology:
#   sample   - dict returned by NumpyCSVDataset, containing
#                sample['sample']: the subject name
#                sample['data']:   the 3D image (grey values of every voxel)
#                sample['target']: the label (0 or 1)
#   XXX.npy  - binary file storing one 3D image (= sample['data'])
#   dataset  - NumpyCSVDataset object; behaves like a list of `sample` dicts,
#              one per file in DATASETDIR. The class loads the .npy files,
#              derives the subject name from the file name, looks up the
#              label in labels.csv and bundles everything into the dict.

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Compute K.
labels = dataset._labels.values
idx_positive = np.where(labels == 1)[0]
n_positive = len(idx_positive)
n_negative = len(labels) - n_positive
# Clamp to at least 1: if positives outnumber negatives the truncated ratio
# would be 0 and the positive samples would not be copied at all. With no
# positives K is never used, so any value avoids the division by zero.
K = max(1, n_negative // n_positive) if n_positive else 1

for i in range(len(dataset)):
    sample = dataset[i]
    SUB = sample['sample']
    print(SUB)
    image_orig = sample['data']
    label = sample['target']
    # Negative: copy once (ratio=1); positive: copy K times (ratio=K).
    ratio = 1 if label == 0 else K
    # The loop index doubles as the file-name suffix (0, 1, ..., ratio-1).
    for id_image in range(ratio):
        # No augmentation on purpose: the copies are exact clones.
        np.save(f'{OUTPUT_DIR}/{SUB}_{id_image}.npy', image_orig)
```
%% Cell type:markdown id: tags:
## ================
%% Cell type:markdown id: tags:
PRIMA SOLUZIONE
%% Cell type:markdown id: tags:
Copy each file from one folder to another
%% Cell type:code id: tags:
``` python
# Copy every sample's image into OUTPUT_DIR, one file per subject.
# Only the image array (sample['data']) is written. Saving the whole `sample`
# dict stores a pickled object with an empty shape, which cannot be read back
# as an image (loading it fails on `image.shape[0]`).
for i in range(len(dataset)):
    sample = dataset[i]
    subject = sample['sample']
    np.save(f'{OUTPUT_DIR}/{subject}.npy', sample['data'])
```
%% Cell type:markdown id: tags:
Clone only the positive samples, one extra copy each
%% Cell type:code id: tags:
``` python
# Write one extra copy of every positive sample, tagged with the `_cl3` suffix.
labels = dataset._labels.values
idx_positive = np.where(labels == 1)[0]
for idx in idx_positive:
    sample = dataset[idx]
    image_orig = sample['data']
    subject = sample['sample']
    # Save the image array only. A second save of the whole `sample` dict to
    # the same path used to overwrite this file, undoing the correct write.
    np.save(f'{OUTPUT_DIR}/{subject}_cl3.npy', image_orig)
```
%% Cell type:markdown id: tags:
SECONDA SOLUZIONE
%% Cell type:markdown id: tags:
"building" new files
%% Cell type:code id: tags:
``` python
# Rebuild each sample file in OUTPUT_DIR from the loaded dataset entry.
# The file must contain the image array itself; the previous version saved a
# set of (key, value) tuples taken from the sample dict, which is not an image.
for i in range(len(dataset)):
    # Load the sample once (it was previously fetched twice per iteration).
    sample = dataset[i]
    subject = sample['sample']
    np.save(f'{OUTPUT_DIR}/{subject}.npy', sample['data'])
```
%% Cell type:markdown id: tags:
CHECK SOLUZIONI
%% Cell type:code id: tags:
``` python
# Reload the generated files as a dataset to check them.
# Alternative target (first solution), disabled:
#newdataset = NumpyCSVDataset( f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned', f"{PATH}/data/labels.csv" , "Locoregional", SIZE , mode="test")
# Active target (second solution):
newdataset = NumpyCSVDataset( f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_buildingclones', f"{PATH}/data/labels.csv" , "Locoregional", SIZE , mode="test")
```
%% Cell type:code id: tags:
``` python
# Index of the first positive sample; the original note records it as 5.
idx_positive[0]  # 5
# Label of that sample in the original dataset (index hard-coded from the value above).
dataset[ 5 ]['target']
```
%% Cell type:code id: tags:
``` python
# Same lookup on the regenerated dataset. In the recorded run below this
# raised IndexError inside augment_3D at `image.shape[0]`, i.e. the loaded
# file had an empty shape.
newdataset[ 5 ]['target']
```
%% Output
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-9-1c6cfa41156c> in <module>()
----> 1 newdataset[ 5 ]['target']
~/projects/networks_dami/dataset.py in __getitem__(self, idx, no_data)
46 data = np.load(data_file)
47
---> 48 data = self.augmentation(data, self.mode, self.size) #qst è numpy
49 data = torch.Tensor(data) #qst è tensor - la network vuole tensor
50 output = {'data': data, 'target': label, 'sample': sample}
~/projects/networks_dami/dataset.py in augment_3D(image, mode, size)
8
9 def augment_3D(image, mode, size):
---> 10 N_CHANNELS = image.shape[0]
11 image_seq = [sitk.GetImageFromArray(image[i,:,:,:]) for i in range(N_CHANNELS)]
12
IndexError: tuple index out of range
%% Cell type:code id: tags:
``` python
# Dataset over a sub-directory of the "cloned" output.
# NOTE(review): the trailing 'bla' path component looks like a placeholder - confirm.
cloned = NumpyCSVDataset( f'{PATH}/data/processed/bbox_fixed2_64_TRAIN_cloned/bla' ,
f"{PATH}/data/labels.csv" , "Locoregional", SIZE , mode="test")
```
%% Cell type:code id: tags:
``` python
# Display the dataset's `size` attribute.
# NOTE(review): presumably the SIZE passed to the constructor, not the number of samples - confirm.
cloned.size
```
%% Cell type:markdown id: tags:
DA COMMAND LINE:
%% Cell type:code id: tags:
``` python
'''
dsalvalai@Starscream:~$ cp -r bbox_fixed2_64_TRAIN bbox_fixed2_64_TRAIN_cloned
dsalvalai@Starscream:~$ cp HN-CHUS-003.npy HN-CHUS-003_cl1.npy
'''
```
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score as acc, precision_score as prec, recall_score as rec, matthews_corrcoef as mcc
from sklearn.metrics import confusion_matrix
#%%
# Load the predictions CSV and pull out the 'true' and 'pred' columns as arrays.
y = pd.read_csv('/home/bizzego/Downloads/TT/TT_merged_features_B_UNCORR_selected_SVMlinear/predictions_test.csv')
y_true, y_pred = y['true'].values, y['pred'].values
#%%
# Print the classification report, then accuracy and Matthews correlation
# coefficient on one line.
report = classification_report(y_true, y_pred)
print(report)
accuracy = acc(y_true, y_pred)
matthews = mcc(y_true, y_pred)
print(accuracy, matthews)
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
from IPython import get_ipython
# %% [markdown]
# ## Deep features extraction
# %%
# NOTE(review): runs a line magic named 'HN_env'. This looks like a stray
# `%HN_env` line captured when the notebook was exported - confirm such a
# magic is actually registered, otherwise this call fails.
get_ipython().run_line_magic('HN_env', '')
# %%
import os
# Absolute path of the current working directory; used as the project root below.
PATH = os.path.abspath(os.path.curdir)
# %%
# Enable the autoreload extension in mode 2.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# %% [markdown]
# ### Import
# %%
import os
import sys
from tqdm import tqdm
import numpy as np
import pandas as pd
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from networks import CiompiDO
from dataset import NumpyCSVDataset, augment_3D_HN
# %%
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Flag for multi-GPU setups.
multigpu = True
# %%
# Input samples and root directory of the experiments.
DATASETDIR = '/thunderdisk/HN/processed/bbox_64_augmented_LR'
EXPERIMENT_DIR = f'{PATH}/experiments'
# %%
# Experiment whose trained weights are used, and where the features are written.
MODEL_NAME = 'LR_noTx_branch_wise_free_aug_CT_20191027-124913'
SIZE = 64
OUTDIR = f'{EXPERIMENT_DIR}/{MODEL_NAME}/features/'
OUTFILE = 'features_noTx_AUG.csv'
# Create the output directory if it does not exist yet.
os.makedirs(OUTDIR, exist_ok=True)
# %%
# Dataset labelled from the 'Locoregional' column of clinical_data_noTx.csv.
dataset = NumpyCSVDataset(DATASETDIR , f'{PATH}/data/clinical_data_noTx.csv' , 'Locoregional', SIZE , mode='test')
# Unshuffled loader that keeps the last partial batch, so every sample is visited once in dataset order.
loader = DataLoader(dataset, batch_size=8, num_workers=12, pin_memory=True, shuffle=False, drop_last=False)
# Path of the trained weights for MODEL_NAME.
model_weights = f'{EXPERIMENT_DIR}/{MODEL_NAME}/weights.pth'
# %%
# Build the network, move it to the selected device and load the trained weights.
# The earlier nn.DataParallel wrap was undone on the next line via `.module`,
# so its only lasting effect was the device move - and only when `multigpu`
# was true. The move is now unconditional and the flag is not consulted here.
model = CiompiDO(n_classes=2, n_channels=1, modality='CT')
model = model.to(device)
# map_location lets weights saved from a GPU run load on a CPU-only host too.
model.load_state_dict(torch.load(model_weights, map_location=device))
# Inference only: put layers such as dropout/batch-norm into evaluation mode
# so the extracted features are deterministic.
model.eval()
# %%
#%%
# Run every batch through the network and collect, per batch, the extracted
# features together with the sample names, labels and patient identifiers.
deep_features = []
sample_names = []
labels = []
patients = []
# Gradients are not needed for feature extraction.
with torch.no_grad():
    for batch in tqdm(loader):
        # Keep the part of each file name before the first dot.
        names_batch = [name.split('.')[0] for name in batch['filename']]
        images_batch = batch['data'].to(device)
        labels_batch = batch['target']
        patients_batch = batch['patient']
        # The batch is already on `device`; a hard-coded .cuda() here would
        # break the CPU fallback selected when `device` was defined.
        out = model.extract_features(images_batch)
        deep_features.append(out.detach().cpu().numpy())
        sample_names.append(names_batch)
        labels.append(labels_batch)
        patients.append(patients_batch)
# %%
# Merge the per-batch chunks of each collected quantity into a single array.
deep_features, sample_names, labels, patients = (
    np.concatenate(chunks)
    for chunks in (deep_features, sample_names, labels, patients)
)
# %%
# Total number of labels collected.
len(labels)