Commit 843f65e4 authored by MattiaPujatti

Add docker interaction

parent 86049b46
[data_generation]
experiment = ["Prova 4 - BN90 - HF Sbilanciato/4_1 - BN90 - HF Sbilanciato 24h"]
rebuild_acquisition = [true]
rebuild_npy = true
version = ["p4_v1"]
is_resolver = [false]
split = ["standard"]
split_step = 7
train_val_pattern = ["p4_v1"]
test_pattern = ["p4_v1"]
classes_faulty = [1]
classes_healthy = [0]
[components]
feature = "tensionemotore"
feat_loc = "fase1"
[data_processing]
process = "raw"
[mfcc_params]
mfcc_length = 512
sample_rate = "sample_rate"
NFFT = 8192
frame_size = "mfcc_length / sample_rate"
frame_stride = "mfcc_length / sample_rate"
pre_emphasis = 0.9
nfilt = 50
num_ceps = 40
cep_lifter = 22
x1 = 5190
x2 = 1400
normalize = false
use_custom = false
signal_split = "sample_rate"
[stft_params]
nperseg = "sample_rate"
[raw_params]
length = "sample_rate"
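# Note: string values such as "sample_rate" and "mfcc_length / sample_rate" are
# placeholders, not literals; the preprocessing code fills in the acquisition's
# runtime sample rate before use (see compute_raw / compute_stft / compute_mfcc).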
[paths]
experiment_folder = "/home/data"
input = "/home/data/input/hf_4_1_24h/tensionemotore_fase1_raw/"
[filenames]
x_train = "x_train.npy"
y_train = "y_train.npy"
idx_train = "idx_train.npy"
x_val = "x_val.npy"
y_val = "y_val.npy"
idx_val = "idx_val.npy"
x_test = "x_test.npy"
y_test = "y_test.npy"
idx_test = "idx_test.npy"
x_inference = "x_test.npy"
y_inference = "y_test.npy"
idx_inference = "idx_test.npy"
prediction = "prediction.csv"
split_config = "split_config.json"
[__docker__]
options = "--rm -it"
volumes = []
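# Illustrative only: host/container mounts would be listed here, e.g.
# volumes = ["/path/on/host/data:/home/data"]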
[__udocker__]
# options =
[data_generation]
experiment = ["Prova 4 - BN90 - HF Sbilanciato/4_2 - BN90 - HF Sbilanciato 120h", "Prova 5 - MP105 - HF Cuscinetto/5_3 - MP105 - HF Cuscinetto 120h", "Prova 6 - BN90 - HF Cuscinetto/6_3 - BN90 - HF Cuscinetto 120h - partial - A", "Prova 6 - BN90 - HF Cuscinetto/6_5 - BN90 - HF Cuscinetto 120h - partial - B"]
rebuild_acquisition = [true, false, true, true]
rebuild_npy = true
version = ["p4_v2", "p5_v3", "p6_v3", "p6_v5"]
is_resolver = [false, false, false, false]
split = ["standard", "standard", "standard", "standard"]
split_step = 7
train_val_pattern = ["p4_v2", "p5_v3", "p6_v3", "p6_v5"]
test_pattern = ["p4_v2", "p5_v3", "p6_v3", "p6_v5"]
classes_faulty = [1, 1, 1, 1]
classes_healthy = [0, 0, 0, 0]
[components]
feature = "vibrazione"
feat_loc = "motore"
[data_processing]
process = "raw"
[mfcc_params]
mfcc_length = 512
sample_rate = "sample_rate"
NFFT = 8192
frame_size = "mfcc_length / sample_rate"
frame_stride = "mfcc_length / sample_rate"
pre_emphasis = 0.9
nfilt = 50
num_ceps = 40
cep_lifter = 22
x1 = 5190
x2 = 1400
normalize = false
use_custom = false
signal_split = "sample_rate"
[stft_params]
nperseg = "sample_rate"
[raw_params]
length = "sample_rate"
[paths]
experiment_folder = "/home/data"
input = "/home/data/input/hf_p4p5p6_120h/raw/"
[filenames]
x_train = "x_train.npy"
y_train = "y_train.npy"
idx_train = "idx_train.npy"
x_val = "x_val.npy"
y_val = "y_val.npy"
idx_val = "idx_val.npy"
x_test = "x_test.npy"
y_test = "y_test.npy"
idx_test = "idx_test.npy"
x_inference = "x_test.npy"
y_inference = "y_test.npy"
idx_inference = "idx_test.npy"
prediction = "prediction.csv"
split_config = "split_config.json"
[__docker__]
options = "--rm -it"
volumes = []
[__udocker__]
# options =
from extract_features import extract_mfcc
from tqdm import tqdm
import numpy as np
from scipy.signal import stft

def preprocess_dataset(acquisition_list, params, process):
    if process == "mfcc":
        return compute_mfcc(acquisition_list, params)
    elif process == "stft":
        return compute_stft(acquisition_list, params)
    elif process == "raw":
        return compute_raw(acquisition_list, params)
    # Fail loudly instead of silently returning None for an unknown process
    raise ValueError(f"Unknown process '{process}', expected 'mfcc', 'stft' or 'raw'")

def compute_raw(acquisitions, raw_params):
    length = raw_params["length"]
    if not isinstance(length, int):
        # The config stores the placeholder string "sample_rate"; fall back to
        # the runtime sample rate filled in by the caller.
        length = raw_params["sample_rate"]
    for acq in tqdm(acquisitions):
        data = acq.values
        windows_ls = []
        for i in range(0, len(data) - length, length):
            windows_ls.append(data[i : i + length])
        if not windows_ls:
            # Signals shorter than two windows produce an empty loop above;
            # keep at least the first window so the acquisition is not lost.
            windows_ls.append(data[:length])
        # Shape must be #samples x #channels x #signal
        acq.values = np.expand_dims(np.stack(windows_ls), axis=1).astype(np.float32)
    return acquisitions
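
# Worked example: with length = 1_000 and a 10_000-sample signal, the loop above
# yields windows starting at 0, 1_000, ..., 8_000 (nine full windows), so the
# stacked array has shape (9, 1, 1_000); a 1_000-sample signal yields a single
# window of shape (1, 1, 1_000) via the fallback branch.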

def compute_stft(acquisitions, stft_params):
    if not isinstance(stft_params["nperseg"], int):
        stft_params["nperseg"] = stft_params["sample_rate"]
    for acq in tqdm(acquisitions):
        _, _, fourier_transform = stft(
            x=acq.values,
            fs=stft_params["sample_rate"],
            nperseg=stft_params["nperseg"],
            noverlap=None,
            nfft=None,
        )
        # suggested shape is #samples x 1 x #segment_length
        acq.values = np.expand_dims(fourier_transform.T, axis=1).astype(np.csingle)
    return acquisitions
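
# Note on shapes: for a 1-D real input, scipy.signal.stft returns Zxx with shape
# (nperseg // 2 + 1, n_segments) (noverlap defaults to nperseg // 2), so the
# transpose plus expand_dims above yields (n_segments, 1, nperseg // 2 + 1).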

def compute_mfcc(acquisitions, mfcc_params):
    # Complete the params with the missing values
    sample_rate = mfcc_params["sample_rate"]
    mfcc_params["frame_size"] = mfcc_params["mfcc_length"] / sample_rate
    mfcc_params["frame_stride"] = mfcc_params["mfcc_length"] / sample_rate
    # Extract the "use_custom" parameter
    use_custom = mfcc_params["use_custom"]
    keys_to_remove = ["use_custom"]
    # Care that the key "NFFT" is set lower case by the ConfigParser
    if "nfft" in mfcc_params:
        mfcc_params["NFFT"] = mfcc_params.pop("nfft")
    # Not all the acquisitions have the same length, so we compute the MFCC
    # across fixed-size windows of the signal
    length = mfcc_params["signal_split"]
    if use_custom:
        keys_to_remove.append("mfcc_length")
        keys_to_remove.append("signal_split")
    if not isinstance(length, int):
        length = sample_rate
    # extract_mfcc does not need some keys in the params dict
    selected_params = {k: mfcc_params[k] for k in mfcc_params if k not in keys_to_remove}
    for acq in tqdm(acquisitions, desc="Computing the MFCC"):
        data = acq.values
        mfcc_ls = []
        for i in range(0, len(data) - length, length):
            signal = data[i : i + length]
            mfcc = extract_mfcc(signal=signal, params=selected_params, use_custom=use_custom)
            mfcc_ls.append(mfcc)
        # Shape must be #samples x #channels x #rows x #columns
        acq.values = np.expand_dims(np.stack(mfcc_ls), axis=1).astype(np.float32)
    return acquisitions
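
# Worked example: with mfcc_length = 512 and a (hypothetical) sample rate of
# 25_600 Hz, frame_size = frame_stride = 512 / 25_600 = 0.02 s, i.e.
# non-overlapping 20 ms frames; since signal_split is the placeholder string
# "sample_rate", the MFCCs are computed over one-second windows (25_600 samples).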
import argparse
import json
import os
import pickle
import numpy as np
from config_parser import ConfigParser
from load_h5 import read_hf
from tqdm import tqdm
from test_data_processing import preprocess_dataset
from test_split import split_dataset

def main(config):
    """
    Create the dataset for multiclass classification for the HF experiment.

    Parameters
    ----------
    config: ConfigParser = configuration read by the dlsetup. In this framework only the
        data_generation section is used; the allowed parameters are:
        experiment: list of folders containing HF data (can be more than one!)
        rebuild_acquisition: list of booleans telling whether the acquisitions must be recomputed. One for each experiment
        rebuild_npy: boolean telling whether the numpy data have to be recomputed
        version: list of strings identifying the name of the acquisition file. One for each experiment
        is_resolver: list of booleans. One for each experiment
        split: list of strings. One for each experiment. See **split_dataset**
        split_step: int. See **split_dataset**
        mfcc_length, use_custom: see the mfcc_params section and **compute_mfcc**
        train_val_pattern: which experiments must be included in the train set. One for each experiment
        test_pattern: which experiments must be included in the test set. One for each experiment
        classes_faulty: class labels for faulty data. One for each experiment
        classes_healthy: class labels for healthy data. One for each experiment
    """
    experiment = config["data_generation"]["experiment"]
    rebuild_acquisition = config["data_generation"]["rebuild_acquisition"]
    rebuild_npy = config["data_generation"]["rebuild_npy"]
    version = config["data_generation"]["version"]
    is_resolver = config["data_generation"]["is_resolver"]
    split = config["data_generation"]["split"]
    split_step = config["data_generation"]["split_step"]
    train_val_pattern = config["data_generation"]["train_val_pattern"]
    test_pattern = config["data_generation"]["test_pattern"]
    classes_faulty = config["data_generation"]["classes_faulty"]
    classes_healthy = config["data_generation"]["classes_healthy"]
    feature = config["components"]["feature"]
    feat_loc = config["components"]["feat_loc"]
    process = config["data_processing"]["process"]
    procs_params = config[process + "_params"]
    experiment_folder = config["paths"]["experiment_folder"]
    input_folder = config["paths"]["input"]
    for i, e in enumerate(experiment):
        folder = os.path.join(experiment_folder, e)
        if rebuild_acquisition[i]:
            print(f"Building acquisition for experiment {e}, it might take a long time")
            setup, faulty, healthy, ciclo = read_hf(
                folder,
                e,
                preprocessed_sample_rate=1,
                inverter_sample_rate=1,
                resolver=is_resolver[i],
            )
            # Access the needed variable in the data structure
            sample_f = getattr(getattr(faulty, feature), feat_loc).get_data(raw=True)
            sample_h = getattr(getattr(healthy, feature), feat_loc).get_data(raw=True)
            # Note: "healty" (sic) matches the pickle filenames already on disk
            filename_healthy = f"healty_{feature}_{feat_loc}_{version[i]}.pkl"
            filename_faulty = f"faulty_{feature}_{feat_loc}_{version[i]}.pkl"
            with open(os.path.join(input_folder, filename_healthy), "wb") as f:
                pickle.dump(sample_h, f)
            with open(os.path.join(input_folder, filename_faulty), "wb") as f:
                pickle.dump(sample_f, f)
    if rebuild_npy:
        x_train_tot = []
        y_train_tot = []
        idx_train_tot = []
        x_test_tot = []
        y_test_tot = []
        idx_test_tot = []
        x_val_tot = []
        y_val_tot = []
        idx_val_tot = []
        for i, e in enumerate(experiment):
            folder = os.path.join(experiment_folder, e)
            setup, faulty, healthy, ciclo = read_hf(
                folder,
                e,
                preprocessed_sample_rate=1,
                inverter_sample_rate=1,
                resolver=is_resolver[i],
            )
            filename_healthy = f"healty_{feature}_{feat_loc}_{version[i]}.pkl"
            filename_faulty = f"faulty_{feature}_{feat_loc}_{version[i]}.pkl"
            with open(os.path.join(input_folder, filename_healthy), "rb") as f:
                sample_h = pickle.load(f)
            with open(os.path.join(input_folder, filename_faulty), "rb") as f:
                sample_f = pickle.load(f)
            sample_rate = getattr(getattr(faulty, feature), feat_loc).sample_rate
            # Fill in the "sample_rate" placeholders coming from the config file
            procs_params["sample_rate"] = sample_rate
            # Process the raw time signals (care: you are modifying the .values attributes in place)
            sample_h = preprocess_dataset(sample_h, procs_params, process)
            sample_f = preprocess_dataset(sample_f, procs_params, process)
            (
                x_train,
                y_train,
                idx_train,
                x_val,
                y_val,
                idx_val,
                x_test,
                y_test,
                idx_test,
            ) = split_dataset(
                sample_h,
                sample_f,
                split=split[i],
                split_step=split_step,
                class_h=classes_healthy[i],
                class_f=classes_faulty[i],
            )
            if version[i] in train_val_pattern:
                x_train_tot.append(x_train)
                x_val_tot.append(x_val)
                y_train_tot.append(y_train)
                y_val_tot.append(y_val)
                idx_train_tot.append(idx_train)
                idx_val_tot.append(idx_val)
            if version[i] in test_pattern:
                x_test_tot.append(x_test)
                y_test_tot.append(y_test)
                idx_test_tot.append(idx_test)
        x_train_tot = np.concatenate(x_train_tot, axis=0)
        x_val_tot = np.concatenate(x_val_tot, axis=0)
        x_test_tot = np.concatenate(x_test_tot, axis=0)
        y_train_tot = np.concatenate(y_train_tot, axis=0)
        y_val_tot = np.concatenate(y_val_tot, axis=0)
        y_test_tot = np.concatenate(y_test_tot, axis=0)
        idx_train_tot = np.concatenate(idx_train_tot, axis=0)
        idx_val_tot = np.concatenate(idx_val_tot, axis=0)
        idx_test_tot = np.concatenate(idx_test_tot, axis=0)
        np.save(os.path.join(input_folder, config["filenames"]["x_train"]), x_train_tot)
        np.save(os.path.join(input_folder, config["filenames"]["y_train"]), y_train_tot)
        np.save(os.path.join(input_folder, config["filenames"]["idx_train"]), idx_train_tot)
        np.save(os.path.join(input_folder, config["filenames"]["x_val"]), x_val_tot)
        np.save(os.path.join(input_folder, config["filenames"]["y_val"]), y_val_tot)
        np.save(os.path.join(input_folder, config["filenames"]["idx_val"]), idx_val_tot)
        np.save(os.path.join(input_folder, config["filenames"]["x_test"]), x_test_tot)
        np.save(os.path.join(input_folder, config["filenames"]["y_test"]), y_test_tot)
        np.save(os.path.join(input_folder, config["filenames"]["idx_test"]), idx_test_tot)
        # Store the full configuration next to the arrays for reproducibility
        with open(os.path.join(input_folder, config["filenames"]["split_config"]), "w") as f:
            json.dump(config.get_config(), f)
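
# Usage (script and config file names are illustrative):
#   python make_dataset.py --config config.toml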

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Data preparation")
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        required=True,
        help="Config file",
    )
    args = parser.parse_args()
    config = ConfigParser(args.config)
    main(config)
import numpy as np
from tqdm import tqdm

def split_dataset(sample_h, sample_f, split, split_step, class_h, class_f):
    """
    Split the dataset, calling the correct function based on the **split** value.

    Parameters
    ----------
    sample_h: list = list of acquisitions for the healthy axis
    sample_f: list = list of acquisitions for the faulty axis
    split: str = kind of split. If 'standard', the split is performed with
        proportions 0.5 / 0.25 / 0.25 according to time; otherwise a custom
        method is used which takes the first three cycles for train and the
        fourth for validation, keeping for train and validation only the
        steps below **split_step**
    split_step: int = max step id for the nonstandard split
    class_h: class label for the healthy data
    class_f: class label for the faulty data
    """
    if split == "standard":
        return standard_split(sample_h, sample_f, class_h, class_f)
    else:
        return partial_split(sample_h, sample_f, split_step, class_h, class_f)
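
# Dispatch example: split = "standard" uses the time-ordered 0.5/0.25/0.25
# split below; any other value (e.g. "partial") routes to partial_split with
# the configured split_step.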

def standard_split(sample_h, sample_f, class_h, class_f):
    """
    Time-based split for healthy/faulty data; the proportions here are hard-coded to 0.5, 0.25, 0.25.
    """
    healthy_x = []
    healthy_y = []
    healthy_id = []
    # Acquisition dates (currently unused; the input order is assumed chronological)
    healthy_t = [a.date for a in sample_h]
    faulty_x = []
    faulty_y = []
    faulty_id = []
    faulty_t = [a.date for a in sample_f]
    for h in tqdm(sample_h, desc="Splitting the dataset"):
        # class 0 for healthy
        healthy_x.append(h.values)
        healthy_y.append(np.array([class_h] * len(h.values)))
        healthy_id.append([h.id] * len(h.values))
    for f in tqdm(sample_f, desc="Splitting the dataset"):
        # class 1 for faulty
        faulty_x.append(f.values)
        faulty_y.append(np.array([class_f] * len(f.values)))
        faulty_id.append([f.id] * len(f.values))
    lh = len(healthy_x)
    lf = len(faulty_x)
    x_train = np.vstack(healthy_x[0 : lh // 2] + faulty_x[0 : lf // 2])
    y_train = np.hstack(healthy_y[0 : lh // 2] + faulty_y[0 : lf // 2]).astype(int)
    idx_train = np.hstack(healthy_id[0 : lh // 2] + faulty_id[0 : lf // 2])
    x_val = np.vstack(healthy_x[lh // 2 : (3 * lh // 4)] + faulty_x[lf // 2 : (3 * lf // 4)])
    y_val = np.hstack(healthy_y[lh // 2 : (3 * lh // 4)] + faulty_y[lf // 2 : (3 * lf // 4)]).astype(int)
    idx_val = np.hstack(healthy_id[lh // 2 : (3 * lh // 4)] + faulty_id[lf // 2 : (3 * lf // 4)])
    x_test = np.vstack(healthy_x[(3 * lh // 4) :] + faulty_x[(3 * lf // 4) :])
    y_test = np.hstack(healthy_y[(3 * lh // 4) :] + faulty_y[(3 * lf // 4) :]).astype(int)
    idx_test = np.hstack(healthy_id[(3 * lh // 4) :] + faulty_id[(3 * lf // 4) :])
    print(f" Train size: {len(y_train)} Val size: {len(y_val)} Test size: {len(y_test)} ")
    return x_train, y_train, idx_train, x_val, y_val, idx_val, x_test, y_test, idx_test
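
# Worked example: with lh = 8 healthy and lf = 8 faulty acquisitions, train
# takes indices [0:4] of each list, validation [4:6] and test [6:8], i.e.
# the hard-coded 0.5 / 0.25 / 0.25 proportions per class.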

def partial_split(sample_h, sample_f, step, class_h, class_f):
    """
    Here the split follows this rule: the first three cycles (cycle_number < 3) with step < **step** go to train,
    the fourth cycle (cycle_number == 3) with step < **step** goes to validation,
    and everything else goes to test.
    """
    x_train = []
    y_train = []
    idx_train = []
    x_test = []
    y_test = []
    idx_test = []
    x_val = []
    y_val = []
    idx_val = []
    for h in tqdm(sample_h):
        if h.cycle_number < 3 and h.step < step:
            x_train.append(h.values)
            y_train.append(np.array([class_h] * len(h.values)))
            idx_train += [h.id] * len(h.values)
        elif h.cycle_number == 3 and h.step < step:
            x_val.append(h.values)
            y_val.append(np.array([class_h] * len(h.values)))
            idx_val += [h.id] * len(h.values)
        else:
            x_test.append(h.values)
            y_test.append(np.array([class_h] * len(h.values)))
            idx_test += [h.id] * len(h.values)
    for f in tqdm(sample_f):
        if f.cycle_number < 3 and f.step < step:
            x_train.append(f.values)
            y_train.append(np.array([class_f] * len(f.values)))
            idx_train += [f.id] * len(f.values)
        elif f.cycle_number == 3 and f.step < step:
            x_val.append(f.values)
            y_val.append(np.array([class_f] * len(f.values)))
            idx_val += [f.id] * len(f.values)
        else:
            x_test.append(f.values)
            y_test.append(np.array([class_f] * len(f.values)))
            idx_test += [f.id] * len(f.values)
    x_train = np.vstack(x_train)
    y_train = np.hstack(y_train).astype(int)
    x_val = np.vstack(x_val)
    y_val = np.hstack(y_val).astype(int)
    x_test = np.vstack(x_test)
    y_test = np.hstack(y_test).astype(int)
    print(f" Train size: {len(y_train)} Val size: {len(y_val)} Test size: {len(y_test)}")
    return x_train, y_train, idx_train, x_val, y_val, idx_val, x_test, y_test, idx_test