Commit 815c99c8 authored by Pietro Tosato's avatar Pietro Tosato
Browse files

fix file name

parent ad18d95f
......@@ -32,7 +32,7 @@
"metadata": {},
"outputs": [],
"source": [
"# in case of colab:\n",
"# in case of colab, upload the file you need:\n",
"from google.colab import files\n",
"files.upload()"
]
......@@ -194,7 +194,7 @@
"outputs": [],
"source": [
"# example of read of a single excel sheet\n",
"df = pd.read_excel(\"Ternaria_Dati.xlsx\", sheet_name=\"TR_Air_LeGrazie\", header=2)\n",
"df = pd.read_excel(\"ARPA_Data.xlsx\", sheet_name=\"TR_Air_LeGrazie\", header=2)\n",
"df.drop(0, inplace=True)\n",
"df.rename(columns={\"Parm\": \"ts\"}, inplace=True)\n",
"df['ts'] = pd.to_datetime( df['ts'] )\n"
......@@ -216,7 +216,7 @@
"source": [
"db = {}\n",
"for centralina, label in arpa.items():\n",
" df = pd.read_excel(\"Ternaria_Dati.xlsx\", sheet_name=label, header=2)\n",
" df = pd.read_excel(\"ARPA_Data.xlsx\", sheet_name=label, header=2)\n",
" df.drop(0, inplace=True)\n",
" df.rename(columns={\"Parm\": \"ts\"}, inplace=True)\n",
" df['ts'] = pd.to_datetime( df['ts'] )\n",
......@@ -747,7 +747,7 @@
}
],
"source": [
"#raw data are like that:\n",
"# raw data are like that:\n",
"plt.scatter(data_all['NOX'], data_all['S3_R1'])\n",
"plt.ylabel(\"Sensor Resistance S3 [Ohm]\")\n",
"plt.xlabel(\"NOX [ppb]\")\n",
......
%% Cell type:markdown id: tags:
 
# Data analysis for Ternaria
 
This notebook downloads and saves the data.
Then a simple linear regression is used to fit the sensor data to the ARPA pollution data.
 
%% Cell type:markdown id: tags:
 
## Initialization
 
Import some libraries to work with data: _pandas_, _numpy_, and _bokeh_ and _matplotlib_ for visualization.
Info can be found here:
- [matplotlib user guide](https://matplotlib.org/stable/users/index.html)
- [bokeh documentation](https://docs.bokeh.org/en/latest/)
- [pandas user guide](https://pandas.pydata.org/docs/user_guide/index.html)
- [numpy user guide](https://numpy.org/doc/stable/user/index.html)
 
_TB_ is a custom module with some functions for downloading data from Thingsboard.
 
%% Cell type:code id: tags:
 
``` python
# If running on Google Colab, upload the data file you need first.
# (Removed a stale duplicate of this comment left over from an earlier edit.)
from google.colab import files
files.upload()
```
 
%% Cell type:code id: tags:
 
``` python
# Data handling
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
# Custom module with helpers for downloading data from Thingsboard
import TB as TB

# Static plotting
import matplotlib.pyplot as plt

# Interactive plotting (bokeh)
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row, gridplot
from bokeh.models import Step, Span, HoverTool, PointDrawTool, FreehandDrawTool, ColumnDataSource, DatetimeTickFormatter, BoxAnnotation #DaysTicker, FuncTickFormatter
from bokeh.plotting import figure, show, output_file, save
from bokeh.models.widgets import CheckboxGroup

# Render bokeh plots inline in the notebook
output_notebook()
```
 
%%%% Output: display_data
 
 
%%%% Output: display_data
 
 
%% Cell type:code id: tags:
 
``` python
def plot(title: str, data: pd.DataFrame, data2plot: list, ylog=False) -> figure:
    """
    Helper function to easily plot a dataframe with bokeh.

    Args:
        title (str): Title of the plot
        data (pd.DataFrame): DataFrame as data source; must contain a 'ts'
            datetime column used as the x axis.
        data2plot (list): list of dataframe columns to plot
        ylog (bool, optional): use logarithmic y axis. Defaults to False.

    Returns:
        figure: figure object to be shown
    """

    TOOLS = "pan,crosshair,wheel_zoom,box_zoom,reset,save,box_select"
    p1 = figure(title=title, tools=TOOLS, x_axis_type='datetime', plot_width=1050, plot_height=600, y_axis_type="log" if ylog else "linear")

    source = ColumnDataSource(data)
    # Fixed palette; cycled via modulo so requesting more than 8 series no
    # longer raises IndexError (previously colour[data2plot.index(entry)]).
    colour = ["#ff0000", "#ffa600", "#00ff0d", "#eeff00", "#1100ff", "#00e1ff", "#ff00ff", "#000000"]

    # minor y grid
    p1.ygrid.minor_grid_line_alpha = 0.1
    p1.ygrid.minor_grid_line_color = 'navy'

    # compact datetime labels on the x axis ticks
    p1.xaxis.formatter = DatetimeTickFormatter(days="%m/%d", hours="%H", minutes="%H:%M")

    # Plot each requested column against the shared 'ts' time column.
    # enumerate avoids the O(n) list.index() lookup of the original loop and
    # picks the correct colour even for duplicate column names.
    for idx, entry in enumerate(data2plot):
        p1.line('ts', entry, source=source, legend_label=entry, line_alpha=0.8, line_width=2.5, color=colour[idx % len(colour)])

    # custom hover tool: show timestamp and value under the cursor
    p1.add_tools(HoverTool(
        tooltips=[
            ('date', '$x{%Y-%m-%d %H:%M}'),
            ('value', '$y'),
        ],
        formatters={
            '$x': 'datetime',  # use 'datetime' formatter for the x field
        },
        # display a tooltip whenever the cursor is vertically in line with a glyph
        mode='mouse',
    ))

    # Separate empty data source backing the freehand-draw annotation tool
    # (renamed from 'source' so the plot data source is not shadowed).
    draw_source = ColumnDataSource({
        'x': [], 'y': []
    })

    renderer2 = p1.multi_line('x', 'y', source=draw_source, line_color="#4073ff", line_alpha=0.6, line_dash='dashed', line_width=2.5)
    p1.add_tools(FreehandDrawTool(renderers=[renderer2]))

    # clicking a legend entry hides/shows the corresponding line
    p1.legend.click_policy = "hide"

    return p1
```
 
%% Cell type:markdown id: tags:
 
## Import and clean up data
 
%% Cell type:markdown id: tags:
 
The data must be retrieved from both Thingsboard (TB) and ARPA (as Excel data), then imported into Python using pandas to be further processed.
 
%% Cell type:code id: tags:
 
``` python
# Station names and the matching Excel sheet labels (kept in the same order).
centraline = [
    "Grazie",
    "Carrara",
    "BorgoRivo",
    "Prisciano",
    "Maratta",
]
labels = [
    "TR_Air_LeGrazie",
    "TR_Air_Carrara",
    "TR_Air_Borgorivo",
    "TR_Air_Prisciano",
    "TR_Air_Maratta",
]
```
 
%% Cell type:code id: tags:
 
``` python
# Map each station name to its Excel sheet label (a dict!).
# dict(zip(...)) is the idiomatic form of the index-based comprehension.
arpa = dict(zip(centraline, labels))
```
 
%% Cell type:code id: tags:
 
``` python
# Example: read a single Excel sheet into a DataFrame.
# (Removed the stale duplicate read of "Ternaria_Dati.xlsx" left over from the
# file rename -- it only wasted one full read of the workbook.)
df = pd.read_excel("ARPA_Data.xlsx", sheet_name="TR_Air_LeGrazie", header=2)
df.drop(0, inplace=True)  # drop the first row below the header -- presumably units, TODO confirm
df.rename(columns={"Parm": "ts"}, inplace=True)  # the "Parm" column holds the timestamps
df['ts'] = pd.to_datetime( df['ts'] )
```
 
%% Cell type:markdown id: tags:
 
Import data from Excel.
The file must be in the same directory; if running on Colab, upload it first.
 
%% Cell type:code id: tags:
 
``` python
# Read every station sheet from the ARPA Excel file into a dict of DataFrames,
# keyed by station name.
# (Removed the stale duplicate read of "Ternaria_Dati.xlsx" inside the loop,
# left over from the file rename -- it doubled the I/O per sheet.)
db = {}
for centralina, label in arpa.items():
    df = pd.read_excel("ARPA_Data.xlsx", sheet_name=label, header=2)
    df.drop(0, inplace=True)
    df.rename(columns={"Parm": "ts"}, inplace=True)
    df['ts'] = pd.to_datetime( df['ts'] )
    # convert every non-timestamp column to numeric dtype
    cols = df.columns.drop('ts')
    df[cols] = df[cols].apply(pd.to_numeric) # astype would work as well
    db[centralina] = df
```
 
%% Cell type:markdown id: tags:
 
Plot some random data to check everything is ok
 
%% Cell type:code id: tags:
 
``` python
# Placeholder figure for a data sanity-check plot (content still to be done).
fig = plt.figure(figsize=(12,6))
# TBD
```
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
## Data preparation
Starting from all the data, select something to work on, i.e. a place (in this case Borgo Rivo) and a period (30-Jan to 9-Feb).
 
%% Cell type:code id: tags:
 
``` python
# from now on, work on data_arpa as selected here
# (the Borgo Rivo station, one of the keys of db)
data_arpa = db['BorgoRivo']
```
 
%% Cell type:code id: tags:
 
``` python
# inspect data like this:
# NOTE: `data_arpa?` is IPython help syntax, not plain Python --
# this cell only runs inside a notebook/IPython shell.
data_arpa?
data_arpa.info()
```
 
%%%% Output: stream
 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1280 entries, 1 to 1280
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ts 1280 non-null datetime64[ns]
1 NOX 1280 non-null float64
2 NO 1280 non-null float64
3 NO2 1280 non-null float64
4 O3 1280 non-null float64
dtypes: datetime64[ns](1), float64(4)
memory usage: 60.0 KB
 
%%%% Output: stream
 
Type: DataFrame
String form:
ts NOX NO NO2 O3
1 2021-12-14 00:00:00 59 <...> 257
1280 2022-02-09 08:00:00 66.095161 26.390261 25.631506 11.775866
[1280 rows x 5 columns]
Length: 1280
File: c:\users\ptosato\anaconda3\envs\pydev\lib\site-packages\pandas\core\frame.py
Docstring:
Two-dimensional, size-mutable, potentially heterogeneous tabular data.
Data structure also contains labeled axes (rows and columns).
Arithmetic operations align on both row and column labels. Can be
thought of as a dict-like container for Series objects. The primary
pandas data structure.
Parameters
----------
data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
Dict can contain Series, arrays, constants, dataclass or list-like objects. If
data is a dict, column order follows insertion-order.
.. versionchanged:: 0.25.0
If data is a list of dicts, column order follows insertion-order.
index : Index or array-like
Index to use for resulting frame. Will default to RangeIndex if
no indexing information part of input data and no index provided.
columns : Index or array-like
Column labels to use for resulting frame when data does not have them,
defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
will perform column selection instead.
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer.
copy : bool or None, default None
Copy data from inputs.
For dict data, the default of None behaves like ``copy=True``. For DataFrame
or 2d ndarray input, the default of None behaves like ``copy=False``.
.. versionchanged:: 1.3.0
See Also
--------
DataFrame.from_records : Constructor from tuples, also record arrays.
DataFrame.from_dict : From dicts of Series, arrays, or dicts.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_table : Read general delimited file into DataFrame.
read_clipboard : Read text from clipboard into DataFrame.
Examples
--------
Constructing DataFrame from a dictionary.
>>> d = {'col1': [1, 2], 'col2': [3, 4]}
>>> df = pd.DataFrame(data=d)
>>> df
col1 col2
0 1 3
1 2 4
Notice that the inferred dtype is int64.
>>> df.dtypes
col1 int64
col2 int64
dtype: object
To enforce a single dtype:
>>> df = pd.DataFrame(data=d, dtype=np.int8)
>>> df.dtypes
col1 int8
col2 int8
dtype: object
Constructing DataFrame from numpy ndarray:
>>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
... columns=['a', 'b', 'c'])
>>> df2
a b c
0 1 2 3
1 4 5 6
2 7 8 9
Constructing DataFrame from a numpy ndarray that has labeled columns:
>>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
>>> df3 = pd.DataFrame(data, columns=['c', 'a'])
...
>>> df3
c a
0 3 1
1 6 4
2 9 7
Constructing DataFrame from dataclass:
>>> from dataclasses import make_dataclass
>>> Point = make_dataclass("Point", [("x", int), ("y", int)])
>>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
x y
0 0 0
1 0 3
2 2 3
 
%% Cell type:code id: tags:
 
``` python
# Time window of interest for the analysis.
timestart = datetime(2022, 1, 30)
timeend = datetime(2022, 2, 9)

# Window length, reported in whole days.
delta = timeend - timestart
print("number of days: " + str(delta.days))
```
 
%%%% Output: stream
 
number of days: 10
 
%% Cell type:code id: tags:
 
``` python
# Slice the ARPA data to the selected time window.
# Reuses timestart/timeend defined above instead of repeating the literal
# dates (the hardcoded copies could silently drift out of sync).
data_arpa_slice = data_arpa[(data_arpa.ts >= timestart) & (data_arpa.ts <= timeend)]
```
 
%% Cell type:markdown id: tags:
 
Log in to Thingsboard; the credentials must be stored in a file like:
 
```python
user = "something"
pwd = "password"
```
 
Then import this file and log in with those credentials.
 
%% Cell type:code id: tags:
 
``` python
# Log in to Thingsboard; credentials live in a local credentials.py
# (user/pwd), kept out of the notebook.
import credentials as me
TB.login(me.user, me.pwd)
```
 
%% Cell type:markdown id: tags:
 
Look at TB python module to understand how the query to thingsboard is performed.
 
In this example we use Ternaria04, its id is `2657fb00-5372-11ec-bc84-d56db5e6a8e3` (you can find it on the TB website).
The available keys can be retrieved using `TB.get_dev_keys` function, in this example we need the following:
- sensor resistance (Sx_R1)
- sensor heater resistance (Sx_R2)
- temperature and humidity
 
%% Cell type:code id: tags:
 
``` python
# Query parameters for the Thingsboard download.
agg = 60*60  # aggregation window in seconds: 60-minute averages
devname = "Ternaria04"
devid = "2657fb00-5372-11ec-bc84-d56db5e6a8e3"
# Sensor resistance (Sx_R1) and heater resistance (Sx_R2) for sensors 1-8,
# plus temperature and relative humidity.
keys = [f"S{i}_R{j}" for i in range(1, 9) for j in (1, 2)] + ["T", "RH"]
```
 
%% Cell type:markdown id: tags:
 
Loop over the selected period and save the data. The following code saves it as a CSV file.
 
%% Cell type:code id: tags:
 
``` python
# Download the device telemetry day by day, concatenate into one DataFrame
# and save it to CSV.
# NOTE(review): range(1, delta.days) starts at day offset 1, so the first day
# of the selected window (30 Jan) is never downloaded -- the printed output
# confirms the loop starts at 31 Jan. Confirm whether this is intended.
tmpd = []
for i in range(1,delta.days):
    # slice time period to download day by day (epoch seconds for the TB API)
    looptimestart = int((timestart + timedelta(days=i)).timestamp())
    looptimeend = int((timestart + timedelta(days=i+1)).timestamp())

    print("process data from " + str(datetime.fromtimestamp(looptimestart)) + " to " + str(datetime.fromtimestamp(looptimeend)))
    # download data from TB and store to tmp variable
    tmp = TB.get_ts_data( start_day=looptimestart, end_day=looptimeend, device_id=devid, aggregation_interval=agg, keys=keys )
    if ( len(tmp)>0 ): # check if data is ok and append to array
        tmpd.append( TB.flatten_json(tmp, keys) )

# concat every dataframe created
data = pd.concat( tmpd, ignore_index=True )

# save data to csv file, just in case...
filepath = Path('data_export/'+devname+".csv")
filepath.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(filepath)
```
 
%%%% Output: stream
 
process data from 2022-01-31 00:00:00 to 2022-02-01 00:00:00
 
%%%% Output: stream
 
c:\Users\ptosato\Code\GasSensor\Ternaria\Ternaria-data-analysis\TB.py:171: FutureWarning: Passing 'suffixes' which cause duplicate columns {'value_x'} in the result is deprecated and will raise a MergeError in a future version.
data = pd.merge(data, temp, on='ts', how='right')
 
%%%% Output: stream
 
process data from 2022-02-01 00:00:00 to 2022-02-02 00:00:00
process data from 2022-02-02 00:00:00 to 2022-02-03 00:00:00
process data from 2022-02-03 00:00:00 to 2022-02-04 00:00:00
process data from 2022-02-04 00:00:00 to 2022-02-05 00:00:00
process data from 2022-02-05 00:00:00 to 2022-02-06 00:00:00
process data from 2022-02-06 00:00:00 to 2022-02-07 00:00:00
process data from 2022-02-07 00:00:00 to 2022-02-08 00:00:00
process data from 2022-02-08 00:00:00 to 2022-02-09 00:00:00
 
%% Cell type:code id: tags:
 
``` python
# if you want do download from colab, here is the command
# files.download(filename="data_export/Ternaria04.csv")
```
 
%% Cell type:code id: tags:
 
``` python
# align data - TB offset, 30 min + UTC offset
# NOTE(review): shifts Thingsboard timestamps by +1h30m, presumably UTC+1 plus
# a 30-minute aggregation-window offset, so they line up with the ARPA
# timestamps before merging -- confirm the offsets.
data['ts'] = data['ts'] + timedelta(hours=1, minutes=30)
```
 
%% Cell type:markdown id: tags:
 
Plot some data to check everything is ok
 
%% Cell type:code id: tags:
 
``` python
# plot data from TB: sensor resistances S5 and S6, log y axis
show( plot("TB data", data, ["S5_R1", "S6_R1"], ylog=True) )
```
 
%%%% Output: display_data
 
 
%%%% Output: display_data
 
 
%% Cell type:code id: tags:
 
``` python
# plot data from ARPA: NOX and O3 concentrations for the selected window
show( plot("ARPA data", data_arpa_slice, ["NOX","O3"]) )
```
 
%%%% Output: display_data
 
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
Merge the two datasets
 
%% Cell type:code id: tags:
 
``` python
data_all = pd.merge(data_arpa_slice, data)
data_all.info()
```
 
%%%% Output: stream
 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 0 to 207
Data columns (total 23 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ts 208 non-null datetime64[ns]
1 NOX 208 non-null float64
2 NO 208 non-null float64
3 NO2 208 non-null float64
4 O3 208 non-null float64
5 S1_R1 208 non-null float64
6 S1_R2 208 non-null float64
7 S2_R1 208 non-null float64
8 S2_R2 208 non-null float64
9 S3_R1 208 non-null float64
10 S3_R2 208 non-null float64
11 S4_R1 208 non-null float64
12 S4_R2 208 non-null float64
13 S5_R1 208 non-null float64
14 S5_R2 208 non-null float64
15 S6_R1 208 non-null float64
16 S6_R2 208 non-null float64
17 S7_R1 208 non-null float64
18 S7_R2 208 non-null float64
19 S8_R1 208 non-null float64
20 S8_R2 208 non-null float64
21 T 208 non-null float64
22 RH 208 non-null float64
dtypes: datetime64[ns](1), float64(22)
memory usage: 39.0 KB
 
%% Cell type:markdown id: tags:
 
## Single variable linear regression
 
Given an independent variable $x$ and a dependent one $y$, one can describe their relation as a linear function:
$$y = mx + q$$
Moreover, given a number of observed values $x_i$ for $i=1,...,n$, it is possible to find a function $f(x) = mx + q$ that minimizes the error between $f(x_i)$ and $y_i$ and therefore predicts the value of $y_i$ given the input $x_i$.
 
%% Cell type:code id: tags:
 
``` python
# Raw data look like this: sensor resistance vs ARPA NOX concentration.
# (Removed a stale duplicate of this comment left over from an earlier edit.)
plt.scatter(data_all['NOX'], data_all['S3_R1'])
plt.ylabel("Sensor Resistance S3 [Ohm]")
plt.xlabel("NOX [ppb]")
plt.grid()
# plt.yscale('log')
```
 
%%%% Output: display_data
 
 
%% Cell type:code id: tags:
 
``` python
from sklearn.linear_model import LinearRegression

# Input (S3 sensor resistance) and target (ARPA NOX) as column vectors.
x = data_all['S3_R1'].values.reshape(-1, 1)
y = data_all['NOX'].values.reshape(-1, 1)

# Fit an ordinary least-squares line y = m*x + q, then predict back on the
# same inputs to visualise the fit.
model = LinearRegression(fit_intercept=True)
model.fit(x, y)
y_pred = model.predict(x)

# measured values (blue) vs fitted line (red)
plt.scatter(x, y)
plt.scatter(x, y_pred, color='red')

# Report the fitted coefficients and the coefficient of determination.
r_sq = model.score(x, y)
m_param = model.coef_[0]
q_param = model.intercept_
print('function is y = {:f} x + {:f}'.format(m_param[0], q_param[0]))

print('R2: {:.3f}'.format(r_sq))
```
 
%%%% Output: stream
 
function is y = 0.000239 x + -166.142296
R2: 0.512
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
## Multiple variable regression model
 
Extending the same concept using arrays like that:
$\textbf{x}=(x_1,\dots,x_r)$ with $r$ the number of independent variables, assuming a linear relation:
$$y = b_0 + b_1 x_1 + ... + b_r x_r$$
The least-squares minimization is the same as before.
 
In our case, we can use the information provided by temperature, humidity and sensor heater.
First example: NOX vs S3, like before
 
%% Cell type:code id: tags:
 
``` python
x = data_all[['S3_R1','S3_R2','T','RH']].values.reshape(-1, 4)
y = data_all['NOX'].values.reshape(-1, 1)
 
model = LinearRegression(fit_intercept=True)
model.fit(x, y)
Y_pred = model.predict(x)
 
plt.scatter(x[:,0], y)
plt.scatter(x[:,0], Y_pred, color='red')
 
r_sq = model.score(x, y)
print('R2: {:.3f}'.format(r_sq))
```
 
%%%% Output: stream
 
R2: 0.586
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
Try another model (Ridge regression).
 
%% Cell type:code id: tags:
 
``` python
from sklearn.linear_model import Ridge

# Same features as before: S3 resistance/heater plus temperature and humidity.
x = data_all[['S3_R1','S3_R2','T','RH']].values.reshape(-1, 4)
y = data_all['NOX'].values.reshape(-1, 1)

# L2-regularised linear model; alpha sets the regularisation strength.
model = Ridge(fit_intercept=True, alpha=0.4)
model.fit(x, y)
Y_pred = model.predict(x)

# measured (blue) vs fitted (red), plotted against the S3 resistance
plt.scatter(x[:,0], y)
plt.scatter(x[:,0], Y_pred, color='red')
# plt.plot((y-Y_pred)/y*100)

r_sq = model.score(x, y)
print('R2: {:.3f}'.format(r_sq))
```
 
%%%% Output: stream
 
R2: 0.586
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
Try O3 vs S8
 
%% Cell type:code id: tags:
 
``` python
x = data_all[['S8_R1','S8_R2','T','RH']].values.reshape(-1, 4)
y = data_all['O3'].values.reshape(-1, 1)
 
model = LinearRegression(fit_intercept=True)
model.fit(x, y)
 
Y_pred = model.predict(x)
 
plt.scatter(x[:,0], y)
plt.scatter(x[:,0], Y_pred, color='red')
 
r_sq = model.score(x, y)
print('R2: {:.3f}'.format(r_sq))
```
 
%%%% Output: stream
 
R2: 0.802
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
Try O3 vs S5
 
%% Cell type:code id: tags:
 
``` python
x = data_all[['S5_R1','S5_R2','T','RH']].values.reshape(-1, 4)
y = data_all['O3'].values.reshape(-1, 1)
 
model = LinearRegression(fit_intercept=True)
model.fit(x, y)
 
Y_pred = model.predict(x)
 
plt.scatter(x[:,0], y)
plt.scatter(x[:,0], Y_pred, color='red')
 
r_sq = model.score(x, y)
print('R2: {:.3f}'.format(r_sq))
```
 
%%%% Output: stream
 
R2: 0.857
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
Plot vs time
 
%% Cell type:code id: tags:
 
``` python
plt.plot(Y_pred)
plt.plot(y)
plt.legend(["fit", "original"])
```
 
%%%% Output: execute_result
 
<matplotlib.legend.Legend at 0x190d50b9c90>
 
%%%% Output: display_data
 
 
%% Cell type:markdown id: tags:
 
Some random tests, S8 vs O3
 
%% Cell type:code id: tags:
 
``` python
x = data_all[['S8_R1','S8_R2','T','RH']].values.reshape(-1, 4)
y = data_all['O3'].values.reshape(-1, 1)
 
model = LinearRegression(fit_intercept=True)
model.fit(x, y)
 
Y_pred = model.predict(x)
 
plt.scatter(x[:,0], y)
plt.scatter(x[:,0], Y_pred, color='red')
 
r_sq = model.score(x, y)
print('R2: {:.3f}'.format(r_sq))
```
 
%%%% Output: stream
 
R2: 0.802
 
%%%% Output: display_data