import urllib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os.path
import matplotlib.dates as mdates
import datetime as dt
import matplotlib as mpl
from pandas.tseries.frequencies import to_offset
from scipy.signal import savgol_filter
1 First Steps — basic time series analysis
Import packages. If you don’t have a certain package, e.g. ‘newpackage’, just type
pip install newpackage
This is how you download data from Thingspeak
= "test_elad.csv"
filename1 # if file is not there, go fetch it from thingspeak
if not os.path.isfile(filename1):
# define what to download
= "1690490"
channels = "1,2,3,4,6,7"
fields = "30"
minutes
# https://www.mathworks.com/help/thingspeak/readdata.html
# format YYYY-MM-DD%20HH:NN:SS
= "2022-05-01%2000:00:00"
start = "2022-05-08%2000:00:00"
end
# download using Thingspeak's API
# url = f"https://api.thingspeak.com/channels/{channels}/fields/{fields}.csv?minutes={minutes}"
= f"https://api.thingspeak.com/channels/{channels}/fields/{fields}.csv?start={start}&end={end}"
url = urllib.request.urlopen(url)
data = data.read()
d
# save data to csv
file = open(filename1, "w")
file.write(d.decode('UTF-8'))
file.close()
You can load the data using Pandas. Here we create a “dataframe”, which is a fancy name for a table.
# load data
= pd.read_csv(filename1)
df # rename columns
= df.rename(columns={"created_at": "timestamp",
df "field1": "T1",
"field2": "RH",
"field3": "T2",
"field4": "motion_sensor",
"field6": "VWC",
"field7": "VPD",})
# set timestamp as index
'timestamp'] = pd.to_datetime(df['timestamp'])
df[= df.set_index('timestamp') df
1.0.1 First graph
# %matplotlib widget
= plt.subplots(1, figsize=(8,6))
fig, ax
'VPD'])
ax.plot(df[# add labels and title
set(xlabel = "time",
ax.= "VPD (kPa)",
ylabel = "my first graph")
title # makes slanted dates
plt.gcf().autofmt_xdate()
1.0.2 Two columns in the same graph
# %matplotlib widget
= plt.subplots(1, figsize=(8,6))
fig, ax
'T1'], color="tab:blue", label="SHT Temperature")
ax.plot(df['T2'], color="tab:orange", label="DS18B20 Temperature")
ax.plot(df[# add labels and title
set(xlabel = "Time",
ax.= "Temperature (deg C)",
ylabel = "two sensors",
title =[20,35],
ylim
)# makes slanted dates
plt.gcf().autofmt_xdate()="upper right") ax.legend(loc
<matplotlib.legend.Legend at 0x7fe6c9730610>
1.0.3 Calculate stuff
You can calculate new things and save them as new columns of your dataframe.
def calculate_es(T):
= np.exp((16.78 * T - 116.9) / (T + 237.3))
es return es
def calculate_ed(es, rh):
return es * rh / 100.0
= calculate_es(df['T1'])
es = calculate_ed(es, df['RH'])
ed 'VPD2'] = es - ed df[
See if what you calculated makes sense.
# %matplotlib widget
= plt.subplots(1, figsize=(8,6))
fig, ax
'VPD'], color="tab:red", label="VPD from ESP32")
ax.plot(df['VPD2'][::100], "o", color="black", label="VPD from python")
ax.plot(df[# add labels and title
set(xlabel = "Time",
ax.= "VPD (kPa)",
ylabel = "VPD calculated twice",
title =[0,5],
ylim
)# makes slanted dates
plt.gcf().autofmt_xdate()="upper right") ax.legend(loc
<matplotlib.legend.Legend at 0x7fe6989ca700>
1.0.4 Two y axes
# %matplotlib widget
= plt.subplots(1, figsize=(8,6))
fig, ax
'VPD'], color="tab:red", label="VPD")
ax.plot(df[
plt.gcf().autofmt_xdate()= ax.twinx()
ax2 'T1'], color="tab:cyan", label="Temperature")
ax2.plot(df[set(xlabel = "Time",
ax.= "two y axes",
title =[0,5],
ylim
)'VPD (kPa)', color='tab:red')
ax.set_ylabel('left'].set_color('red')
ax.spines[
'Temperature (deg C)', color='tab:cyan') ax2.set_ylabel(
Text(0, 0.5, 'Temperature (deg C)')
1.1 NaN, Missing data, Outliers
# %matplotlib widget
= "2022-05-03 12:00:00"
start = "2022-05-06 00:00:00"
end
= plt.subplots(1, figsize=(8,4))
fig, ax
# plot using pandas' plot method
'T2'].plot(ax=ax,
df.loc[start:end, ='-',
linestyle='o',
marker="tab:blue",
color="data")
label
# annotate examples here:
# https://jakevdp.github.io/PythonDataScienceHandbook/04.09-text-and-annotation.html
"NaN", # text to write, if nothing, then ""
ax.annotate(=('2022-05-03 20:30:00', 25), # (x,y coordinates for the tip of the arrow)
xy='data', # xy as 'data' coordinates
xycoords=(-20, 60), # xy coordinates for the text
xytext='offset points', # xytext relative to xy
textcoords=dict(arrowstyle="->") # pretty arrow
arrowprops
)"outlier",
ax.annotate(=('2022-05-03 22:30:00', 85),
xy='data',
xycoords=(40, -20),
xytext='offset points',
textcoords=dict(arrowstyle="->")
arrowprops
)"missing rows",
ax.annotate(=('2022-05-05 00:00:00', 25),
xy='data',
xycoords=(0, 40),
xytext='offset points',
textcoords=dict(arrowstyle="->")
arrowprops
)
'%d %b, %H:00'))
ax.xaxis.set_major_formatter(mdates.DateFormatter(
plt.gcf().autofmt_xdate()set(xlabel="",
ax.="Temperature (deg C)") ylabel
[Text(0.5, 0, ''), Text(0, 0.5, 'Temperature (deg C)')]
The arrows (annotate) work because the plot was
df['column'].plot()
If you use the usual
ax.plot(df['column'])
then you matplotlib will not understand timestamps as x-positions. In this case follow the instructions below.
# %matplotlib widget
= "2022-05-03 12:00:00"
start = "2022-05-06 00:00:00"
end
= plt.subplots(1, figsize=(8,4))
fig, ax
'T2'], linestyle='-', marker='o', color="tab:blue", label="data")
ax.plot(df.loc[start:end,
= '2022-05-03 20:30:00'
t_nan = mdates.date2num(dt.datetime.strptime(t_nan, "%Y-%m-%d %H:%M:%S"))
x_nan "NaN",
ax.annotate(=(x_nan, 25),
xy='data',
xycoords=(-20, 60),
xytext='offset points',
textcoords=dict(arrowstyle="->")
arrowprops
)= '2022-05-03 22:30:00'
t_outlier = mdates.date2num(dt.datetime.strptime(t_outlier, "%Y-%m-%d %H:%M:%S"))
x_outlier "outlier",
ax.annotate(=(x_outlier, 85),
xy='data',
xycoords=(40, -20),
xytext='offset points',
textcoords=dict(arrowstyle="->")
arrowprops
)= '2022-05-05 00:00:00'
t_missing = mdates.date2num(dt.datetime.strptime(t_missing, "%Y-%m-%d %H:%M:%S"))
x_missing "missing rows",
ax.annotate(=(x_missing, 25),
xy='data',
xycoords=(0, 40),
xytext='offset points',
textcoords=dict(arrowstyle="->")
arrowprops
)# code for hours, days, etc
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
'%d %b, %H:00'))
ax.xaxis.set_major_formatter(mdates.DateFormatter(
plt.gcf().autofmt_xdate()set(xlabel="",
ax.="Temperature (deg C)") ylabel
[Text(0.5, 0, ''), Text(0, 0.5, 'Temperature (deg C)')]
# %matplotlib widget
= plt.subplots(1, figsize=(8,4))
fig, ax
= (df.index.to_series().diff() / pd.Timedelta('1 sec') ).values
delta_index
ax.plot(delta_index)set(ylim=[0, 100],
ax.="running index",
xlabel=r"$\Delta t$ (s)",
ylabel="Time difference between consecutive rows") title
[(0.0, 100.0),
Text(0.5, 0, 'running index'),
Text(0, 0.5, '$\\Delta t$ (s)'),
Text(0.5, 1.0, 'Time difference between consecutive rows')]
1.2 Resample
1.2.1 Downsampling
# %matplotlib widget
= plt.subplots(1, figsize=(8,4))
fig, ax
# Downsample to spaced out data points. Change the number below, see what happens.
= '15min'
window_size = (df['T2'].resample(window_size) # resample doesn't do anything yet, just divides data into buckets
df_resampled # this is where stuff happens. you can also choose "sum", "max", etc
.mean()
)# optional, add half a window size to timestamp
= df_resampled.index + to_offset(window_size) / 2
df_resampled.index
'T2'], color="tab:blue", label="original data")
ax.plot(df[='x', color="tab:orange", zorder=-1,
ax.plot(df_resampled, marker=f"resampled {window_size} data")
label
ax.legend()
set(xlabel="time",
ax.="temperature (deg C)") ylabel
[Text(0.5, 0, 'time'), Text(0, 0.5, 'temperature (deg C)')]
1.2.2 Filling missing data
# %matplotlib widget
= plt.subplots(1, figsize=(8,4))
fig, ax
# see options for interpolation methods here:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html
= df_resampled.interpolate(method='time')
df_interpolated1 = df_resampled.interpolate(method='nearest')
df_interpolated2
="tab:orange", label="resampled")
ax.plot(df_resampled, color'.', color="tab:purple", zorder=-1,
ax.plot(df_interpolated1, =f"time-interpolated")
label'.', color="tab:cyan", zorder=-2,
ax.plot(df_interpolated2, =f"nearest-interpolated")
label
ax.legend()
set(xlabel="time",
ax.="temperature (deg C)") ylabel
[Text(0.5, 0, 'time'), Text(0, 0.5, 'temperature (deg C)')]
1.3 Smoothing noisy data
Let’s first download data from a different project.
= "test_peleg.csv"
filename2 # if file is not there, go fetch it from thingspeak
if not os.path.isfile(filename2):
# define what to download
= "1708067"
channels = "1,2,3,4,5"
fields = "30"
minutes
# https://www.mathworks.com/help/thingspeak/readdata.html
# format YYYY-MM-DD%20HH:NN:SS
= "2022-05-15%2000:00:00"
start = "2022-05-25%2000:00:00"
end
# download using Thingspeak's API
# url = f"https://api.thingspeak.com/channels/{channels}/fields/{fields}.csv?minutes={minutes}"
= f"https://api.thingspeak.com/channels/{channels}/fields/{fields}.csv?start={start}&end={end}"
url = urllib.request.urlopen(url)
data = data.read()
d
# save data to csv
file = open(filename2, "w")
file.write(d.decode('UTF-8'))
file.close()
# load data
= pd.read_csv(filename2)
df # rename columns
= df.rename(columns={"created_at": "timestamp",
df "field1": "T",
"field2": "Tw",
"field3": "RH",
"field4": "VPD",
"field5": "dist",
})# set timestamp as index
'timestamp'] = pd.to_datetime(df['timestamp'])
df[= df.set_index('timestamp') df
df
entry_id | T | Tw | RH | VPD | dist | |
---|---|---|---|---|---|---|
timestamp | ||||||
2022-05-18 20:09:31+00:00 | 24716 | 23.85 | 23.3125 | 65.32 | 1.02532 | 7.208 |
2022-05-18 20:10:32+00:00 | 24717 | 23.88 | 23.2500 | 65.32 | 1.02717 | 7.208 |
2022-05-18 20:11:33+00:00 | 24718 | 23.90 | 23.2500 | 65.23 | 1.03107 | 7.276 |
2022-05-18 20:12:33+00:00 | 24719 | 23.90 | 23.2500 | 65.19 | 1.03226 | 7.208 |
2022-05-18 20:13:34+00:00 | 24720 | 23.89 | 23.2500 | 65.15 | 1.03282 | 7.633 |
... | ... | ... | ... | ... | ... | ... |
2022-05-24 12:18:35+00:00 | 32711 | 27.47 | 26.1250 | 47.49 | 1.92397 | 8.925 |
2022-05-24 12:19:36+00:00 | 32712 | 27.47 | 26.1250 | 47.62 | 1.91921 | 8.925 |
2022-05-24 12:20:39+00:00 | 32713 | 27.47 | 26.1250 | 47.96 | 1.90675 | 8.925 |
2022-05-24 12:21:40+00:00 | 32714 | 27.47 | 26.1875 | 47.75 | 1.91444 | 8.925 |
2022-05-24 12:22:41+00:00 | 32715 | 27.49 | 26.1875 | 47.94 | 1.90971 | 8.925 |
8000 rows × 6 columns
1.4 Smoothing noisy data
# %matplotlib widget
= plt.subplots(1, figsize=(8,4))
fig, ax
'RH'], '.')
ax.plot(df[# add labels and title
set(xlabel = "time",
ax.= "RH (%)",
ylabel = "Relative Humidity")
title # makes slanted dates
plt.gcf().autofmt_xdate()
1.4.1 Moving average and SavGol
# %matplotlib widget
= plt.subplots(1, figsize=(8,4))
fig, ax
# apply a rolling average of size "window_size",
# it can be either by number of points, or by window time
# window_size = 30 # number of measurements
= '120min' # minutes
window_size = df['RH'].rolling(window_size, center=True).mean().to_frame()
RH_smooth ={'RH': 'rolling_avg'}, inplace=True)
RH_smooth.rename(columns
'SG'] = savgol_filter(df['RH'], window_length=121, polyorder=2)
RH_smooth[
'RH'], color="tab:blue", label="data")
ax.plot(df['rolling_avg'], color="tab:orange", label="moving average")
ax.plot(RH_smooth['SG'], color="tab:red", label="Savitzky-Golay filter")
ax.plot(RH_smooth[# add labels and title
set(xlabel = "time",
ax.= "RH (%)",
ylabel = "Relative Humidity")
title # makes slanted dates
plt.gcf().autofmt_xdate() ax.legend()
<matplotlib.legend.Legend at 0x7fe6a0525730>
How does moving average work?
How does the Savitzky–Golay filter work?