Static test¶
Calibrate the UBC PM senors to the RAMP PM sensor using simple linear regression (MLR)
Load python modules and set inputs¶
import context
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from sklearn import linear_model, feature_selection
from utils.pm import read_ramp
from context import data_dir
file_date = "220515"
ss = ["2022-05-15 14:58:00", "2022-05-15 15:56:00"]
# ss = ['2022-05-17 12:00:00', '2022-05-17 15:01:00']
# ubc_pms = ["%.2d" % i for i in range(1, 7)]
ubc_pms = ["01", "02", "04", "05", "06"]
******************************
context imported. Front of path:
/Users/rodell/Documents/Arduino
/Users/rodell/Documents/Arduino/docs/source
******************************
through /Users/rodell/Documents/Arduino/docs/source/context.py -- pha
read data for the RAMP¶
ramp_pathlist = sorted(Path(str(data_dir) + f"/RAMP/").glob(f"{file_date}*.TXT"))
ramp_df = read_ramp(ramp_pathlist)
hours = pd.Timedelta("0 days 01:07:00")
ramp_df.index = ramp_df.index - hours
ramp_df = ramp_df.sort_index()[ss[0] : ss[1]]
ramp_df = ramp_df.resample("1Min").mean()
read data for the UBC pm sensors¶
ubc_pathlist = np.ravel(
[
sorted(Path(str(data_dir) + f"/UBC-PM-{pm}/").glob(f"20{file_date}*.TXT"))
for pm in ubc_pms
]
)
def read_ubcpm(path):
path = str(path)
print(path)
df = pd.read_csv(path)
del df["rtctime"]
df["test"] = "20" + file_date + "T" + df["millis"]
df["test"] = pd.to_datetime(df["test"])
pm_date_range = pd.Timestamp("20" + file_date + "T" + path[-12:-4])
diff = pm_date_range - df["test"][0]
df["datetime"] = df["test"] + diff
df = df.set_index(pd.DatetimeIndex(df["datetime"]))
df = df.sort_index()[ss[0] : ss[1]]
df = df.resample("1Min").mean()
return df
ubc_dfs = [read_ubcpm(path) for path in ubc_pathlist]
/Users/rodell/Documents/Arduino/data/UBC-PM-01/20220515-14:41:26.TXT
/Users/rodell/Documents/Arduino/data/UBC-PM-02/20220515-14:45:40.TXT
/Users/rodell/Documents/Arduino/data/UBC-PM-04/20220515-14:51:17.TXT
/Users/rodell/Documents/Arduino/data/UBC-PM-05/20220515-14:52:08.TXT
/Users/rodell/Documents/Arduino/data/UBC-PM-06/20220515-14:54:02.TXT
Plot time series¶
Plot timesires of the 1 min rolling average
fig = plt.figure(figsize=(8, 8)) # (Width, height) in inches.
fig.autofmt_xdate()
ax = fig.add_subplot(1, 1, 1)
for i in range(len(ubc_dfs)):
ax.plot(
ubc_dfs[i].index,
ubc_dfs[i]["pm25_env"],
label=f'UBC-PM-{ubc_pms[i]} Mean: {int(ubc_dfs[i]["pm25_env"].mean())}'
+ r" ($\frac{\mu g}{m^3}$)",
)
ax.plot(
ramp_df.index,
ramp_df["pm25"],
lw=2,
label=f'RAMP Mean: {int(ramp_df["pm25"].mean())}'
+ r" ($\frac{\mu g}{m^3}$)",
)
xfmt = DateFormatter("%H:%M")
ax.xaxis.set_major_formatter(xfmt)
ax.set_xlabel("Time (HH:MM)", fontsize=14)
ax.set_ylabel(r"PM 2.5 ($\frac{\mu g}{m^3}$)", fontsize=14)
ax.tick_params(axis="both", which="major", labelsize=13)
ax.xaxis.grid(color="gray", linestyle="dashed")
ax.yaxis.grid(color="gray", linestyle="dashed")
ax.legend(
loc="upper right",
# bbox_to_anchor=(0.48, 1.15),
ncol=1,
fancybox=True,
shadow=True,
)
fig.tight_layout()
Combine PM 2.5 data for all senors (UBCs and RAMP)into one df This dataframe will be use to create linear regression models for every ubc sensor
df = ramp_df.filter(["pm25"], axis=1)
for i in range(len(ubc_pms)):
df[f"pm25_{ubc_pms[i]}"] = ubc_dfs[i]["pm25_env"]
df.head()
for i in range(len(ubc_pms)):
print(np.unique(np.isnan(ubc_dfs[i]["pm25_env"]), return_counts=True))
print(ubc_dfs[i].index[-1])
(array([False]), array([59]))
2022-05-15 15:56:00
(array([False]), array([58]))
2022-05-15 15:55:00
(array([False]), array([59]))
2022-05-15 15:56:00
(array([False]), array([59]))
2022-05-15 15:56:00
(array([False]), array([59]))
2022-05-15 15:56:00
Pearson correlation¶
Solve Pearson correlation prior to linear regression $\( r_{x y}=\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)\left(y_{i}-\bar{y}\right)}{\sqrt{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{2}} \sqrt{\sum_{i=1}^{n}\left(y_{i}-\bar{y}\right)^{2}}} \)$
pm_rs = []
for i in range(len(ubc_pms)):
pm_r = round(df[f"pm25_{ubc_pms[i]}"].corr(df["pm25"]), 4)
pm_rs.append(pm_r)
print(f"Pearson correlation for (ubc_{ubc_pms[i]},ramp) = {pm_r}")
Pearson correlation for (ubc_01,ramp) = 0.9961
Pearson correlation for (ubc_02,ramp) = 0.9963
Pearson correlation for (ubc_04,ramp) = 0.9986
Pearson correlation for (ubc_05,ramp) = 0.9987
Pearson correlation for (ubc_06,ramp) = 0.9982
Scatter plots¶
Make scatter plots of the data points for each pm sensor and plot the linear regression line in the scatter plots.
ny = len(ubc_pms) // 2
nx = len(ubc_pms) - ny
colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
fig = plt.figure(figsize=(nx * 5, ny * 5)) # (Width, height) in inches.
for i in range(len(ubc_pms)):
ax = fig.add_subplot(ny, nx, i + 1)
sns.regplot(x=df[f"pm25_{ubc_pms[i]}"], y=df["pm25"], color=colors[i])
ax.set_title(f"(ubc_{ubc_pms[i]},ramp) $r$ = {pm_rs[i]}")
ax.set_ylabel(r"RAMP PM 2.5 ($\frac{\mu g}{m^3}$)", fontsize=14)
ax.set_xlabel(r"UBC PM 2.5 ($\frac{\mu g}{m^3}$)", fontsize=14)
ax.tick_params(axis="both", which="major", labelsize=13)
ax.xaxis.grid(color="gray", linestyle="dashed")
ax.yaxis.grid(color="gray", linestyle="dashed")
fig.tight_layout()
Linear Regression¶
Normalize data and check it out
# df_norm = (df - df.mean())/df.std()
# df_norm.head()
Target variable: y; predictor variable(s): x
def make_mlr(i):
X = df[f"pm25_{ubc_pms[i]}"].values[:, np.newaxis]
y = df["pm25"].values
lm_MLR = linear_model.LinearRegression()
model = lm_MLR.fit(X, y)
ypred_MLR = lm_MLR.predict(X) # y predicted by MLR
intercept_MLR = lm_MLR.intercept_ # intercept predicted by MLR
coef_MLR = lm_MLR.coef_ # regression coefficients in MLR model
R2_MLR = lm_MLR.score(X, y) # R-squared value from MLR model
print("MLR results:")
print(f"a0 = {intercept_MLR}")
coeff = {"a0": intercept_MLR}
for j in range(len(coef_MLR)):
coeff.update({f"a{j+1}": coef_MLR[j]})
print(f"a{j+1} = {coef_MLR[j]}")
if len(coeff) > 2:
raise ValueError("This is a linear model, code only does single")
else:
pass
ax = fig.add_subplot(ny, nx, i + 1)
ax.set_ylabel(r"RAMP PM 2.5 ($\frac{\mu g}{m^3}$)", fontsize=14)
ax.set_xlabel(r"UBC PM 2.5 ($\frac{\mu g}{m^3}$)", fontsize=14)
ax.tick_params(axis="both", which="major", labelsize=13)
ax.xaxis.grid(color="gray", linestyle="dashed")
ax.yaxis.grid(color="gray", linestyle="dashed")
sns.regplot(x=y, y=ypred_MLR, color=colors[i])
# print(coeff)
# print(coef_MLR)
ax.set_title(
f"UBC-PM-{ubc_pms[i]} MLR "
+ r"$R^{2}$"
+ f"= {round(R2_MLR,4)} \n y = {round(coeff['a0'],4)} + {round(coeff['a1'],4)}x"
)
return
fig = plt.figure(figsize=(nx * 5, ny * 5)) # (Width, height) in inches.
for i in range(len(ubc_pms)):
make_mlr(i)
fig.tight_layout()
MLR results:
a0 = 107.01020933094469
a1 = 0.8255943625009389
MLR results:
a0 = -12.239445253744634
a1 = 0.8413829615623187
MLR results:
a0 = 25.801497218892337
a1 = 1.014699224230121
MLR results:
a0 = 95.76314424350699
a1 = 0.8357832389222485
MLR results:
a0 = 35.80729279946695
a1 = 0.9480784074865625