Predicting Concentrations of PM2.5
import openaq
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from aqtools import aqutils as u
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, GRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import SGD
from noaa_sdk import noaa
api = openaq.OpenAQ()
Version 1
# Parameter setting
date_from = '2021-09-01T00:00:00Z'  # the API interprets these timestamps as PST by default
date_to = '2022-03-01T00:00:00Z'
city = 'San Francisco-Oakland-Fremont'
location = 'Oakland'
data_query_limit = 4000
"""
Important! You may get 'ApiError: A bad request was made: 500'.
It's not because of our code but of API server issue.
Please take a moment, and retry executing the cell.
"""
# co
pollutant = 'co'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_co = u.date_pollutant_value(r, pollutant)
df_co.head(3)
| | date | co |
| --- | --- | --- |
| 3600 | 2021-08-31 17:00:00 | 0.3 |
| 3599 | 2021-08-31 18:00:00 | 0.3 |
| 3598 | 2021-08-31 19:00:00 | 0.3 |
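u.date_pollutant_value comes from the companion aqtools package, which is not shown here. A minimal sketch of its assumed behavior, based on the OpenAQ measurements schema (each result carries a 'date' dict with a 'local' timestamp, plus a 'value'):

# Sketch of the assumed behavior of aqtools.aqutils.date_pollutant_value:
# pull (date, value) pairs out of the raw results, in chronological order.
def date_pollutant_value_sketch(results, pollutant):
    raw = pd.DataFrame(results)
    out = pd.DataFrame({
        'date': raw['date'].apply(lambda d: d['local'][:19].replace('T', ' ')),
        pollutant: raw['value'],
    })
    return out.iloc[::-1]  # the API returns newest first; reverse it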
# no2
pollutant = 'no2'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_no2 = u.date_pollutant_value(r, pollutant)
df_no2.head(3)
| | date | no2 |
| --- | --- | --- |
| 3599 | 2021-08-31 17:00:00 | 0.002 |
| 3598 | 2021-08-31 18:00:00 | 0.003 |
| 3597 | 2021-08-31 19:00:00 | 0.004 |
# o3
pollutant = 'o3'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_o3 = u.date_pollutant_value(r, pollutant)
df_o3.head(3)
| | date | o3 |
| --- | --- | --- |
| 3600 | 2021-08-31 17:00:00 | 0.031 |
| 3599 | 2021-08-31 18:00:00 | 0.031 |
| 3598 | 2021-08-31 19:00:00 | 0.030 |
# pm25
pollutant = 'pm25'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_pm25 = u.date_pollutant_value(r, pollutant)
df_pm25.head(3)
| | date | pm25 |
| --- | --- | --- |
| 3751 | 2021-08-31 17:00:00 | 11 |
| 3750 | 2021-08-31 18:00:00 | 12 |
| 3749 | 2021-08-31 19:00:00 | 15 |
# Merge the dataframes on 'date' (inner join: keep only timestamps present in all four series)
df = df_co.merge(df_no2, how='inner', on='date')
df = df.merge(df_o3, how='inner', on='date')
df = df.merge(df_pm25, how='inner', on='date')
df = df.set_index(['date'])
df
| date | co | no2 | o3 | pm25 |
| --- | --- | --- | --- | --- |
| 2021-08-31 17:00:00 | 0.30 | 0.002 | 0.031 | 11 |
| 2021-08-31 18:00:00 | 0.30 | 0.003 | 0.031 | 12 |
| 2021-08-31 19:00:00 | 0.30 | 0.004 | 0.030 | 15 |
| 2021-08-31 20:00:00 | 0.36 | 0.006 | 0.029 | 14 |
| 2021-08-31 21:00:00 | 0.37 | 0.006 | 0.028 | 12 |
| ... | ... | ... | ... | ... |
| 2022-02-28 12:00:00 | 0.47 | 0.025 | 0.023 | 15 |
| 2022-02-28 13:00:00 | 0.46 | 0.025 | 0.026 | 20 |
| 2022-02-28 14:00:00 | 0.38 | 0.016 | 0.038 | 13 |
| 2022-02-28 15:00:00 | 0.32 | 0.012 | 0.043 | 9 |
| 2022-02-28 16:00:00 | 0.30 | 0.011 | 0.040 | 6 |

3526 rows × 4 columns
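The inner join keeps only timestamps present in all four series, so an hour missing from any one sensor drops out entirely. A quick diagnostic, not in the original notebook, for how many hours the join lost:

# Count hourly timestamps missing from the merged frame over the full span
full_range = pd.date_range(df.index.min(), df.index.max(), freq='H')
print(len(full_range) - len(df), 'hours missing from the merged frame')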
# Robust scaling: (x - median) / IQR for each pollutant feature.
# pm25 is the prediction target and is left in raw units.
for col in ['co', 'no2', 'o3']:
    scaler = RobustScaler()
    df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))
df.plot(subplots=True, title='Robust Scaler')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/scaling_ver1.png")
plt.show()
[Figure: robust-scaled co, no2, o3 and raw pm25 series (./figures/scaling_ver1.png)]
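For reference, RobustScaler centers each series on its median and divides by the interquartile range, so a few pollution spikes do not dominate the scale the way they would under min-max scaling. A self-contained check of that identity on synthetic data:

# RobustScaler computes (x - median) / (Q3 - Q1); verify the identity
rng = np.random.default_rng(0)
x = rng.normal(size=100)
q1, med, q3 = np.percentile(x, [25, 50, 75])
manual = (x - med) / (q3 - q1)
print(np.allclose(manual, RobustScaler().fit_transform(x.reshape(-1, 1)).ravel()))  # True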
# Make the feature series stationary via first-order differencing
co_diff = u.differencing(df['co'].values)
no2_diff = u.differencing(df['no2'].values)
o3_diff = u.differencing(df['o3'].values)
# Drop the last row so the frame matches the length of the differenced series
df = df.iloc[:-1, :]
df['co'] = co_diff
df['no2'] = no2_diff
df['o3'] = o3_diff
df.plot(subplots=True, title='Differencing')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/differencing_ver1.png")
plt.show()
(pandas emitted a SettingWithCopyWarning for each of the three column assignments; the values are still written, and taking an explicit copy, df = df.iloc[:-1, :].copy(), would silence it.)
[Figure: differenced co, no2, o3 and raw pm25 series (./figures/differencing_ver1.png)]
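u.differencing is another aqtools helper. Assuming it implements plain first-order differencing, it is equivalent to numpy's diff, which also explains why the frame has to shrink by one row:

# Assumed behavior of aqtools' differencing: d[t] = x[t+1] - x[t],
# yielding len(x) - 1 values (hence the dropped row above).
def differencing_sketch(x):
    return np.diff(x)  # same as x[1:] - x[:-1]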
# correlation among features
df_corr = df.corr()
df_corr.to_csv('./tables/correlation_ver1.csv')
df_corr
| | co | no2 | o3 | pm25 |
| --- | --- | --- | --- | --- |
| co | 1.000000 | 0.673827 | -0.623135 | -0.118771 |
| no2 | 0.673827 | 1.000000 | -0.770821 | -0.113108 |
| o3 | -0.623135 | -0.770821 | 1.000000 | 0.080400 |
| pm25 | -0.118771 | -0.113108 | 0.080400 | 1.000000 |
# feature vectors: shape (num samples, window size, num features)
feature_np = df[['co', 'no2', 'o3']].to_numpy()
# label vectors: shape (num samples, 1)
label_np = df[['pm25']].to_numpy()
X = []
y = []
# window size: how many past timesteps the model sees --> 8 (hours)
for i in range(8, len(feature_np)):
    X.append(feature_np[i-8:i, :])
    y.append(label_np[i])
X, y = np.array(X, dtype=np.float64), np.array(y, dtype=np.float64)
X.shape, y.shape
((3517, 8, 3), (3517, 1))
# Chronological split: hold out the last 300 windows (hours) for testing
TEST_SIZE = 300
X_train = X[:-TEST_SIZE]
y_train = y[:-TEST_SIZE]
X_test = X[-TEST_SIZE:]
y_test = y[-TEST_SIZE:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((3217, 8, 3), (3217, 1), (300, 8, 3), (300, 1))
# Predict pm2.5 using a Gated Recurrent Unit (GRU) network
model = Sequential()
model.add(GRU(units=50,
              return_sequences=True,
              input_shape=X_train[0].shape,
              activation='tanh'))
model.add(GRU(units=50, activation='tanh'))
model.add(Dense(units=2))  # two outputs; their mean is used as the pm2.5 prediction below
# Compile the GRU
model.compile(optimizer=SGD(learning_rate=0.01, decay=1e-7,
                            momentum=0.9, nesterov=False),
              loss='mse')
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
gru (GRU) (None, 8, 50) 8250
gru_1 (GRU) (None, 50) 15300
dense (Dense) (None, 2) 102
=================================================================
Total params: 23,652
Trainable params: 23,652
Non-trainable params: 0
_________________________________________________________________
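The parameter counts can be verified by hand: a Keras GRU has three gates, each with input weights, recurrent weights, and (with the default reset_after=True) two bias vectors, for 3 * ((inputs + units) * units + 2 * units) parameters per layer.

units, features = 50, 3
gru_1 = 3 * ((features + units) * units + 2 * units)  # 8250
gru_2 = 3 * ((units + units) * units + 2 * units)     # 15300
dense = units * 2 + 2                                 # 102 (50 weights x 2 units + 2 biases)
print(gru_1 + gru_2 + dense)                          # 23652, matching the summary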
# Model training; Version 2 uses an EarlyStopping callback instead:
# early_stop = EarlyStopping(monitor='loss', mode='min', verbose=0, patience=10)
# model.fit(X_train, y_train, epochs=100, batch_size=150, verbose=1, callbacks=[early_stop])
model.fit(X_train, y_train, epochs=100, batch_size=150, verbose=1)
Epoch 1/100
22/22 [==============================] - 3s 15ms/step - loss: 61.7825
Epoch 2/100
22/22 [==============================] - 0s 14ms/step - loss: 47.2600
...
Epoch 99/100
22/22 [==============================] - 0s 14ms/step - loss: 0.5586
Epoch 100/100
22/22 [==============================] - 0s 14ms/step - loss: 0.5060
<keras.callbacks.History at 0x7f66744f5100>
# Results for Version 1
pred = model.predict(X_test)
pred = [p.mean() for p in pred]  # average the two output units into one pm2.5 value
plt.figure(figsize=(12, 6))
plt.plot(y_test, label='actual', color='blue')
plt.plot(pred, label='prediction', color='orange')
plt.title('Predicted PM2.5 concentrations using GRU, ver. 1')
plt.xlabel('time step (hour)')
plt.ylabel('PM2.5 concentration (ug/m^3)')
plt.grid()
plt.legend(loc='best')
plt.savefig('./figures/prediction_results_ver1.png')
plt.show()
[Figure: actual vs. predicted PM2.5 over the 300-hour test window (./figures/prediction_results_ver1.png)]
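The plot can be summarized with error metrics on the held-out window. A small addition, not in the original notebook, computing RMSE and MAE for the predictions above:

# Quantify the ver. 1 test error
pred_arr = np.array(pred)
actual = y_test.ravel()
rmse = np.sqrt(np.mean((actual - pred_arr) ** 2))
mae = np.mean(np.abs(actual - pred_arr))
print(f'RMSE: {rmse:.2f} ug/m^3, MAE: {mae:.2f} ug/m^3')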
Version 2
Regression with wind speed and relative humidity data
date_from = '2022-05-02T00:00:00Z'
date_to = '2022-05-10T00:00:00Z'
city = 'San Francisco-Oakland-Fremont'
location = 'Oakland'
data_query_limit = 150
date_from_utc = u.pst_to_utc(date_from)
date_to_utc = u.pst_to_utc(date_to)
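u.pst_to_utc and u.utc_to_pst are aqtools helpers: the NOAA SDK expects UTC bounds, while OpenAQ is queried in local time here. A minimal sketch of the conversion, assuming ISO-8601 strings and the America/Los_Angeles zone:

from datetime import datetime
from zoneinfo import ZoneInfo  # Python 3.9+

# Assumed behavior of aqtools' pst_to_utc: parse a local ISO timestamp
# and re-express it in UTC with a trailing 'Z'.
def pst_to_utc_sketch(s):
    dt = datetime.fromisoformat(s.replace('Z', ''))
    dt = dt.replace(tzinfo=ZoneInfo('America/Los_Angeles'))
    return dt.astimezone(ZoneInfo('UTC')).strftime('%Y-%m-%dT%H:%M:%SZ')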
# co
# From 2022-05-02 to 2022-05-10
pollutant = 'co'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_co = u.date_pollutant_value(r, pollutant)
df_co.head(3)
| | date | co |
| --- | --- | --- |
| 149 | 2022-05-02 23:00:00 | 0.42 |
| 148 | 2022-05-03 00:00:00 | 0.29 |
| 147 | 2022-05-03 01:00:00 | 0.29 |
# no2
pollutant = 'no2'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_no2 = u.date_pollutant_value(r, pollutant)
df_no2.head(3)
| | date | no2 |
| --- | --- | --- |
| 149 | 2022-05-02 23:00:00 | 0.013 |
| 148 | 2022-05-03 00:00:00 | 0.007 |
| 147 | 2022-05-03 01:00:00 | 0.006 |
# o3
pollutant = 'o3'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_o3 = u.date_pollutant_value(r, pollutant)
df_o3.head(3)
| | date | o3 |
| --- | --- | --- |
| 149 | 2022-05-02 23:00:00 | 0.016 |
| 148 | 2022-05-03 00:00:00 | 0.019 |
| 147 | 2022-05-03 01:00:00 | 0.019 |
# pm25
pollutant = 'pm25'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_pm25 = u.date_pollutant_value(r, pollutant)
df_pm25.head(3)
| | date | pm25 |
| --- | --- | --- |
| 149 | 2022-05-03 04:00:00 | 8 |
| 148 | 2022-05-03 05:00:00 | 8 |
| 147 | 2022-05-03 06:00:00 | 7 |
# Collect NOAA weather observations using the NOAA Python SDK by Paulo Kuong (2018)
n = noaa.NOAA()
res = n.get_observations('94603', 'US', start=date_from_utc, end=date_to_utc, num_of_stations=1)
dates = []
windspeed = []
relativehum = []
for i in res:
    dates.append(u.utc_to_pst(i['timestamp']))
    windspeed.append(i['windSpeed']['value'])
    relativehum.append(i['relativeHumidity']['value'])
df_w = pd.DataFrame()
df_w['date'] = dates
df_w['wind speed'] = windspeed
df_w['relative humidity'] = relativehum
df_w['date'] = pd.to_datetime(df_w['date'])
# Fill missing observations with the column medians
df_w[['wind speed', 'relative humidity']] = df_w[['wind speed', 'relative humidity']].fillna(df_w[['wind speed', 'relative humidity']].median())
df_w = df_w.sort_values(by="date")
# df_w.to_csv('./fillna weather.csv')
df_w['date'] = df_w['date'].apply(lambda x: str(x).split('.', 1)[0].replace('T', ' '))
df_w.head(n=3)
| | date | wind speed | relative humidity |
| --- | --- | --- | --- |
| 72 | 2022-05-04 18:00:00 | 25.92 | 71.830366 |
| 52 | 2022-05-04 19:00:00 | 20.52 | 80.266011 |
| 43 | 2022-05-04 20:00:00 | 29.52 | 86.313549 |
# Merge every dataframe on 'date'
df = df_co.merge(df_no2, how='inner', on='date')
df = df.merge(df_o3, how='inner', on='date')
df = df.merge(df_w, how='inner', on='date')
df = df.merge(df_pm25, how='inner', on='date')
df = df.set_index(['date'])
df
| date | co | no2 | o3 | wind speed | relative humidity | pm25 |
| --- | --- | --- | --- | --- | --- | --- |
| 2022-05-04 18:00:00 | 0.29 | 0.009 | 0.027 | 25.92 | 71.830366 | 11 |
| 2022-05-04 19:00:00 | 0.28 | 0.009 | 0.023 | 20.52 | 80.266011 | 12 |
| 2022-05-04 20:00:00 | 0.30 | 0.009 | 0.023 | 29.52 | 86.313549 | 13 |
| 2022-05-04 21:00:00 | 0.26 | 0.010 | 0.023 | 24.12 | 89.229634 | 9 |
| 2022-05-04 22:00:00 | 0.27 | 0.011 | 0.022 | 18.36 | 89.229634 | 10 |
| ... | ... | ... | ... | ... | ... | ... |
| 2022-05-09 12:00:00 | 0.22 | 0.002 | 0.041 | 25.92 | 47.248268 | 3 |
| 2022-05-09 13:00:00 | 0.22 | 0.002 | 0.040 | 18.36 | 43.875674 | 6 |
| 2022-05-09 14:00:00 | 0.21 | 0.002 | 0.040 | 27.72 | 43.875674 | 7 |
| 2022-05-09 15:00:00 | 0.21 | 0.003 | 0.039 | 25.92 | 42.044566 | 4 |
| 2022-05-09 16:00:00 | 0.21 | 0.003 | 0.039 | 18.36 | 45.606476 | 3 |

112 rows × 6 columns
# Robust scaling: (x - median) / IQR for each feature column
# (pm25, the prediction target, stays in raw units)
for col in ['co', 'no2', 'o3', 'wind speed', 'relative humidity']:
    scaler = RobustScaler()
    df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))
df.plot(subplots=True, title='Robust Scaler')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/scaling_ver2.png")
plt.show()
[Figure: robust-scaled feature series and raw pm25 (./figures/scaling_ver2.png)]
# Make the feature series stationary via first-order differencing
co_diff = u.differencing(df['co'].values)
no2_diff = u.differencing(df['no2'].values)
o3_diff = u.differencing(df['o3'].values)
ws_diff = u.differencing(df['wind speed'].values)
rh_diff = u.differencing(df['relative humidity'].values)
# Drop the last row so the frame matches the length of the differenced series
df = df.iloc[:-1, :]
df['co'] = co_diff
df['no2'] = no2_diff
df['o3'] = o3_diff
df['wind speed'] = ws_diff
df['relative humidity'] = rh_diff
df.plot(subplots=True, title='Differencing')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/differencing_ver2.png")
plt.show()
(pandas again emitted a SettingWithCopyWarning for each of the five column assignments; as before, the values are still written, and df = df.iloc[:-1, :].copy() would silence it.)
[Figure: differenced feature series and raw pm25 (./figures/differencing_ver2.png)]
# correlation among features
df_corr = df.corr()
df_corr.to_csv('./tables/correlation_ver2.csv')
df_corr
| | co | no2 | o3 | wind speed | relative humidity | pm25 |
| --- | --- | --- | --- | --- | --- | --- |
| co | 1.000000 | 0.614983 | -0.587944 | -0.587944 | -0.587944 | -0.132807 |
| no2 | 0.614983 | 1.000000 | -0.796106 | -0.796106 | -0.796106 | -0.168784 |
| o3 | -0.587944 | -0.796106 | 1.000000 | 1.000000 | 1.000000 | 0.120429 |
| wind speed | -0.587944 | -0.796106 | 1.000000 | 1.000000 | 1.000000 | 0.120429 |
| relative humidity | -0.587944 | -0.796106 | 1.000000 | 1.000000 | 1.000000 | 0.120429 |
| pm25 | -0.132807 | -0.168784 | 0.120429 | 0.120429 | 0.120429 | 1.000000 |
# feature vectors: shape (num samples, window size, num features)
feature_np = df[['co', 'no2', 'o3', 'wind speed', 'relative humidity']].to_numpy()
# label vectors: shape (num samples, 1)
label_np = df[['pm25']].to_numpy()
X = []
y = []
# window size: how many past timesteps the model sees --> 8 (hours)
for i in range(8, len(feature_np)):
    X.append(feature_np[i-8:i, :])
    y.append(label_np[i])
X, y = np.array(X, dtype=np.float64), np.array(y, dtype=np.float64)
X.shape, y.shape
((103, 8, 5), (103, 1))
# Chronological split: hold out the last 30 windows (hours) for testing
TEST_SIZE = 30
X_train = X[:-TEST_SIZE]
y_train = y[:-TEST_SIZE]
X_test = X[-TEST_SIZE:]
y_test = y[-TEST_SIZE:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((73, 8, 5), (73, 1), (30, 8, 5), (30, 1))
# Predict pm2.5 using a Gated Recurrent Unit (GRU) network
model = Sequential()
model.add(GRU(units=50,
              return_sequences=True,
              input_shape=X_train[0].shape,
              activation='tanh'))
model.add(GRU(units=50, activation='tanh'))
model.add(Dense(units=2))  # two outputs; their mean is used as the pm2.5 prediction below
# Compile the GRU
model.compile(optimizer=SGD(learning_rate=0.01, decay=1e-7,
                            momentum=0.9, nesterov=False),
              loss='mse')
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
gru_2 (GRU) (None, 8, 50) 8550
gru_3 (GRU) (None, 50) 15300
dense_1 (Dense) (None, 2) 102
=================================================================
Total params: 23,952
Trainable params: 23,952
Non-trainable params: 0
_________________________________________________________________
# Model training with early stopping on the training loss (patience of 10 epochs)
early_stop = EarlyStopping(monitor='loss', mode='min', verbose=0, patience=10)
model.fit(X_train, y_train, epochs=50, batch_size=100, verbose=1, callbacks=[early_stop])
Epoch 1/50
1/1 [==============================] - 3s 3s/step - loss: 39.2118
Epoch 2/50
1/1 [==============================] - 0s 16ms/step - loss: 35.9047
...
Epoch 33/50
1/1 [==============================] - 0s 13ms/step - loss: 5.9103
Epoch 34/50
1/1 [==============================] - 0s 13ms/step - loss: 5.9842
(training stopped early at epoch 34 of 50)
<keras.callbacks.History at 0x7f66806cc400>
# Results for Version 2
pred = model.predict(X_test)
pred = [p.mean() for p in pred]  # average the two output units into one pm2.5 value
plt.figure(figsize=(12, 6))
plt.plot(y_test, label='actual', color='blue')
plt.plot(pred, label='prediction', color='orange')
plt.title('Predicted PM2.5 concentrations using GRU, ver. 2')
plt.xlabel('time step (hour)')
plt.ylabel('PM2.5 concentration (ug/m^3)')
plt.grid()
plt.legend(loc='best')
plt.savefig('./figures/prediction_results_ver2.png')
plt.show()
[Figure: actual vs. predicted PM2.5 over the 30-hour test window (./figures/prediction_results_ver2.png)]