Predicting Concentrations of PM2.5

import openaq
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from aqtools import aqutils as u

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, GRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import SGD

from noaa_sdk import noaa

api = openaq.OpenAQ()

Version 1

# Parameter setting

date_from = '2021-09-01T00:00:00Z' # Default: PST
date_to = '2022-03-01T00:00:00Z'
city = 'San Francisco-Oakland-Fremont'
location = 'Oakland'
data_query_limit = 4000
"""
Important! You may get 'ApiError: A bad request was made: 500'.
It's not because of our code but  of API server issue.
Please take a moment, and retry executing the cell.
"""
# co
pollutant = 'co'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_co = u.date_pollutant_value(r, pollutant)
df_co.head(3)
                     date   co
3600  2021-08-31 17:00:00  0.3
3599  2021-08-31 18:00:00  0.3
3598  2021-08-31 19:00:00  0.3
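For reference, aqutils.date_pollutant_value is what turns the raw OpenAQ results into the two-column frame above. A sketch of what it presumably does, assuming the OpenAQ v1 measurement schema (each result carries a 'date' dict with 'utc'/'local' keys and a 'value'); the actual implementation lives in aqtools:

def date_pollutant_value(results, pollutant):
    df = pd.DataFrame(results)
    # keep the local timestamp, trimmed to 'YYYY-MM-DD HH:MM:SS'
    df['date'] = df['date'].apply(lambda d: d['local'][:19].replace('T', ' '))
    return df[['date', 'value']].rename(columns={'value': pollutant})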
# no2
pollutant = 'no2'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_no2 = u.date_pollutant_value(r, pollutant)

df_no2.head(3)
                     date    no2
3599  2021-08-31 17:00:00  0.002
3598  2021-08-31 18:00:00  0.003
3597  2021-08-31 19:00:00  0.004
# o3
pollutant = 'o3'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_o3 = u.date_pollutant_value(r, pollutant)

df_o3.head(3)
                     date     o3
3600  2021-08-31 17:00:00  0.031
3599  2021-08-31 18:00:00  0.031
3598  2021-08-31 19:00:00  0.030
# pm25
pollutant = 'pm25'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_pm25 = u.date_pollutant_value(r, pollutant)

df_pm25.head(3)
                     date  pm25
3751  2021-08-31 17:00:00    11
3750  2021-08-31 18:00:00    12
3749  2021-08-31 19:00:00    15
# Merge dataframes on 'date' (inner join keeps only timestamps present in all four series)
df = df_co.merge(df_no2, how='inner', on='date')
df = df.merge(df_o3, how='inner', on='date')
df = df.merge(df_pm25, how='inner', on='date')
df = df.set_index(['date'])
df
                       co    no2     o3  pm25
date
2021-08-31 17:00:00  0.30  0.002  0.031    11
2021-08-31 18:00:00  0.30  0.003  0.031    12
2021-08-31 19:00:00  0.30  0.004  0.030    15
2021-08-31 20:00:00  0.36  0.006  0.029    14
2021-08-31 21:00:00  0.37  0.006  0.028    12
...                   ...    ...    ...   ...
2022-02-28 12:00:00  0.47  0.025  0.023    15
2022-02-28 13:00:00  0.46  0.025  0.026    20
2022-02-28 14:00:00  0.38  0.016  0.038    13
2022-02-28 15:00:00  0.32  0.012  0.043     9
2022-02-28 16:00:00  0.30  0.011  0.040     6

3526 rows × 4 columns

# Robust scaling: (x - median) / IQR, which is less sensitive to outliers than min-max
# pm25 is the prediction target and is left on its original scale
for col in ['co', 'no2', 'o3']:
    scaler = RobustScaler()
    df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))

df.plot(subplots=True, title='Robust Scaler')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/scaling_ver1.png")
plt.show()
_images/MainAnalysis_9_0.png
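As a quick check of what RobustScaler computes, the transform can be reproduced by hand; a sketch assuming the default quantile range of (25, 75):

x = df_co['co'].values  # the unscaled series from the query above
med = np.median(x)
iqr = np.percentile(x, 75) - np.percentile(x, 25)
manual = (x - med) / iqr  # equals RobustScaler().fit_transform(x.reshape(-1, 1)).ravel()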
# Make the data stationary by first-order differencing
# (pm25, the prediction target, is left undifferenced)

co_diff = u.differencing(df['co'].values)
no2_diff = u.differencing(df['no2'].values)
o3_diff = u.differencing(df['o3'].values)
# Drop the last row so the frame's length matches the differenced series
df = df.iloc[:-1, :]

df['co'] = co_diff
df['no2'] = no2_diff
df['o3'] = o3_diff
df.plot(subplots=True, title='Differencing')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/differencing_ver1.png")
plt.show()
/tmp/ipykernel_7700/3874090531.py: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
(the same warning is raised for each of the df['co'], df['no2'], and df['o3'] assignments)
_images/MainAnalysis_10_1.png
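aqutils.differencing presumably returns the first difference x_t − x_{t−1}, which is consistent with the length bookkeeping above (the output is one row shorter than the input, hence the df.iloc[:-1] trim). A minimal sketch; the real helper lives in aqtools:

def differencing(x):
    x = np.asarray(x, dtype=np.float64)
    return x[1:] - x[:-1]  # length n-1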
# correlation among features
df_corr = df.corr()
df_corr.to_csv('./tables/correlation_ver1.csv')
df_corr
            co       no2        o3      pm25
co    1.000000  0.673827 -0.623135 -0.118771
no2   0.673827  1.000000 -0.770821 -0.113108
o3   -0.623135 -0.770821  1.000000  0.080400
pm25 -0.118771 -0.113108  0.080400  1.000000
# feature vectors: shape (num of data, window size, num of features)
feature_np = df[['co', 'no2', 'o3']].to_numpy()

# label vectors: shape (num of data, 1)
label_np = df[['pm25']].to_numpy()
X = []
y = []

# look back 8 timesteps (hours) when building each feature window
for i in range(8, len(feature_np)):
    X.append(feature_np[i-8:i, :])
    y.append(label_np[i])

X, y = np.array(X, dtype=np.float64), np.array(y, dtype=np.float64)
X.shape, y.shape
((3517, 8, 3), (3517, 1))
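The same windows can also be built without the Python loop, using NumPy's sliding-window view (an equivalent sketch; requires NumPy >= 1.20):

from numpy.lib.stride_tricks import sliding_window_view

X_alt = sliding_window_view(feature_np, 8, axis=0)[:-1].transpose(0, 2, 1)
y_alt = label_np[8:]
assert X_alt.shape == X.shape and np.allclose(X_alt, X)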
TEST_SIZE = 300

X_train = X[:-TEST_SIZE]
y_train = y[:-TEST_SIZE]
X_test = X[-TEST_SIZE:]
y_test = y[-TEST_SIZE:]


X_train.shape, y_train.shape, X_test.shape, y_test.shape
((3217, 8, 3), (3217, 1), (300, 8, 3), (300, 1))
# Predict pm2.5 using Gated Recurrent Unit

model = Sequential()
model.add(GRU(units=50,
              return_sequences=True,
              input_shape=X_train[0].shape,
              activation='tanh'))
model.add(GRU(units=50, activation='tanh'))
model.add(Dense(units=2))  # 2 output units, averaged at prediction time; units=1 is the more usual choice for a single target

# Compiling the GRU
model.compile(optimizer=SGD(learning_rate=0.01, decay=1e-7,
                            momentum=0.9, nesterov=False),
              loss='mse')

model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 gru (GRU)                   (None, 8, 50)             8250      
                                                                 
 gru_1 (GRU)                 (None, 50)                15300     
                                                                 
 dense (Dense)               (None, 2)                 102       
                                                                 
=================================================================
Total params: 23,652
Trainable params: 23,652
Non-trainable params: 0
_________________________________________________________________
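The parameter counts follow Keras's GRU formula with the default reset_after=True, which keeps two bias vectors per gate: params = 3 * units * (input_dim + units + 2). A quick sanity check against the summary:

gru_params = lambda units, input_dim: 3 * units * (input_dim + units + 2)
assert gru_params(50, 3) == 8250    # gru: 3 input features
assert gru_params(50, 50) == 15300  # gru_1: fed by the 50-unit gru layer
assert 50 * 2 + 2 == 102            # dense: 50 weights per unit x 2 units + 2 biases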
# model training
#early_stop = EarlyStopping(monitor='loss', mode='min', verbose=0, patience=10)
#model.fit(X_train, y_train, epochs=100, batch_size=150, verbose=1, callbacks=[early_stop])

model.fit(X_train, y_train, epochs=100, batch_size=150, verbose=1)
Epoch 1/100
22/22 [==============================] - 3s 15ms/step - loss: 61.7825
Epoch 2/100
22/22 [==============================] - 0s 14ms/step - loss: 47.2600
Epoch 3/100
22/22 [==============================] - 0s 15ms/step - loss: 46.5220
...
Epoch 99/100
22/22 [==============================] - 0s 14ms/step - loss: 0.5586
Epoch 100/100
22/22 [==============================] - 0s 14ms/step - loss: 0.5060
<keras.callbacks.History at 0x7f66744f5100>
# Results for version 1
pred = model.predict(X_test)
pred = [p.mean() for p in pred]  # collapse the model's 2 output units into a single value per timestep
plt.figure(figsize=(12, 6))
plt.plot(y_test, label='actual', color='blue')
plt.plot(pred, label='prediction', color='orange')
plt.title('PM2.5 prediction using GRU, version 1')
plt.xlabel('time step (hour)')
plt.ylabel('PM2.5 concentration (ug/m^3)')
plt.grid()
plt.legend(loc='best')
plt.savefig('./figures/prediction_results_ver1.png')
plt.show()
_images/MainAnalysis_16_0.png
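To put a number on the fit shown above, standard regression metrics can be computed from the same arrays. A sketch (no metrics were reported in the original run; squared=False needs scikit-learn >= 0.22):

from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, pred)
rmse = mean_squared_error(y_test, pred, squared=False)
print(f'MAE = {mae:.2f} ug/m^3, RMSE = {rmse:.2f} ug/m^3')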

Version 2

Regression with wind speed and relative humidity data

date_from = '2022-05-02T00:00:00Z'
date_to = '2022-05-10T00:00:00Z'
city = 'San Francisco-Oakland-Fremont'
location = 'Oakland'
data_query_limit = 150

date_from_utc = u.pst_to_utc(date_from)
date_to_utc = u.pst_to_utc(date_to)
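For reference, aqutils.pst_to_utc presumably shifts a PST timestamp forward to UTC. A minimal sketch assuming a fixed UTC-8 offset and ignoring daylight saving time; the real helper is in aqtools:

from datetime import datetime, timedelta

def pst_to_utc(ts):
    dt = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ')
    return (dt + timedelta(hours=8)).strftime('%Y-%m-%dT%H:%M:%SZ')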
# co
# From 2022-05-02 To 2022-05-10

pollutant = 'co'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_co = u.date_pollutant_value(r, pollutant)
df_co.head(3)
                    date    co
149  2022-05-02 23:00:00  0.42
148  2022-05-03 00:00:00  0.29
147  2022-05-03 01:00:00  0.29
# no2
pollutant = 'no2'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_no2 = u.date_pollutant_value(r, pollutant)

df_no2.head(3)
                    date    no2
149  2022-05-02 23:00:00  0.013
148  2022-05-03 00:00:00  0.007
147  2022-05-03 01:00:00  0.006
# o3
pollutant = 'o3'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_o3 = u.date_pollutant_value(r, pollutant)

df_o3.head(3)
                    date     o3
149  2022-05-02 23:00:00  0.016
148  2022-05-03 00:00:00  0.019
147  2022-05-03 01:00:00  0.019
# pm25
pollutant = 'pm25'
status, resp = api.measurements(city=city,
                                location=location, parameter=pollutant,
                                date_from=date_from,
                                date_to=date_to,
                                limit=data_query_limit)
r = resp['results']
df_pm25 = u.date_pollutant_value(r, pollutant)

df_pm25.head(3)
                    date  pm25
149  2022-05-03 04:00:00     8
148  2022-05-03 05:00:00     8
147  2022-05-03 06:00:00     7
# Collect NOAA hourly weather observations using the NOAA Python SDK by Paulo Kuong (2018)

n = noaa.NOAA()
res = n.get_observations('94603', 'US', start=date_from_utc, end=date_to_utc, num_of_stations=1)
dates = []
windspeed = []
relativehum = []

for i in res:
    dates.append(u.utc_to_pst(i['timestamp']))
    windspeed.append(i['windSpeed']['value'])
    relativehum.append(i['relativeHumidity']['value'])
df_w = pd.DataFrame()
df_w['date'] = dates
df_w['wind speed'] = windspeed
df_w['relative humidity'] = relativehum
df_w['date'] = pd.to_datetime(df_w['date'])

# fill missing values with each column's median
weather_cols = ['wind speed', 'relative humidity']
df_w[weather_cols] = df_w[weather_cols].fillna(df_w[weather_cols].median())
df_w = df_w.sort_values(by="date")
# df.to_csv('./fillna weather.csv')


# normalize timestamps to 'YYYY-MM-DD HH:MM:SS' strings so they match the OpenAQ frames
df_w['date'] = df_w['date'].apply(lambda x: str(x).split('.', 1)[0].replace('T', ' '))

df_w.head(n=3)
                   date  wind speed  relative humidity
72  2022-05-04 18:00:00       25.92          71.830366
52  2022-05-04 19:00:00       20.52          80.266011
43  2022-05-04 20:00:00       29.52          86.313549
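The inverse helper, aqutils.utc_to_pst, is used in the collection loop above to bring NOAA's ISO timestamps back to local time. A minimal sketch under the same fixed-offset assumption (NOAA timestamps look like '2022-05-04T18:00:00+00:00'):

def utc_to_pst(ts):
    dt = datetime.strptime(ts[:19], '%Y-%m-%dT%H:%M:%S')
    return (dt - timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')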
# Merge all dataframes on 'date' (inner join, as in version 1)
df = df_co.merge(df_no2, how='inner', on='date')
df = df.merge(df_o3, how='inner', on='date')
df = df.merge(df_w,how='inner', on='date')
df = df.merge(df_pm25, how='inner', on='date')
df = df.set_index(['date'])
df
                       co    no2     o3  wind speed  relative humidity  pm25
date
2022-05-04 18:00:00  0.29  0.009  0.027       25.92          71.830366    11
2022-05-04 19:00:00  0.28  0.009  0.023       20.52          80.266011    12
2022-05-04 20:00:00  0.30  0.009  0.023       29.52          86.313549    13
2022-05-04 21:00:00  0.26  0.010  0.023       24.12          89.229634     9
2022-05-04 22:00:00  0.27  0.011  0.022       18.36          89.229634    10
...                   ...    ...    ...         ...                ...   ...
2022-05-09 12:00:00  0.22  0.002  0.041       25.92          47.248268     3
2022-05-09 13:00:00  0.22  0.002  0.040       18.36          43.875674     6
2022-05-09 14:00:00  0.21  0.002  0.040       27.72          43.875674     7
2022-05-09 15:00:00  0.21  0.003  0.039       25.92          42.044566     4
2022-05-09 16:00:00  0.21  0.003  0.039       18.36          45.606476     3

112 rows × 6 columns

# Robust scaling for each feature (pm25, the target, is left unscaled)
for col in ['co', 'no2', 'o3', 'wind speed', 'relative humidity']:
    scaler = RobustScaler()
    df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))

plot = df.plot(subplots=True, title='Robust Scaler')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/scaling_ver2.png")
plt.show()
_images/MainAnalysis_25_0.png
# Make the data stationary by first-order differencing
# (pm25, the prediction target, is left undifferenced)

co_diff = u.differencing(df['co'].values)
no2_diff = u.differencing(df['no2'].values)
o3_diff = u.differencing(df['o3'].values)
ws_diff = u.differencing(df['wind speed'].values)
rh_diff = u.differencing(df['relative humidity'].values)
# Drop the last row so the frame's length matches the differenced series
df = df.iloc[:-1, :]

df['co'] = co_diff
df['no2'] = no2_diff
df['o3'] = o3_diff
df['wind speed'] = ws_diff
df['relative humidity'] = rh_diff
plot = df.plot(subplots=True, title='Differencing')
fig = plt.gcf()
fig.autofmt_xdate()
fig.savefig("./figures/differencing_ver2.png")
plt.show()
/tmp/ipykernel_7700/1728641024.py: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
(the same warning is raised for each of the five differenced-column assignments)
_images/MainAnalysis_26_1.png
# correlation among features
df_corr = df.corr()
df_corr.to_csv('./tables/correlation_ver2.csv')
df_corr
                         co       no2        o3  wind speed  relative humidity      pm25
co                 1.000000  0.614983 -0.587944   -0.587944          -0.587944 -0.132807
no2                0.614983  1.000000 -0.796106   -0.796106          -0.796106 -0.168784
o3                -0.587944 -0.796106  1.000000    1.000000           1.000000  0.120429
wind speed        -0.587944 -0.796106  1.000000    1.000000           1.000000  0.120429
relative humidity -0.587944 -0.796106  1.000000    1.000000           1.000000  0.120429
pm25              -0.132807 -0.168784  0.120429    0.120429           0.120429  1.000000
# feature vectors: shape (num of data, window size, num of features)
feature_np = df[['co', 'no2', 'o3', 'wind speed', 'relative humidity']].to_numpy()

# label vectors: shape (num of data, 1)
label_np = df[['pm25']].to_numpy()
X = []
y = []

# look back 8 timesteps (hours) when building each feature window
for i in range(8, len(feature_np)):
    X.append(feature_np[i-8:i, :])
    y.append(label_np[i])

X, y = np.array(X, dtype=np.float64), np.array(y, dtype=np.float64)
X.shape, y.shape
((103, 8, 5), (103, 1))
TEST_SIZE = 30

X_train = X[:-TEST_SIZE]
y_train = y[:-TEST_SIZE]
X_test = X[-TEST_SIZE:]
y_test = y[-TEST_SIZE:]


X_train.shape, y_train.shape, X_test.shape, y_test.shape
((73, 8, 5), (73, 1), (30, 8, 5), (30, 1))
# Predict pm2.5 using Gated Recurrent Unit

model = Sequential()
model.add(GRU(units=50,
              return_sequences=True,
              input_shape=X_train[0].shape,
              activation='tanh'))
model.add(GRU(units=50, activation='tanh'))
model.add(Dense(units=2))  # 2 output units, averaged at prediction time, as in version 1

# Compiling the GRU
model.compile(optimizer=SGD(learning_rate=0.01, decay=1e-7,
                            momentum=0.9, nesterov=False),
              loss='mse')

model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 gru_2 (GRU)                 (None, 8, 50)             8550      
                                                                 
 gru_3 (GRU)                 (None, 50)                15300     
                                                                 
 dense_1 (Dense)             (None, 2)                 102       
                                                                 
=================================================================
Total params: 23,952
Trainable params: 23,952
Non-trainable params: 0
_________________________________________________________________
# model training
early_stop = EarlyStopping(monitor='loss', mode='min', verbose=0, patience=10)
model.fit(X_train, y_train, epochs=50, batch_size=100, verbose=1, callbacks=[early_stop])
Epoch 1/50
1/1 [==============================] - 3s 3s/step - loss: 39.2118
Epoch 2/50
1/1 [==============================] - 0s 16ms/step - loss: 35.9047
Epoch 3/50
1/1 [==============================] - 0s 15ms/step - loss: 30.0764
...
Epoch 33/50
1/1 [==============================] - 0s 13ms/step - loss: 5.9103
Epoch 34/50
1/1 [==============================] - 0s 13ms/step - loss: 5.9842
<keras.callbacks.History at 0x7f66806cc400>
# Results for version 2
pred = model.predict(X_test)
pred = [p.mean() for p in pred]  # collapse the model's 2 output units into a single value per timestep
plt.figure(figsize=(12, 6))
plt.plot(y_test, label='actual', color='blue')
plt.plot(pred, label='prediction', color='orange')
plt.title('PM2.5 prediction using GRU, version 2')
plt.xlabel('time step (hour)')
plt.ylabel('PM2.5 concentration (ug/m^3)')
plt.grid()
plt.legend(loc='best')
plt.savefig('./figures/prediction_results_ver2.png')
plt.show()
_images/MainAnalysis_32_0.png