Hide the code
%cd /wd/4_2_Stationarity/wd/4_2_Stationarity
Machine Learning
%cd /wd/4_2_Stationarity/wd/4_2_Stationarity
import subprocess
def pip_install(package):
    """Install *package* with pip and print whether it succeeded.

    Runs pip as ``sys.executable -m pip`` so the package is installed into
    the exact environment running this notebook: a bare ``pip`` found on
    PATH may belong to a different interpreter.

    Parameters
    ----------
    package : str
        Name of the package to install (as passed to ``pip install``).
    """
    import sys  # local import: this cell runs before the `import sys` cell below
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", package],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"Error installing {package}: {result.stderr}")
    else:
        print(f"Successfully installed {package}")
pip_install("fpppy")
pip_install("tsfeatures")
pip_install("pyreadr")Successfully installed fpppy
Successfully installed tsfeatures
Successfully installed pyreadr
# 1. System and Environment Setup
import os
import sys
import warnings
import random
import datetime
from tqdm import tqdm
# Set environment variables and ignore specific warnings
# NIXTLA_ID_AS_COL makes nixtlaverse libraries return the series identifier
# as a regular column instead of as the index.
os.environ["NIXTLA_ID_AS_COL"] = "true"
# Silence the warning emitted when plt.show() runs on the non-interactive
# Agg backend (e.g. headless notebook execution).
warnings.filterwarnings(
"ignore",
category=UserWarning,
message=".*FigureCanvasAgg is non-interactive.*"
)
# 2. Core Data Science Stack
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.seasonal import seasonal_decompose
from statsforecast.arima import ARIMASummary, ndiffs, nsdiffs
# 3. Time Series & Machine Learning
import pmdarima as pmd
import yfinance as yf
import rdatasets
import pyreadr
from sktime.datasets import load_airline
from sktime.utils.plotting import plot_series as plot_series_sktime
# 4. Custom Utilities
from fc_4_2_utils.utils import *
from fpppy.utils import plot_series as plot_series_fpp
from utilsforecast.plotting import plot_series as plot_series_utils
# 5. Global Reproducibility & Display Settings
np.random.seed(1)
random.seed(1)
np.set_printoptions(suppress=True)
pd.set_option("max_colwidth", 100)
pd.set_option("display.precision", 3)
# 6. Plotting Configuration
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from cycler import cycler
# Jupyter Magic
%matplotlib inline
%config InlineBackend.figure_format = 'png' # High-res plots
# Style and Aesthetics
plt.style.use("ggplot")
sns.set_style("whitegrid")
plt.rcParams.update({
"figure.figsize": (8, 5),
"figure.dpi": 100,
"savefig.dpi": 300,
"figure.constrained_layout.use": True,
"axes.titlesize": 12,
"axes.labelsize": 10,
"xtick.labelsize": 9,
"ytick.labelsize": 9,
"legend.fontsize": 9,
"legend.title_fontsize": 10,
})
# Custom color cycle (Example: monochromatic black as per your original)
# Custom color cycle (monochromatic black, as in the original configuration).
mpl.rcParams['axes.prop_cycle'] = cycler(color=["#000000", "#000000"])

# Simulate k realizations of the sinusoidal stochastic process
#     X_t = 3 sin(2*pi*t/500 + A),   A ~ Uniform(0, 2*pi),
# over n time steps. Each column of X is one realization (one time series).
n = 1500
k = 10
X = np.zeros((n, k))
rng = np.random.default_rng(2024)  # fixed seed for reproducibility
Ts = np.arange(n)
for i in range(k):
    # One random phase draw per realization; given A, the series is deterministic.
    A = rng.uniform(low=0, high=2 * np.pi, size=1)
    X[:, i] = 3 * np.sin(2 * np.pi * Ts / 500 + A)

# Store the resulting values in a pandas DataFrame (columns X0..X9)
# to make processing easier.
Xdf = pd.DataFrame(X, columns=["X" + str(i) for i in range(k)])
Xdf.head()| X0 | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -2.680 | 2.925 | 2.793 | -2.856 | -0.079 | 2.338 | 1.424 | 2.721 | 2.316 | 2.625 |
| 1 | -2.697 | 2.933 | 2.779 | -2.845 | -0.041 | 2.362 | 1.457 | 2.737 | 2.292 | 2.643 |
| 2 | -2.713 | 2.941 | 2.765 | -2.832 | -0.004 | 2.385 | 1.490 | 2.752 | 2.267 | 2.661 |
| 3 | -2.729 | 2.948 | 2.750 | -2.820 | 0.034 | 2.407 | 1.523 | 2.767 | 2.242 | 2.678 |
| 4 | -2.744 | 2.955 | 2.735 | -2.807 | 0.072 | 2.430 | 1.555 | 2.781 | 2.217 | 2.695 |
And let us plot the different realizations as time series:
# Plot all k realizations, mark t_a = 325 with a dashed vertical line, and
# overlay the k sampled values X_{t_a} (one per realization) as dots.
fig, ax = plt.subplots()
Xdf.plot(ax=ax, zorder=-10, colormap="tab10")
t_a = 325
plt.axvline(x=t_a, ls="--")
plt.scatter(x=[t_a] * k, y=X[t_a, :], zorder=1)
if k > 5:
    # Too many series for a readable legend.
    ax.get_legend().remove()
# BUG FIX: the original passed which='Both'; matplotlib only recognizes
# 'major'/'minor'/'both' (lowercase), so the call silently did nothing.
ax.grid(visible=True, which='both', axis='x')
plt.show()
plt.close()
# We have added a dashed line at t_a = 325 to help you visualize the random
# variable X_325, which is part of this stochastic process. The dots
# illustrate that we have generated a k-sized sample of X_{t_a}. The sample
# corresponds to a row of the pandas dataset (while the different time
# series correspond to columns). We could use this sample to estimate the
# mean, variance and other statistical properties of X_{t_a}. The key point
# is that in this example we have access to different realizations of the
# stochastic process, different time series of the process.
# Gaussian white-noise process: row t holds k i.i.d. N(0, 0.25) draws.
# Each row uses a generator seeded with its own time index, so the whole
# table is reproducible row by row.
W = np.vstack(
    [np.random.default_rng(t).normal(loc=0, scale=0.25, size=k) for t in range(n)]
)
Wdf = pd.DataFrame(W, columns=[f"W{j}" for j in range(k)])
Wdf.head() | W0 | W1 | W2 | W3 | W4 | W5 | W6 | W7 | W8 | W9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.031 | -0.033 | 0.160 | 0.026 | -0.134 | 0.090 | 0.326 | 0.237 | -0.176 | -0.316 |
| 1 | 0.086 | 0.205 | 0.083 | -0.326 | 0.226 | 0.112 | -0.134 | 0.145 | 0.091 | 0.074 |
| 2 | 0.047 | -0.131 | -0.103 | -0.610 | 0.450 | 0.286 | -0.081 | 0.193 | 0.070 | -0.138 |
| 3 | 0.510 | -0.639 | 0.105 | -0.142 | -0.113 | -0.054 | -0.505 | -0.058 | -0.216 | 0.831 |
| 4 | -0.163 | -0.044 | 0.416 | 0.165 | -0.410 | -0.001 | -0.156 | 0.037 | -0.402 | 0.060 |
Let us plot the \(k\) time series that we have obtained as realizations of this stochastic process. Now that looks noisy!
# Plot the k white-noise realizations.
Wdf.plot(colormap="tab10")
plt.show()
plt.close()

# Signal-plus-noise process: Y_t = X_t + W_t (additive model).
Y = X + W
Ydf = pd.DataFrame(Y, columns=["Y" + str(i) for i in range(k)])
Ydf.plot(colormap="tab10")
plt.show()
plt.close()
# You can see that the time series in this last plot are beginning to look
# much more alike the ones we saw in the previous session. Each time series
# (= realization of the process = column of the dataset) displays a signal
# (the X part) and a noise (the W part). Most of our time series models in
# the next sessions will be similar: they include a signal part and a noise
# part that we sum to get the complete model structure (in additive models;
# we multiply them in multiplicative models).
# Load the monthly Canadian gas production series and index it by month.
can_gas = pd.read_csv('../4_1_Introduction_to_Forecasting/tsdata/fpp3/Canadian_gas.csv')
# Source file encodes months as e.g. "1960 Jan" -> parse with "%Y %b".
can_gas["Month"] = pd.to_datetime(can_gas["Month"], format="%Y %b")
can_gas.set_index('Month', drop=True, inplace=True)
can_gas.head()| Volume | |
|---|---|
| Month | |
| 1960-01-01 | 1.431 |
| 1960-02-01 | 1.306 |
| 1960-03-01 | 1.402 |
| 1960-04-01 | 1.170 |
| 1960-05-01 | 1.116 |
Let us also create the nixtlaverse version of this.
fpppy_path = '/wd/data/fpppy/'
can_gas_nx = pd.read_csv(fpppy_path + '/canadian_gas.csv', parse_dates=['ds'])
can_gas_nx.head()| unique_id | ds | y | |
|---|---|---|---|
| 0 | Canadian gas production | 1960-01-01 | 1.431 |
| 1 | Canadian gas production | 1960-02-01 | 1.306 |
| 2 | Canadian gas production | 1960-03-01 | 1.402 |
| 3 | Canadian gas production | 1960-04-01 | 1.170 |
| 4 | Canadian gas production | 1960-05-01 | 1.116 |
Now we create the lags, for \(h= 0\) (a copy of the original time series) to \(h = 15\). We use a dictionary for this. Note that the range in the for loop uses 16 to get 15 lags, and the way we add names to the columns. We will examine the result and analyze the consequences below.
# Wide frame of lagged copies of Volume: lag0 is the original series and
# lag1..lag15 shift it by 1..15 months (range(16) -> 16 columns in total).
can_gas_lags = pd.DataFrame({"lag" + str(lag): can_gas["Volume"].shift(lag) for lag in range(16)})
can_gas_lags| lag0 | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | lag8 | lag9 | lag10 | lag11 | lag12 | lag13 | lag14 | lag15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Month | ||||||||||||||||
| 1960-01-01 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1960-02-01 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1960-03-01 | 1.402 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1960-04-01 | 1.170 | 1.402 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1960-05-01 | 1.116 | 1.170 | 1.402 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2004-10-01 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 | 16.905 | 17.588 | 17.384 |
| 2004-11-01 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 | 16.905 | 17.588 |
| 2004-12-01 | 19.453 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 | 16.905 |
| 2005-01-01 | 19.528 | 19.453 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 |
| 2005-02-01 | 16.944 | 19.528 | 19.453 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 |
542 rows × 16 columns
And the nixtlaverse version would be:
# Nixtlaverse (long-format) version: keep unique_id/ds/y and join lag1..lag15
# (lag0 would just duplicate y, so the range starts at 1 here).
# NOTE(review): a plain .shift() ignores unique_id grouping — safe only
# because this frame contains a single series; verify before reusing on
# multi-series data.
can_gas_nx_lags = can_gas_nx.join(pd.DataFrame({"lag" + str(lag): can_gas_nx["y"].shift(lag) for lag in range(1, 16)}))
can_gas_nx_lags | unique_id | ds | y | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | lag8 | lag9 | lag10 | lag11 | lag12 | lag13 | lag14 | lag15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Canadian gas production | 1960-01-01 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | Canadian gas production | 1960-02-01 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Canadian gas production | 1960-03-01 | 1.402 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | Canadian gas production | 1960-04-01 | 1.170 | 1.402 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | Canadian gas production | 1960-05-01 | 1.116 | 1.170 | 1.402 | 1.306 | 1.431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 537 | Canadian gas production | 2004-10-01 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 | 16.905 | 17.588 | 17.384 |
| 538 | Canadian gas production | 2004-11-01 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 | 16.905 | 17.588 |
| 539 | Canadian gas production | 2004-12-01 | 19.453 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 | 16.905 |
| 540 | Canadian gas production | 2005-01-01 | 19.528 | 19.453 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 | 17.792 |
| 541 | Canadian gas production | 2005-02-01 | 16.944 | 19.528 | 19.453 | 17.832 | 17.827 | 16.907 | 17.641 | 17.878 | 17.166 | 18.093 | 17.848 | 18.333 | 17.782 | 19.244 | 19.144 | 17.720 |
542 rows × 18 columns
can_gas_ACF = sm.tsa.stattools.acf(can_gas["Volume"])
can_gas_ACFarray([1. , 0.98630929, 0.97389895, 0.95920413, 0.94504748,
0.93438939, 0.92468341, 0.9240095 , 0.92392526, 0.92766874,
0.93077195, 0.93117848, 0.93330983, 0.91975445, 0.90729176,
0.8922351 , 0.87805474, 0.866886 , 0.85682908, 0.85573537,
0.85563592, 0.85900748, 0.86164275, 0.86181158, 0.86357829,
0.85043021, 0.83791706, 0.82291474])
The library also includes a function for a nice graphical representation of the ACF values, a so called ACF plot, that appears in the left panel below. In the right panel we show the plot of the original time series and the 4-lag time series:
# Left panel: ACF of the Canadian gas series. Right panel: the first 100
# observations of the original series overlaid with its 4-lag shifted copy.
which_lag = 4
fig, axs = plt.subplots(1, 2, figsize=(16, 4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(can_gas["Volume"], ax=ax0, lags=25, zero=False, title='ACF for the Canadian Gas Example')
ax0.set(ylim=(-1, 1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = can_gas["Volume"].head(100).plot(color="blue", label="Original")
ax1 = can_gas_lags[f"lag{which_lag}"].head(100).plot(color="red", label=f"lag{which_lag}")
ax1.set_title('Canadian Monthly Gas Production')
ax1.grid(visible=True, which='both', axis='x')
# BUG FIX: which='Major' is not a value matplotlib recognizes (valid:
# 'major'/'minor'/'both'), so the original call was a silent no-op.
ax1.grid(visible=False, which='major', axis='y')
ax1.legend()
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/2468042639.py:14: UserWarning: The figure layout has changed to tight
plt.tight_layout()
# Rebuild the lag frame with 25 lags (lag0..lag24), one per scatter panel.
can_gas_lags = pd.DataFrame({"lag" + str(lag): can_gas["Volume"].shift(lag) for lag in range(25)})
# 5x5 grid: panel i plots the series (lag0, y-axis) against its i-th lag
# (x-axis) with a linear fit; the dashed diagonal is the identity line, so
# points hugging it indicate strong autocorrelation at that lag.
fig, axs = plt.subplots(5, 5, figsize=(12,20), sharex=True, sharey=True)
for (i, ax) in enumerate(axs.ravel()):
sns.regplot(x = can_gas_lags.iloc[:, i], y = can_gas_lags.iloc[:, 0], order=1, ax = ax,
line_kws=dict(linewidth=1, color="yellow"), scatter_kws=dict(s=1))
lim = ax.get_xlim()
ax.plot(lim, lim, 'k--', alpha=.25, zorder=-10)
ax.set(xlim=lim, ylim=lim, title=f'lag {i}', aspect='equal')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/1828456769.py:9: UserWarning: The figure layout has changed to tight
plt.tight_layout()
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(Wdf["W0"], ax=ax0, lags=25, zero=False, title='ACF for Gausian White Noise')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = Wdf["W0"].plot(color="blue", label="Original")
ax1.set_title('White Noise')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/2990417472.py:12: UserWarning: The figure layout has changed to tight
plt.tight_layout()
EQ5 = pd.read_csv('./EQ5.csv')
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(EQ5["seismograph"], ax=ax0, lags=100, zero=False, title='ACF for the Earthquake Example')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = EQ5["seismograph"].plot(color="blue", label="Original")
ax1.set_title('Original Time Series')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/3315002690.py:12: UserWarning: The figure layout has changed to tight
plt.tight_layout()
# Random walk: the cumulative sum of Gaussian white-noise increments
# (column W1 of the white-noise frame built earlier).
# NOTE(review): the original also created rng = np.random.default_rng(2024)
# here, but it was never used before being re-created further down, so the
# unused assignment is dropped.
RW = Wdf["W1"].cumsum()
RWdf = pd.DataFrame({'RW': RW})
# And now let's plot the time series and its ACF. Note again this pattern
# of slow decay in the ACF values.
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(RWdf["RW"], ax=ax0, lags=25, zero=False, title='ACF for Random Walk')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = RWdf["RW"].plot(color="blue")
ax1.set_title('Random Walk')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/1939923379.py:12: UserWarning: The figure layout has changed to tight
plt.tight_layout()
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(Xdf["X2"], ax=ax0, lags= 1000, zero=False, title='ACF for Sinusoidal with Random Uniform Phase')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = Xdf["X2"].plot(color="blue")
ax1.set_title('Original Time Series')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/4208411932.py:12: UserWarning: The figure layout has changed to tight
plt.tight_layout()
Let us consider a second example of a sinusoidal stochastic process \(Z_t\) given by \[Z_t = 3 \sin(t/50 + B)\] but this time let us assume that the phase \(B\) is a random normal variable with mean 0 and standard deviation 0.5. We repeat the same steps, first plotting a large number of realizations of this process (many time series that only differ on the roll of the dice).
# Second sinusoidal process: Z_t = 3 sin(t/50 + B), with the phase B drawn
# from Normal(0, 0.5). Each of the k columns of Z is one realization.
n = 1500
k = 1000
Ts = np.arange(n)
rng = np.random.default_rng(2024)
Z = np.zeros((n, k))
for col in range(k):
    # One phase draw per realization, consumed from the shared RNG stream.
    phase = rng.normal(loc=0, scale=0.5, size=1)
    Z[:, col] = 3 * np.sin(Ts / 50 + phase)
Zdf = pd.DataFrame(Z, columns=[f"Z{col}" for col in range(k)])
Zdf.head()| Z0 | Z1 | Z2 | Z3 | Z4 | Z5 | Z6 | Z7 | Z8 | Z9 | ... | Z990 | Z991 | Z992 | Z993 | Z994 | Z995 | Z996 | Z997 | Z998 | Z999 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.476 | 2.195 | 1.627 | -1.403 | -1.924 | 0.101 | 1.252 | 0.756 | 2.360 | 1.100 | ... | -2.424 | 1.482 | 0.224 | 0.769 | -1.446 | -1.551 | -0.755 | 0.762 | -0.305 | -0.602 |
| 1 | 1.528 | 2.236 | 1.677 | -1.350 | -1.878 | 0.161 | 1.307 | 0.813 | 2.396 | 1.156 | ... | -2.388 | 1.534 | 0.284 | 0.826 | -1.393 | -1.499 | -0.696 | 0.820 | -0.245 | -0.543 |
| 2 | 1.579 | 2.275 | 1.727 | -1.296 | -1.831 | 0.221 | 1.360 | 0.871 | 2.432 | 1.211 | ... | -2.351 | 1.585 | 0.343 | 0.884 | -1.340 | -1.447 | -0.638 | 0.878 | -0.186 | -0.484 |
| 3 | 1.630 | 2.314 | 1.776 | -1.241 | -1.783 | 0.280 | 1.414 | 0.928 | 2.466 | 1.265 | ... | -2.313 | 1.636 | 0.403 | 0.941 | -1.286 | -1.394 | -0.579 | 0.935 | -0.126 | -0.425 |
| 4 | 1.680 | 2.352 | 1.824 | -1.186 | -1.734 | 0.340 | 1.466 | 0.985 | 2.500 | 1.320 | ... | -2.275 | 1.686 | 0.462 | 0.998 | -1.232 | -1.340 | -0.520 | 0.992 | -0.066 | -0.366 |
5 rows × 1000 columns
And let us again plot the different realizations of \(Z_t\) as time series:
fig, ax = plt.subplots()
Zdf.plot(ax=ax, zorder=-10, colormap="tab20")
t_a = 325
plt.axvline(x = t_a, ls="--")
plt.scatter(x=[t_a]*k, y=Z[t_a, :], zorder = 1)
if (k > 10):
ax.get_legend().remove()
plt.show();plt.close()In this case it can also be shown formally that the process is not stationary. Now let us look at the corresponding ACF:
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(Zdf["Z2"], ax=ax0, lags=1000, zero=False, title='ACF for Sinusoidal with Normal Uniform Phase')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = Zdf["Z2"].plot(color="blue")
ax1.set_title('Original Time Series')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/3862686891.py:12: UserWarning: The figure layout has changed to tight
plt.tight_layout()
For the random walk (which we know for certain to be non stationary!) the resulting p-values confirm the non stationarity of the series.
RW_adf = adfuller(RWdf["RW"])
RW_kpss = kpss(RWdf["RW"])
RW_adf[1], RW_kpss[1]/tmp/ipykernel_10/596027656.py:2: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.
RW_kpss = kpss(RWdf["RW"])
(np.float64(0.7918327048062517), np.float64(0.01))
For the stationary sinusoidal (uniform random phase) the p-value of the ADF tells us to reject the null. This means that the series is either stationary or seasonal, the latter being true. But the KPSS test fails to reject the null, even though we know it to be false because of (strong) seasonality.
X_2_adf = adfuller(Xdf["X2"])
X_2_kpss = kpss(Xdf["X2"], nlags="legacy")
X_2_adf[1], X_2_kpss[1]/tmp/ipykernel_10/2546339701.py:2: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.
X_2_kpss = kpss(Xdf["X2"], nlags="legacy")
(0.0, np.float64(0.1))
In the case of the non stationary sinusoid (normal random phase) the ADF again rejects the null, but the KPSS does not reject it, even though the series is not stationary and it is seasonal.
Z_adf = adfuller(Zdf["Z2"] )
Z_kpss = kpss(Zdf["Z2"], nlags="legacy")
Z_adf[1], Z_kpss[1]/tmp/ipykernel_10/2005724078.py:2: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.
Z_kpss = kpss(Zdf["Z2"], nlags="legacy")
(0.0, np.float64(0.1))
# First regular difference: change from the previous month.
can_gas["Volume_d1"] = can_gas["Volume"].diff()
# First seasonal difference: change versus the same month one year earlier
# (monthly data -> period 12).
can_gas["Volume_d12"] = can_gas["Volume"].diff(periods=12)
can_gas.head(18)| Volume | Volume_d1 | Volume_d12 | |
|---|---|---|---|
| Month | |||
| 1960-01-01 | 1.431 | NaN | NaN |
| 1960-02-01 | 1.306 | -0.125 | NaN |
| 1960-03-01 | 1.402 | 0.096 | NaN |
| 1960-04-01 | 1.170 | -0.232 | NaN |
| 1960-05-01 | 1.116 | -0.054 | NaN |
| 1960-06-01 | 1.011 | -0.105 | NaN |
| 1960-07-01 | 0.966 | -0.045 | NaN |
| 1960-08-01 | 0.977 | 0.011 | NaN |
| 1960-09-01 | 1.031 | 0.054 | NaN |
| 1960-10-01 | 1.252 | 0.221 | NaN |
| 1960-11-01 | 1.442 | 0.190 | NaN |
| 1960-12-01 | 1.564 | 0.122 | NaN |
| 1961-01-01 | 1.745 | 0.181 | 0.314 |
| 1961-02-01 | 1.583 | -0.162 | 0.278 |
| 1961-03-01 | 1.677 | 0.094 | 0.275 |
| 1961-04-01 | 1.516 | -0.161 | 0.346 |
| 1961-05-01 | 1.405 | -0.110 | 0.289 |
| 1961-06-01 | 1.246 | -0.159 | 0.235 |
Let us plot the original time series and also the differenced ones.
fig, axs = plt.subplots(3, 1, figsize=(10,8), sharex=False, sharey=False)
ax0 = axs[0]
can_gas["Volume"].plot(color="blue", ax=ax0)
ax0.set_title('Canadian Monthly Gas Production')
ax1 = axs[1]
can_gas["Volume_d1"].plot(color="blue", ax=ax1)
ax1.set_title('First Regular Difference')
ax2 = axs[2]
can_gas["Volume_d12"].plot(color="blue", ax=ax2)
ax2.set_title('First Seasonal Difference')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/2050797953.py:15: UserWarning: The figure layout has changed to tight
plt.tight_layout()
ausprod = pd.read_csv('../4_1_Introduction_to_Forecasting/aus_production.csv')
ausprod| Quarter | Beer | Tobacco | Bricks | Cement | Electricity | Gas | |
|---|---|---|---|---|---|---|---|
| 0 | 1956 Q1 | 284 | 5225.0 | 189.0 | 465 | 3923 | 5 |
| 1 | 1956 Q2 | 213 | 5178.0 | 204.0 | 532 | 4436 | 6 |
| 2 | 1956 Q3 | 227 | 5297.0 | 208.0 | 561 | 4806 | 7 |
| 3 | 1956 Q4 | 308 | 5681.0 | 197.0 | 570 | 4418 | 6 |
| 4 | 1957 Q1 | 262 | 5577.0 | 187.0 | 529 | 4339 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 213 | 2009 Q2 | 398 | NaN | NaN | 2160 | 57471 | 238 |
| 214 | 2009 Q3 | 419 | NaN | NaN | 2325 | 58394 | 252 |
| 215 | 2009 Q4 | 488 | NaN | NaN | 2273 | 57336 | 210 |
| 216 | 2010 Q1 | 414 | NaN | NaN | 1904 | 58309 | 205 |
| 217 | 2010 Q2 | 374 | NaN | NaN | 2401 | 58041 | 236 |
218 rows × 7 columns
To parse the quarter data correctly we need to do a string replacement:
ausprod["Quarter"] = ausprod["Quarter"].str.replace(' ', '-')
ausprod.head()| Quarter | Beer | Tobacco | Bricks | Cement | Electricity | Gas | |
|---|---|---|---|---|---|---|---|
| 0 | 1956-Q1 | 284 | 5225.0 | 189.0 | 465 | 3923 | 5 |
| 1 | 1956-Q2 | 213 | 5178.0 | 204.0 | 532 | 4436 | 6 |
| 2 | 1956-Q3 | 227 | 5297.0 | 208.0 | 561 | 4806 | 7 |
| 3 | 1956-Q4 | 308 | 5681.0 | 197.0 | 570 | 4418 | 6 |
| 4 | 1957-Q1 | 262 | 5577.0 | 187.0 | 529 | 4339 | 5 |
ausprod["Date"] = pd.to_datetime(ausprod["Quarter"])
ausprod.head()/tmp/ipykernel_10/1995653433.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
ausprod["Date"] = pd.to_datetime(ausprod["Quarter"])
| Quarter | Beer | Tobacco | Bricks | Cement | Electricity | Gas | Date | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1956-Q1 | 284 | 5225.0 | 189.0 | 465 | 3923 | 5 | 1956-01-01 |
| 1 | 1956-Q2 | 213 | 5178.0 | 204.0 | 532 | 4436 | 6 | 1956-04-01 |
| 2 | 1956-Q3 | 227 | 5297.0 | 208.0 | 561 | 4806 | 7 | 1956-07-01 |
| 3 | 1956-Q4 | 308 | 5681.0 | 197.0 | 570 | 4418 | 6 | 1956-10-01 |
| 4 | 1957-Q1 | 262 | 5577.0 | 187.0 | 529 | 4339 | 5 | 1957-01-01 |
ausprod.set_index('Date', drop=True, inplace=True)
ausprod.head()| Quarter | Beer | Tobacco | Bricks | Cement | Electricity | Gas | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 1956-01-01 | 1956-Q1 | 284 | 5225.0 | 189.0 | 465 | 3923 | 5 |
| 1956-04-01 | 1956-Q2 | 213 | 5178.0 | 204.0 | 532 | 4436 | 6 |
| 1956-07-01 | 1956-Q3 | 227 | 5297.0 | 208.0 | 561 | 4806 | 7 |
| 1956-10-01 | 1956-Q4 | 308 | 5681.0 | 197.0 | 570 | 4418 | 6 |
| 1957-01-01 | 1957-Q1 | 262 | 5577.0 | 187.0 | 529 | 4339 | 5 |
We will focus on the time series corresponding to beer production. We can visualize the time series and its ACF. Both plots show a clear trend and seasonality with period 4.
df_ts = ausprod
var = "Beer"
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(df_ts[var], ax=ax0, lags=25, zero=False, title='ACF')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = df_ts[var].plot(color="blue")
ax1.set_title('Austalia Beer Production')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/4088216739.py:15: UserWarning: The figure layout has changed to tight
plt.tight_layout()
As described, we first take a seasonal difference:
ausprod["Beer_sd4"] = ausprod["Beer"].diff(periods=4)
ausprod.head(12)| Quarter | Beer | Tobacco | Bricks | Cement | Electricity | Gas | Beer_sd4 | |
|---|---|---|---|---|---|---|---|---|
| Date | ||||||||
| 1956-01-01 | 1956-Q1 | 284 | 5225.0 | 189.0 | 465 | 3923 | 5 | NaN |
| 1956-04-01 | 1956-Q2 | 213 | 5178.0 | 204.0 | 532 | 4436 | 6 | NaN |
| 1956-07-01 | 1956-Q3 | 227 | 5297.0 | 208.0 | 561 | 4806 | 7 | NaN |
| 1956-10-01 | 1956-Q4 | 308 | 5681.0 | 197.0 | 570 | 4418 | 6 | NaN |
| 1957-01-01 | 1957-Q1 | 262 | 5577.0 | 187.0 | 529 | 4339 | 5 | -22.0 |
| 1957-04-01 | 1957-Q2 | 228 | 5651.0 | 214.0 | 604 | 4811 | 7 | 15.0 |
| 1957-07-01 | 1957-Q3 | 236 | 5317.0 | 227.0 | 603 | 5259 | 7 | 9.0 |
| 1957-10-01 | 1957-Q4 | 320 | 6152.0 | 222.0 | 582 | 4735 | 6 | 12.0 |
| 1958-01-01 | 1958-Q1 | 272 | 5758.0 | 199.0 | 554 | 4608 | 5 | 10.0 |
| 1958-04-01 | 1958-Q2 | 233 | 5641.0 | 229.0 | 620 | 5196 | 7 | 5.0 |
| 1958-07-01 | 1958-Q3 | 237 | 5802.0 | 249.0 | 646 | 5609 | 8 | 1.0 |
| 1958-10-01 | 1958-Q4 | 313 | 6265.0 | 234.0 | 637 | 4977 | 6 | -7.0 |
Let us visualize the result. Note that we remove the missing data before plotting the ACF:
df_ts = ausprod
var = "Beer_sd4"
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(df_ts[var].dropna(), ax=ax0, lags=25, zero=False, title='ACF of Seasonally Differenced Series')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = df_ts[var].plot(color="blue")
ax1.set_title('Austalia Beer Production, after seasonal difference')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/213964748.py:15: UserWarning: The figure layout has changed to tight
plt.tight_layout()
The plots, the ACF in particular, shows that we have managed to remove seasonality. But since there is still some hint of a trend, we will now apply a regular difference (to the already seasonally differenced data) and plot the result again.
ausprod["Beer_sd4_d1"] = ausprod["Beer_sd4"].diff()
df_ts = ausprod
var = "Beer_sd4_d1"
fig, axs = plt.subplots(1, 2, figsize=(16,4), sharex=False, sharey=False)
ax0 = axs[0]
sm.graphics.tsa.plot_acf(df_ts[var].dropna(), ax=ax0, lags=25, zero=False, title='ACF of Seasonally and Regularly Differenced Series')
ax0.set(ylim=(-1,1), xlabel='Lag', ylabel='acf')
ax1 = axs[1]
ax1 = df_ts[var].plot(color="blue")
ax1.set_title('Austalia Beer Production, after Differencing (both seasonal and regular)')
ax1.grid(visible=True, which='both', axis='x')
ax1.grid(visible=False, which='Major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/1882162051.py:17: UserWarning: The figure layout has changed to tight
plt.tight_layout()
As we will see in the next session, this resulting ACF with only a few significant initial terms is a good starting point for modeling with SARIMA models. Let us check the stationarity
ausbeer_diff = ausprod["Beer_sd4_d1"].dropna()
ausbeer_adf = adfuller(ausbeer_diff)
ausbeer_kpss = kpss(ausbeer_diff)
ausbeer_adf[1], ausbeer_kpss[1]/tmp/ipykernel_10/882291498.py:3: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.
ausbeer_kpss = kpss(ausbeer_diff)
(np.float64(9.951622692226875e-11), np.float64(0.1))
ADF rejects the null of non-stationarity (and non seasonality) and KPSS does not reject the null of stationarity (and non seasonality). Therefore they agree and give us another confirmation that the differenced time series appears like the result of a stationary process.
pmd_ndiffs, pmd_nsdiffs = pmd.arima.ndiffs, pmd.arima.nsdiffs
print(f"Recommended number of regular differences: {pmd_ndiffs(ausprod['Beer'])}")
print(f"Recommended number of seasonal differences: {pmd_nsdiffs(ausprod['Beer'], 4)}") Recommended number of regular differences: 1
Recommended number of seasonal differences: 1
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
/opt/conda/envs/mlmiin/lib/python3.11/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
nsdiffs(ausprod['Beer'].values, period=4)1
Let us see a famous example of a time series whose variance increases with time: the airline passengers data. We will first plot the time series. The time plot clearly shows an increasing variance.
air_passengers = load_airline()
air_passengers.head()Period
1949-01 112.0
1949-02 118.0
1949-03 132.0
1949-04 129.0
1949-05 121.0
Freq: M, Name: Number of airline passengers, dtype: float64
air_passengers = pd.DataFrame(air_passengers)
air_passengers.head()| Number of airline passengers | |
|---|---|
| Period | |
| 1949-01 | 112.0 |
| 1949-02 | 118.0 |
| 1949-03 | 132.0 |
| 1949-04 | 129.0 |
| 1949-05 | 121.0 |
air_passengers.index.name = "ds"
air_passengers.rename(columns={"Number of airline passengers": "y"}, inplace=True)
air_passengers.plot()
plt.show();plt.close() We can get the best value of \(\lambda\) as follows. Recall that if this value is close to 1, then the transformation is essentially the identity and therefore the variance of the time series is already stable.
from coreforecast.scalers import boxcox, boxcox_lambda
air_passengers_lmbda = boxcox_lambda(air_passengers['y'], method='guerrero', season_length=12)
air_passengers_lmbda-0.45177626609802246
air_passengers['y_boxcox'] = boxcox(air_passengers['y'], lmbda=air_passengers_lmbda)
air_passengers.head()| y | y_boxcox | |
|---|---|---|
| ds | ||
| 1949-01 | 112.0 | 1.951 |
| 1949-02 | 118.0 | 1.957 |
| 1949-03 | 132.0 | 1.970 |
| 1949-04 | 129.0 | 1.967 |
| 1949-05 | 121.0 | 1.960 |
# Plot the original and Box-Cox transformed series in stacked subplots, so the
# effect of the variance stabilisation is easy to compare visually.
fig, axs = plt.subplots(2, 1, figsize=(8, 4), sharey=False, sharex=True)
air_passengers['y'].plot(label="Original", ax=axs[0])
air_passengers['y_boxcox'].plot(label="Box-Cox Transformed", ax=axs[1])
axs[0].legend()
axs[1].legend()
axs[0].set_title('Original Series')
axs[1].set_title('Box-Cox Transformed Series')
# Show only vertical (time) gridlines on both axes. Note: `which` must be
# lowercase 'major'/'minor'/'both' -- Matplotlib matches the string literally,
# so the previous which='Major' matched nothing and silently did nothing.
for ax in axs:
    ax.grid(visible=True, which='both', axis='x')
    ax.grid(visible=False, which='major', axis='y')
plt.tight_layout()
plt.show();plt.close()/tmp/ipykernel_10/3604504944.py:13: UserWarning: The figure layout has changed to tight
plt.tight_layout()
To apply the test we need to fit a linear regression model to the time series and then apply the test to the residuals. We will use OLS from statsmodels to fit the model. First we create a pandas dataframe with the time series and an integer time index, using standard names to make the code reusable.
import statsmodels.formula.api as smfNow we fit the linear regression
# air_passengers.reset_index(inplace=True)
# Integer time index: the regressor for the trend regression fitted below.
air_passengers["t"] = np.arange(len(air_passengers))
air_passengers.head()| y | y_boxcox | t | |
|---|---|---|---|
| ds | |||
| 1949-01 | 112.0 | 1.951 | 0 |
| 1949-02 | 118.0 | 1.957 | 1 |
| 1949-03 | 132.0 | 1.970 | 2 |
| 1949-04 | 129.0 | 1.967 | 3 |
| 1949-05 | 121.0 | 1.960 | 4 |
formula = 'y ~ t'
olsr = smf.ols(formula, air_passengers).fit()And we apply the Breusch-Pagan test to the residuals and extract the p-value:
import statsmodels.stats.api as sms

# het_breuschpagan returns (lm_stat, lm_pvalue, f_stat, f_pvalue);
# we only use the LM-test p-value below.
bp_lm_stat, breusch_pagan_p_value, bp_f_stat, bp_f_pvalue = sms.het_breuschpagan(
    olsr.resid, olsr.model.exog
)
breusch_pagan_p_value # If the p-value is less than 0.05, then the residuals are heteroskedasticnp.float64(4.559001856883249e-07)
As expected for the airline passengers time series, the p-value is very small, indicating that the variance is not constant. A way to understand this is to plot the residuals of the linear regression model. We will do this below.
sns.scatterplot(x = air_passengers['t'], y = np.abs(olsr.resid), label="OLS Residuals vs time")
plt.show();plt.close()The funnel-like shape of the residuals plot is a clear indication that the variance is not constant. There are other tests that can be used to check for this, but the Breusch-Pagan test is the most common one. Check this Medium post by V. Cerqueira for further details (Cerqueira is one of the authors of (Cerqueira and Roque 2024)). You can also play with the online interactive tool in Section 3.1 of Forecasting: Principles and Practice
This is (quote): A dataset containing the hourly pedestrian counts from 2015-01-01 to 2016-12-31 at 4 sensors in the city of Melbourne.
Check this link to the dataset for more information about the dataset. We will load the data and select one of the sensors (Bourke Street Mall (North)) to work with.
# Load the hourly pedestrian counts and keep only the Bourke Street Mall (North)
# sensor, dropping the now-constant Sensor column.
pedestrians = pd.read_csv(
    '../4_1_Introduction_to_Forecasting/tsdata/tsibble/pedestrian.csv'
)
is_mall_north = pedestrians["Sensor"] == "Bourke Street Mall (North)"
pedestrians = pedestrians.loc[is_mall_north].drop(columns=["Sensor"])
pedestrians| Date_Time | Date | Time | Count | |
|---|---|---|---|---|
| 14566 | 2015-02-16T13:00:00Z | 2015-02-17 | 0 | 61 |
| 14567 | 2015-02-16T14:00:00Z | 2015-02-17 | 1 | 16 |
| 14568 | 2015-02-16T15:00:00Z | 2015-02-17 | 2 | 15 |
| 14569 | 2015-02-16T16:00:00Z | 2015-02-17 | 3 | 17 |
| 14570 | 2015-02-16T17:00:00Z | 2015-02-17 | 4 | 9 |
| ... | ... | ... | ... | ... |
| 30975 | 2016-12-31T08:00:00Z | 2016-12-31 | 19 | 2467 |
| 30976 | 2016-12-31T09:00:00Z | 2016-12-31 | 20 | 1730 |
| 30977 | 2016-12-31T10:00:00Z | 2016-12-31 | 21 | 1342 |
| 30978 | 2016-12-31T11:00:00Z | 2016-12-31 | 22 | 1409 |
| 30979 | 2016-12-31T12:00:00Z | 2016-12-31 | 23 | 749 |
16414 rows × 4 columns
pedestrians["Date_Time"] = pd.to_datetime(pedestrians["Date_Time"])
pedestrians.info()<class 'pandas.core.frame.DataFrame'>
Index: 16414 entries, 14566 to 30979
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date_Time 16414 non-null datetime64[ns, UTC]
1 Date 16414 non-null object
2 Time 16414 non-null int64
3 Count 16414 non-null int64
dtypes: datetime64[ns, UTC](1), int64(2), object(1)
memory usage: 641.2+ KB
# Promote the timestamp to the index and drop the redundant Date/Time columns,
# leaving a single hourly Count series indexed by Date_Time.
pedestrians = pedestrians.set_index('Date_Time').drop(columns=["Date", "Time"])
pedestrians| Count | |
|---|---|
| Date_Time | |
| 2015-02-16 13:00:00+00:00 | 61 |
| 2015-02-16 14:00:00+00:00 | 16 |
| 2015-02-16 15:00:00+00:00 | 15 |
| 2015-02-16 16:00:00+00:00 | 17 |
| 2015-02-16 17:00:00+00:00 | 9 |
| ... | ... |
| 2016-12-31 08:00:00+00:00 | 2467 |
| 2016-12-31 09:00:00+00:00 | 1730 |
| 2016-12-31 10:00:00+00:00 | 1342 |
| 2016-12-31 11:00:00+00:00 | 1409 |
| 2016-12-31 12:00:00+00:00 | 749 |
16414 rows × 1 columns
Now suppose that we want to convert this hourly time series to a daily time series. We can do this with the resample method in pandas. We will use the sum method to aggregate the hourly counts into daily counts, because we are interested in the total number of pedestrians per day. We would get it like this:
pedestrians_daily = pedestrians.resample('D').sum()
pedestrians_daily| Count | |
|---|---|
| Date_Time | |
| 2015-02-16 00:00:00+00:00 | 3056 |
| 2015-02-17 00:00:00+00:00 | 23591 |
| 2015-02-18 00:00:00+00:00 | 25733 |
| 2015-02-19 00:00:00+00:00 | 30238 |
| 2015-02-20 00:00:00+00:00 | 33199 |
| ... | ... |
| 2016-12-27 00:00:00+00:00 | 39335 |
| 2016-12-28 00:00:00+00:00 | 38138 |
| 2016-12-29 00:00:00+00:00 | 28675 |
| 2016-12-30 00:00:00+00:00 | 38681 |
| 2016-12-31 00:00:00+00:00 | 31855 |
685 rows × 1 columns
# Each DataFrame.plot call draws on a fresh figure, so the hourly and daily
# series appear as two separate plots.
pedestrians.plot(label="Hourly")
plt.title("Hourly Pedestrian Counts")
pedestrians_daily.plot(label="Daily")
plt.title("Daily Pedestrian Counts")
plt.show();plt.close()You can see in this example how the daily resampling operation allows us to more clearly identify seasonality patterns in the time series.
# Monthly NOAA global temperature anomalies; -9999 encodes missing values
# (the last months of 2024 show up as NaN).
NOAA_global_temp = pd.read_csv("../4_1_Introduction_to_Forecasting/noaa.globaltmp.comb.csv", parse_dates=True, index_col=0, na_values=-9999.)
NOAA_global_temp.columns = ['Temp_Anomaly']
NOAA_global_temp
| Temp_Anomaly | |
|---|---|
| Date | |
| 1850-01-01 | -0.775 |
| 1850-02-01 | -0.551 |
| 1850-03-01 | -0.566 |
| 1850-04-01 | -0.679 |
| 1850-05-01 | -0.609 |
| ... | ... |
| 2024-08-01 | 0.972 |
| 2024-09-01 | 0.964 |
| 2024-10-01 | 1.063 |
| 2024-11-01 | NaN |
| 2024-12-01 | NaN |
2100 rows × 1 columns
Now let us apply the resampling operation:
NOAA_global_temp_yearly = NOAA_global_temp.resample('Y').mean()
NOAA_global_temp_yearly/tmp/ipykernel_10/1696086621.py:1: FutureWarning: 'Y' is deprecated and will be removed in a future version, please use 'YE' instead.
NOAA_global_temp_yearly = NOAA_global_temp.resample('Y').mean()
| Temp_Anomaly | |
|---|---|
| Date | |
| 1850-12-31 | -0.501 |
| 1851-12-31 | -0.395 |
| 1852-12-31 | -0.357 |
| 1853-12-31 | -0.412 |
| 1854-12-31 | -0.372 |
| ... | ... |
| 2020-12-31 | 0.705 |
| 2021-12-31 | 0.553 |
| 2022-12-31 | 0.585 |
| 2023-12-31 | 0.875 |
| 2024-12-31 | 0.969 |
175 rows × 1 columns
# Plot the monthly and yearly anomaly series (separate figures) to contrast
# month-to-month noise with the long-run trend.
NOAA_global_temp.plot()
plt.title("Monthly Global Temperature Anomalies")
NOAA_global_temp_yearly.plot()
plt.title("Yearly Global Temperature Anomalies")
plt.show();plt.close() Note again in this example how resampling can help reveal the long-term trends and patterns in the time series. For example it is well known that the “El Niño” phenomenon is associated with a rise in sea surface temperatures. Can you spot this in the yearly time series?