# Sourcing the file wich contains the functions to find the best parameters with regard to AIC and BIC
source("hyperparameters-selection.R")
# Loading the best hyperparameters for our model we previously found
load("hyperparameters.RData")Airport traffic
Applied
Data import
This dataset includes the monthly number of passengers from 1998 to 2023 in different european airports.
# Global data
traffic <- openxlsx::read.xlsx(xlsxFile="datasets/data_airports_APP.xlsx")
# Simplified label
traffic <- traffic %>% dplyr::rename("Airport" = "REP_AIRP.(Labels)")This dataset associates the airport with its country.
# Country
airports_names<- read.csv("datasets/airports_by_country.csv")
airports_names$Airport <- paste(airports_names$Airport, "airport", sep = " ")
airports_names <- airports_names %>% mutate(Country = ifelse(Country == "Chile", "Spain", Country))
Pre-processing
We choose to keep the data from 2002 to 2023. Before 2022 we have only few data available and it seems not interesting for our study.
# Selection
names_col <- names(traffic)
selected_col <- c(names_col[1], names_col[50:length(names_col)])
traffic <- traffic %>% dplyr::select(all_of(selected_col))
Here we aggregate the country associated to each airport. We will need them for our analysis and to create our segmentation by country afterward.
# Merged
traffic_mg <- merge(traffic, airports_names, by = "Airport", all.x = TRUE)
We first check if in our data we have some duplicated lines.
airports_dupli <- duplicated(traffic_mg)
length(traffic_mg[airports_dupli,])[1] 263
And then apply unique.
# Duplicates erase
traffic_mg <- unique(traffic_mg)
We check if some of the airports are not associated with a country in our dataset airports_names.
# Checking
airports_without_country <- traffic_mg[is.na(traffic_mg$Country), ]
as.vector(airports_without_country$Airport)[1] "MURCIA/AEROPUERTO DE LA REGION DE MURCIA AIRPORT"
[2] "RIZE-ARTVIN"
[3] "STRASBOURG airport System"
[4] "Unknown airport - FINLAND"
[5] "Unknown airport - NORWAY"
[6] "Unknown airport - POLAND"
[7] "Unknown airport - SLOVAKIA"
[8] "Unknown airport - SWITZERLAND"
Our dataset gives a report of the number of passengers carried by the airports each month starting in January of 1998 to september of 2023.
We there modify our dataset structure to prevent issues with pivot_longer.
# Modification
traffic_pivot <- tidyr::pivot_longer(traffic_mg, cols = -c("Airport", "Country"), names_to = "Date", values_to = "Passengers")
# Managing Nan
traffic_pivot$Passengers[traffic_pivot$Passengers == ":"] <- 0
# Numerical values
traffic_pivot$Passengers <- as.numeric(traffic_pivot$Passengers)
# Date
traffic_pivot$Date <- zoo::as.Date(paste0(traffic_pivot$Date, "-01"), format="%Y-%m-%d")Selection of the most relevant european airports
For this, our goal is to keep at least one airport by country. To do so, we will focus on the airports with the most attendace in every country.
First, we sum the total number of passengers between 2002 and 2023 :
# Sum
traffic_sum <- traffic_pivot %>% group_by(Airport, Country) %>% summarise(sumPassengers = sum(Passengers))`summarise()` has grouped output by 'Airport'. You can override using the
`.groups` argument.
Then, we select the most relevant airport of every country :
# Selection
airports_best_ranked <- traffic_sum %>% group_by(Country) %>% slice_max(order_by = sumPassengers)For 3 countries we have no data. Therefore, we delete them. At the same time, we also erase some territories of no interest and issues. For that we previously checked that in our list we do not have any big airport that could be pertinent.
# Erase
airports_best_ranked <- airports_best_ranked %>% filter(sumPassengers != 0)
list_countries = c("Faroe Islands (Denmark)", "Fictional/Private", "French Guiana",
"Guadeloupe (France)", "Martinique (France)", "Mayotte (France)", "Reunion (France)",
"Saint Barthelemy (France)", "Saint Martin (France)", "Svalbard (Norway)", NA)
airports_best_ranked <- airports_best_ranked %>% filter(!(Country %in% list_countries))Final dataset
Here is our dataset that we will use from now on to build our model and make our analysis.
# Final dataset
airports_final_list <- unique(airports_best_ranked$Airport)
traffic_checked <- traffic_pivot %>% filter(Airport %in% airports_final_list)
traffic_checked <- traffic_checked[traffic_checked$Date <= as.Date("2023-05-01"),]Plot of our data
Overview
Here we visualize how is the general tendance with all our airports.
Ranking on the total airport traffic
We ranked our countries by their total passenger traffic. This will help us to make an analysis on the most relevant airport of our dataset.
ggplot2::ggplot(airports_best_ranked, aes(x = sumPassengers, y = reorder(Country, sumPassengers))) +
geom_bar(stat = "identity", fill = "green") +
labs(title = "Ranking of Countries based on their traffic",
x = "Total passengers carried",
y = "Country")Global trend on our data
Here we plot the mean between most relevant airports (One for each european country).
Here we can see the trend of the 5 airports we kept for our analysis.
Focus
Quick view of our 5 airports.
PARIS-CHARLES DE GAULLE airport
paris <- traffic_checked %>% dplyr::filter(Airport == "PARIS-CHARLES DE GAULLE airport")Formating the dataset as a time serie variable
# Time serie function
paris_ts <- paris %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2023, 5))ADOLFO SUAREZ MADRID-BARAJAS airport
madrid <- traffic_checked %>% dplyr::filter(Airport == "ADOLFO SUAREZ MADRID-BARAJAS airport")Formating the dataset as a time serie variable
# Time serie function
madrid_ts <- madrid %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2023, 5))ROMA/FIUMICINO airport
roma <- traffic_checked %>% dplyr::filter(Airport == "ROMA/FIUMICINO airport")Formating the dataset as a time serie variable
# Time serie function
roma_ts <- roma %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2023, 5))KOBENHAVN/KASTRUP airport
copenhagen <- traffic_checked %>% dplyr::filter(Airport == "KOBENHAVN/KASTRUP airport")Formating the dataset as a time serie variable
# Time serie function
copenhagen_ts <- copenhagen %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2023, 5))OSLO/GARDERMOEN airport
oslo <- traffic_checked %>% dplyr::filter(Airport == "OSLO/GARDERMOEN airport")Formating the dataset as a time serie variable
# Time serie function
oslo_ts <- oslo %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2023, 5))Our Time Series model
First of all, we want to keep the data only until the end of 2019, period before COVID impact if we refer to our plots.
# Paris
paris_ts19 <- paris %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2019, 12))
paris_ts19 <- na.omit(paris_ts19)
# Madrid
madrid_ts19 <- madrid %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2019, 12))
madrid_ts19 <- na.omit(madrid_ts19)
# Roma
roma_ts19 <- roma %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2019, 12))
roma_ts19 <- na.omit(roma_ts19)
# Copenhagen
copenhagen_ts19 <- copenhagen %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2019, 12))
copenhagen_ts19 <- na.omit(copenhagen_ts19)
# Oslo
oslo_ts19 <- oslo %>% dplyr::select(4) %>% ts(frequency = 12, start = c(2002, 1), end = c(2019, 12))
oslo_ts19 <- na.omit(oslo_ts19)Study on our data
We will study the stationarity of our time series and then we will build our SARIMA model.
Stationarity and parameters
As first though, we saw previously on the graph of the Paris Charles de Gaulle airport that there is an ascending trend. This violates the assumption of same mean at all time in the stationarity properties. Also, distinct seasonal patterns are present which is also a violation of the previous requirement. We thus already think that our time series are not stationary.
The positive trend seems to be linear which suggests that a first difference could be sufficient to detrend it. In the case of curve we would have preferred to previously transform our data and then make a first difference. But before doing a first difference we have to take the seasonality into account. If after the seasonal differencing the trend remains, then we will apply a forst difference.
We will verify it as it follows.
First, we study the non-seasonal behavior. It is likely that the short run non-seasonal components will contribute to the model. We thus take a look at the ACF and PACF behavior under the seasonality lag (12) to assess what non-seasonal components might be.
No differencing
We start with analysing our time series without previous differencing.
# Trend
ts_decomposed <- decompose(paris_ts19)
# Plot
plot(paris_ts19, col = 'blue', xlab = "Year", ylab = "Passengers",
main = "CDG passengers before seasonal differencing")
lines(ts_decomposed$trend, col = 'red')
legend("topright", legend = c("Original", "Trend"), col = c("blue", "red"),
lty = c(1, 1), cex = 0.8)On both ACF and PACF graphs, the blue dashed lines represent values beyond which the ACF and PACF are significantly different from zero at \(5\%\) level. These lines represent the bounds of the \(95\%\) confidence intervals. A bar above are under these lines would suppose a correlation. On the contrary, a bar between these two lines would suppose zero correlation.
We test whether our time series is stationary or not with the Augmented Dickey-Fuller Test.
\(H_0 : Unit root\) \(H_1 : Stationary\)
adf_test <- adf.test(paris_ts19, alternative = "stationary")Warning in adf.test(paris_ts19, alternative = "stationary"): p-value smaller
than printed p-value
# Results of the ADF test
print(adf_test)
Augmented Dickey-Fuller Test
data: paris_ts19
Dickey-Fuller = -11.967, Lag order = 5, p-value = 0.01
alternative hypothesis: stationary
Our result with a \(p-value = 0.01 < 0.05\) let us suppose that there is stationarity. We success to reject the null hypothesis that states a unit root (non-stationarity) in the time serie.
This let us suppose that we could need to differenciate paris_ts19.
ACF for Autocorrelation Function
This graph is useful to identify the number of MA(q) (Moving Average) terms in our model.
We will interpret it as the following :
- If the ACF shows a slow exponential or sinusoidal decay, it could suggests an AR process.
- If the ACF cuts off after a specific delay (lag), it could suggests an AM process.
The ACF shows a gradual decline but with several significant spikes at various lags. There are significant lags at intervals that could suggest a seasonal pattern, which is in line with our year seasonality. Our data frequency is monthly, and these intervals correspond to the number 12.
The fact that there are significant autocorrelations at multiple lags might also suggest that our data is not stationary, and differencing (either seasonal or non-seasonal) might be required to achieve stationarity.
PACF for Partial Autocorrelation Function
This graph is useful to identify the number of AR(p) (AutoRegressive) terms in our model.
We will interpret it as the following :
- If the PACF shows a slow exponential or sinusoidal decay, it could indicates an MA process.
- If the PACF cuts off after a certain delay, it could indicates an AR process.
The PACF shows a significant spike at lag 1 and then cuts off, which indicates that a non-seasonal AR(1) component may be present in our time series. The other lags do not appear to be significantly different from zero, suggesting that no higher-order AR terms are needed.
Seasonal differencing
We will take the seasonality into account, which means that we make a difference at lag 12 because of our monthly data.
# Difference
paris_ts19_diff <- diff(paris_ts19, lag = 12)
# Trend
ts_stl <- stl(paris_ts19_diff, s.window = "periodic")
# Plot
plot(paris_ts19_diff, col = 'blue', xlab = "Year", ylab = "Passengers",
main = "CDG passengers after seasonal differencing")
lines(ts_stl$time.series[, "trend"], col = 'red')
legend("topright", legend = c("Original", "Trend"), col = c("blue", "red"),
lty = c(1, 1), cex = 0.8)After differencing, we check again for stationarity. Here, taking a look at the graph, it seems that there is no remaining trend. To validate our proposal, we run the ADF test with the ACF and PACF graphs and look for stationarity.
\(H_0 : Unit root\) \(H_1 : Stationary\)
adf_test2 <- adf.test(paris_ts19_diff, alternative = "stationary")
# Results of the ADF test
print(adf_test2)
Augmented Dickey-Fuller Test
data: paris_ts19_diff
Dickey-Fuller = -3.5432, Lag order = 5, p-value = 0.03996
alternative hypothesis: stationary
Again, we reject the null hypothesis at a \(5\%\) significance level. Which means that we still have stationarity in our time series.
Modeling
PARIS
Automated parameters choice
R has an automated function that can help us to build our model. We thus run it to get the suggested coefficients with : auto.arima.
# Get suggestion
paris_ts19_auto_params <- forecast::auto.arima(paris_ts19)Which gives us the following parameters : (1,0,1)x(0,1,1)[12]
But we are not gonna use it to build our SARIMA model for each airport. We will prefer the AIC and BIC criteria to choose the best parameters.
Manual parameters choice
# AIC parameters
paris_ts19_aic_params <- find_aic_params(paris_ts19)
# do not forget to save this new variable when you run this commandWith the Aikaike Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (2,1,1)x(0,1,1)[12]
# AIC parameters
paris_ts19_bic_params <- find_bic_params(paris_ts19)
# do not forget to save this new variable when you run this commandWith the Bayesian Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (0,1,1)x(0,1,1)[12]
MADRID
# AIC parameters
madrid_ts19_aic_params <- find_aic_params(madrid_ts19)
# do not forget to save this new variable when you run this commandWith the Aikaike Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (2,1,2)x(0,1,1)[12]
# AIC parameters
madrid_ts19_bic_params <- find_bic_params(madrid_ts19)
# do not forget to save this new variable when you run this commandWith the Bayesian Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (0,1,1)x(0,1,1)[12]
ROMA
# AIC parameters
roma_ts19_aic_params <- find_aic_params(roma_ts19)
# do not forget to save this new variable when you run this commandWith the Aikaike Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (1,1,2)x(0,1,1)[12]
# AIC parameters
roma_ts19_bic_params <- find_bic_params(roma_ts19)
# do not forget to save this new variable when you run this commandWith the Bayesian Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (0,1,1)x(0,1,1)[12]
COPENHAGEN
# AIC parameters
copenhagen_ts19_aic_params <- find_aic_params(copenhagen_ts19)
# do not forget to save this new variable when you run this commandWith the Aikaike Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (2,1,1)x(1,1,2)[12]
# AIC parameters
copenhagen_ts19_bic_params <- find_bic_params(copenhagen_ts19)
# do not forget to save this new variable when you run this commandWith the Bayesian Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (1,1,0)x(2,1,0)[12]
OSLO
# AIC parameters
oslo_ts19_aic_params <- find_aic_params(oslo_ts19)
# do not forget to save this new variable when you run this commandWith the Aikaike Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (0,1,1)x(2,1,2)[12]
# AIC parameters
oslo_ts19_bic_params <- find_bic_params(oslo_ts19)
# do not forget to save this new variable when you run this commandWith the Bayesian Information Criterion the goal is to minimize this criterion with the different parameters of our model.
The parameters obtained are the following : (0,1,1)x(0,1,1)[12]
SARIMA
Model for PARIS
Parameters with AIC
Parameters used : (2,1,1)x(0,1,1)[12]
# Setting the parameters
non_seasonal_order <- paris_ts19_aic_params$min_AIC_params[1:3]
seasonal_order <- paris_ts19_aic_params$min_AIC_params[4:6]
# Fitting the model
sarima_aic_paris <- Arima(paris_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
Ljung-Box test
data: Residuals from ARIMA(2,1,1)(0,1,1)[12]
Q* = 17.925, df = 20, p-value = 0.5924
Model df: 4. Total lags used: 24
The Ljung-Box test, \(H_0 : Independant\), \(H_1 : Not Independant\)
If we get \(p-value < 0.05\), we reject the null hypothesis. This suggests that the error has a significant autocorrelation at some of the lags used in the test (up to lag 24 here). Then our model would not capture adequately the time-dependency. Here, we have \(p-value = 0.5923651 > 0.5\).
The residuals time series plot, helps us to check if our residuals look like white noise, indicating that our model has captured the underlying process accurately and suggesting that the error variances are consistent over time—a condition known as homoskedasticity. In our case, the residuals fluctuate around zero without displaying any obvious patterns or systematic change in the spread over time, which is a good sign both for the model’s fit and the assumption of homoskedasticity.
The ACF of residuals, helps us to check the autocorrelation within our residuals. In a good model we would expect the bars to be between both dashed blue lines. In our case, we see some of the bars outside the confidence intervals. Thus, there may be some autocorrelation in the residuals that our model does not capture.
The histogram and density plot, shows the distribution of the residuals along with the density curve of the normal distribution for comparison. By assumption, our residuals should look like a normal distribution with a mean of zero. Here, we have large pikes around zero, but also some big deviations around. Our residuals may not be normally distributed.
# Prediction of the 12 next months
forecasts_aic_paris <- forecast(sarima_aic_paris, h=41)
#str(forecasts)
plot(forecasts_aic_paris)# Information on the model
sarima_aic_parisSeries: paris_ts19
ARIMA(2,1,1)(0,1,1)[12]
Coefficients:
ar1 ar2 ma1 sma1
0.1507 0.2023 -0.8253 -0.7307
s.e. 0.1080 0.0918 0.0741 0.0540
sigma^2 = 2.819e+10: log likelihood = -2733.31
AIC=5476.62 AICc=5476.92 BIC=5493.18
We first plot the forecasted data and the actual data to see the difference.
Then we plot the difference between the forecasted and actual values.
The we divide this difference by the forecasted value to see the scaled difference.
Parameters with BIC
Parameters used : (0,1,1)x(0,1,1)[12]
# Setting the parameters
non_seasonal_order <- paris_ts19_bic_params$min_BIC_params[1:3]
seasonal_order <- paris_ts19_bic_params$min_BIC_params[4:6]
# Fitting the model
sarima_bic_paris <- Arima(paris_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_bic_paris)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)(0,1,1)[12]
Q* = 23.708, df = 22, p-value = 0.3627
Model df: 2. Total lags used: 24
# Prediction of the 12 next months
forecasts_bic_paris <- forecast(sarima_bic_paris, h=41)
#str(forecasts)
plot(forecasts_bic_paris)# Information on the model
sarima_bic_parisSeries: paris_ts19
ARIMA(0,1,1)(0,1,1)[12]
Coefficients:
ma1 sma1
-0.6740 -0.7099
s.e. 0.0599 0.0535
sigma^2 = 2.867e+10: log likelihood = -2735.58
AIC=5477.17 AICc=5477.29 BIC=5487.11
Model for MADRID
Parameters with AIC
Parameters used : (2,1,2)x(0,1,1)[12]
# Setting the parameters
non_seasonal_order <- madrid_ts19_aic_params$min_AIC_params[1:3]
seasonal_order <- madrid_ts19_aic_params$min_AIC_params[4:6]
# Fitting the model
sarima_aic_madrid <- Arima(madrid_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_aic_madrid)
Ljung-Box test
data: Residuals from ARIMA(2,1,2)(0,1,1)[12]
Q* = 25.384, df = 19, p-value = 0.1483
Model df: 5. Total lags used: 24
# Prediction of the 12 next months
forecasts_aic_madrid <- forecast(sarima_aic_madrid, h=41)
#str(forecasts)
plot(forecasts_aic_madrid)# Information on the model
sarima_aic_madridSeries: madrid_ts19
ARIMA(2,1,2)(0,1,1)[12]
Coefficients:
ar1 ar2 ma1 ma2 sma1
0.7523 -0.4368 -1.0757 0.7115 -0.6635
s.e. 0.1809 0.1303 0.1529 0.0975 0.0582
sigma^2 = 1.135e+10: log likelihood = -2639.1
AIC=5290.21 AICc=5290.64 BIC=5310.09
We take the same steps as for Paris CDG
Parameters with BIC
Parameters used : (0,1,1)x(0,1,1)[12]
# Setting the parameters
non_seasonal_order <- madrid_ts19_bic_params$min_BIC_params[1:3]
seasonal_order <- madrid_ts19_bic_params$min_BIC_params[4:6]
# Fitting the model
sarima_bic_madrid <- Arima(madrid_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_bic_madrid)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)(0,1,1)[12]
Q* = 36.454, df = 22, p-value = 0.02713
Model df: 2. Total lags used: 24
# Prediction of the 12 next months
forecasts_bic_madrid <- forecast(sarima_bic_madrid, h=41)
#str(forecasts)
plot(forecasts_bic_madrid)# Information on the model
sarima_bic_madridSeries: madrid_ts19
ARIMA(0,1,1)(0,1,1)[12]
Coefficients:
ma1 sma1
-0.2692 -0.6648
s.e. 0.0628 0.0588
sigma^2 = 1.188e+10: log likelihood = -2645.17
AIC=5296.34 AICc=5296.46 BIC=5306.28
Model for ROMA
Parameters with AIC
Parameters used : (1,1,2)x(0,1,1)[12]
# Setting the parameters
non_seasonal_order <- roma_ts19_aic_params$min_AIC_params[1:3]
seasonal_order <- roma_ts19_aic_params$min_AIC_params[4:6]
# Fitting the model
sarima_aic_roma <- Arima(roma_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_aic_roma)
Ljung-Box test
data: Residuals from ARIMA(1,1,2)(0,1,1)[12]
Q* = 11.9, df = 20, p-value = 0.9195
Model df: 4. Total lags used: 24
# Prediction of the 12 next months
forecasts_aic_roma <- forecast(sarima_aic_roma, h=41)
#str(forecasts)
plot(forecasts_aic_roma)# Information on the model
sarima_aic_romaSeries: roma_ts19
ARIMA(1,1,2)(0,1,1)[12]
Coefficients:
ar1 ma1 ma2 sma1
0.6339 -1.3982 0.4176 -0.7427
s.e. 0.2098 0.2391 0.2229 0.0557
sigma^2 = 6.251e+10: log likelihood = -2815.63
AIC=5641.27 AICc=5641.57 BIC=5657.84
We take the same steps as for Paris CDG
Parameters with BIC
Parameters used : (0,1,1)x(0,1,1)[12]
# Setting the parameters
non_seasonal_order <- roma_ts19_bic_params$min_BIC_params[1:3]
seasonal_order <- roma_ts19_bic_params$min_BIC_params[4:6]
# Fitting the model
sarima_bic_roma <- Arima(roma_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_bic_roma)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)(0,1,1)[12]
Q* = 22.391, df = 22, p-value = 0.4368
Model df: 2. Total lags used: 24
# Prediction of the 12 next months
forecasts_bic_roma <- forecast(sarima_bic_roma, h=41)
#str(forecasts)
plot(forecasts_bic_roma)# Information on the model
sarima_bic_romaSeries: roma_ts19
ARIMA(0,1,1)(0,1,1)[12]
Coefficients:
ma1 sma1
-0.8562 -0.7206
s.e. 0.0610 0.0504
sigma^2 = 6.486e+10: log likelihood = -2819.11
AIC=5644.22 AICc=5644.34 BIC=5654.16
Model for COPENHAGEN
Parameters with AIC
Parameters used : (2,1,1)x(1,1,2)[12]
# Setting the parameters
non_seasonal_order <- copenhagen_ts19_aic_params$min_AIC_params[1:3]
seasonal_order <- copenhagen_ts19_aic_params$min_AIC_params[4:6]
# Fitting the model
sarima_aic_copenhagen <- Arima(copenhagen_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_aic_copenhagen)
Ljung-Box test
data: Residuals from ARIMA(2,1,1)(1,1,2)[12]
Q* = 19.215, df = 18, p-value = 0.3787
Model df: 6. Total lags used: 24
# Prediction of the 12 next months
sarima_aic_copenhagen <- forecast(sarima_aic_copenhagen, h=41)
#str(forecasts)
plot(sarima_aic_copenhagen)# Information on the model
sarima_aic_copenhagen Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
Jan 2020 1964826 1884155 2045497 1841450 2088201
Feb 2020 2002225 1912100 2092351 1864390 2140060
Mar 2020 2377801 2273255 2482348 2217911 2537692
Apr 2020 2536481 2423497 2649464 2363688 2709274
May 2020 2730314 2609415 2851214 2545414 2915215
Jun 2020 2965301 2838269 3092334 2771022 3159581
Jul 2020 3182616 3050285 3314948 2980232 3385000
Aug 2020 2970399 2833649 3107148 2761258 3179539
Sep 2020 2853102 2712563 2993640 2638166 3068037
Oct 2020 2797853 2654077 2941628 2577967 3017739
Nov 2020 2251896 2105329 2398463 2027741 2476051
Dec 2020 2135961 1986983 2284939 1908119 2363803
Jan 2021 2050543 1892876 2208211 1809412 2291675
Feb 2021 2084071 1922180 2245961 1836481 2331660
Mar 2021 2465582 2298937 2632227 2210721 2720443
Apr 2021 2633650 2463400 2803900 2373275 2894024
May 2021 2824912 2651364 2998460 2559493 3090331
Jun 2021 3065677 2889349 3242006 2796007 3335348
Jul 2021 3287818 3109043 3466593 3014405 3561231
Aug 2021 3071386 2890490 3252282 2794729 3348043
Sep 2021 2947482 2764726 3130238 2667981 3226983
Oct 2021 2884481 2700098 3068865 2602491 3166471
Nov 2021 2320577 2134762 2506392 2036397 2604757
Dec 2021 2207182 2020108 2394256 1921077 2493287
Jan 2022 2116244 1921910 2310578 1819035 2413453
Feb 2022 2149904 1952395 2347412 1847841 2451966
Mar 2022 2536129 2334751 2737507 2228147 2844110
Apr 2022 2713218 2508951 2917485 2400819 3025617
May 2022 2902902 2695928 3109875 2586363 3219440
Jun 2022 3149227 2939964 3358490 2829187 3469267
Jul 2022 3375967 3164665 3587268 3052808 3699125
Aug 2022 3156673 2943592 3369754 2830793 3482553
Sep 2022 3027864 2813209 3242519 2699577 3356150
Oct 2022 2958985 2742942 3175027 2628576 3289393
Nov 2022 2380735 2163462 2598008 2048445 2713025
Dec 2022 2269895 2051534 2488256 1935940 2603849
Jan 2023 2174819 1949016 2400622 1829483 2520155
Feb 2023 2208983 1980035 2437932 1858837 2559130
Mar 2023 2599473 2366589 2832358 2243307 2955640
Apr 2023 2784362 2548552 3020172 2423722 3145002
May 2023 2973061 2734486 3211635 2608193 3337928
We take the same steps as for Paris CDG
Parameters with BIC
Parameters used : (1,1,0)x(2,1,0)[12]
# Setting the parameters
non_seasonal_order <- copenhagen_ts19_bic_params$min_BIC_params[1:3]
seasonal_order <- copenhagen_ts19_bic_params$min_BIC_params[4:6]
# Fitting the model
sarima_bic_copenhagen <- Arima(copenhagen_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_bic_copenhagen)
Ljung-Box test
data: Residuals from ARIMA(1,1,0)(2,1,0)[12]
Q* = 22.164, df = 21, p-value = 0.3901
Model df: 3. Total lags used: 24
# Prediction of the 12 next months
forecasts_bic_copenhagen <- forecast(sarima_bic_copenhagen, h=41)
#str(forecasts)
plot(forecasts_bic_copenhagen)# Information on the model
sarima_bic_copenhagenSeries: copenhagen_ts19
ARIMA(1,1,0)(2,1,0)[12]
Coefficients:
ar1 sar1 sar2
-0.4463 -0.6536 -0.3239
s.e. 0.0633 0.0652 0.0645
sigma^2 = 4.101e+09: log likelihood = -2536.29
AIC=5080.59 AICc=5080.79 BIC=5093.84
Model for OSLO
Parameters with AIC
Parameters used : (0,1,1)x(2,1,2)[12]
# Setting the parameters
non_seasonal_order <- oslo_ts19_aic_params$min_AIC_params[1:3]
seasonal_order <- oslo_ts19_aic_params$min_AIC_params[4:6]
# Fitting the model
sarima_aic_oslo <- Arima(oslo_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_aic_oslo)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)(2,1,2)[12]
Q* = 21.767, df = 19, p-value = 0.296
Model df: 5. Total lags used: 24
# Prediction of the 12 next months
forecasts_aic_oslo <- forecast(sarima_aic_oslo, h=41)
#str(forecasts)
plot(forecasts_aic_oslo)# Information on the model
sarima_aic_osloSeries: oslo_ts19
ARIMA(0,1,1)(2,1,2)[12]
Coefficients:
ma1 sar1 sar2 sma1 sma2
-0.6078 1.0119 -0.1696 -1.6701 0.7824
s.e. 0.0603 0.1506 0.1242 0.1562 0.1080
sigma^2 = 4.04e+09: log likelihood = -2537.15
AIC=5086.3 AICc=5086.73 BIC=5106.18
We take the same steps as for Paris CDG
Parameters with BIC
Parameters used : (0,1,1)x(0,1,1)[12]
# Setting the parameters
non_seasonal_order <- oslo_ts19_bic_params$min_BIC_params[1:3]
seasonal_order <- oslo_ts19_bic_params$min_BIC_params[4:6]
# Fitting the model
sarima_bic_oslo <- Arima(oslo_ts19, order=non_seasonal_order, seasonal=list(order=seasonal_order, period=12))Interpretation of the residuals :
checkresiduals(sarima_bic_oslo)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)(0,1,1)[12]
Q* = 29.691, df = 22, p-value = 0.1262
Model df: 2. Total lags used: 24
# Prediction of the 12 next months
forecasts_bic_oslo <- forecast(sarima_bic_oslo, h=41)
#str(forecasts)
plot(forecasts_bic_oslo)# Information on the model
sarima_bic_osloSeries: oslo_ts19
ARIMA(0,1,1)(0,1,1)[12]
Coefficients:
ma1 sma1
-0.5858 -0.6006
s.e. 0.0624 0.0565
sigma^2 = 4.312e+09: log likelihood = -2541.67
AIC=5089.34 AICc=5089.46 BIC=5099.28
Comparing the difference
The scale differences are displayed on the same plot for comparison.
# Combine all forecast data
combined_data_all <- bind_rows(
mutate(forecast_data_paris, City = "Paris"),
mutate(forecast_data_madrid, City = "Madrid"),
mutate(forecast_data_roma, City = "Roma"),
mutate(forecast_data_copenhagen, City = "Copenhagen"),
mutate(forecast_data_oslo, City = "Oslo")
)
# Plot using ggplot2
ggplot(data = combined_data_all, aes(x = time, y = scalediff, color = City)) +
geom_line() +
labs(title = "Scaled difference between Forecasted and Actual Values",
x = "Time", y = "Difference") +
theme_minimal()SARIMAX
Policies
We add 3 politics to the dataset :
- Borders main EU Period
- Border non-EU Period
- Negative tests Period
We add them as dummies over the same period of our time series. We put a 1 when the policy is applied, and 0 when it is not. We did it by hand on Excel, it was simplier than with R.
Model
Integrating the COVID effect to our SARIMAX model
# Creating the CovidDummy
paris$CovidDummy <- ifelse(paris$Date >= as.Date("2020-03-01") & paris$Date <= as.Date("2020-12-01"), 1, 0)
madrid$CovidDummy <- ifelse(madrid$Date >= as.Date("2020-03-01") & madrid$Date <= as.Date("2020-12-01"), 1, 0)
roma$CovidDummy <- ifelse(roma$Date >= as.Date("2020-03-01") & roma$Date <= as.Date("2020-12-01"), 1, 0)
copenhagen$CovidDummy <- ifelse(copenhagen$Date >= as.Date("2020-03-01") & copenhagen$Date <= as.Date("2020-12-01"), 1, 0)
oslo$CovidDummy <- ifelse(oslo$Date >= as.Date("2020-03-01") & oslo$Date <= as.Date("2020-12-01"), 1, 0)## Models fitting ##
# Paris
sarimax_paris <- Arima(paris_ts, order=paris_auto_params[1:3], seasonal=paris_auto_params[4:6], xreg=paris_covid)
# Madrid
sarimax_madrid <- Arima(madrid_ts, order=madrid_auto_params[1:3], seasonal=madrid_auto_params[4:6], xreg=madrid_covid)
# Roma
sarimax_roma <- Arima(roma_ts, order=roma_auto_params[1:3], seasonal=roma_auto_params[4:6], xreg=roma_covid)
# Copenhagen
sarimax_copenhagen <- Arima(copenhagen_ts, order=copenhagen_auto_params[1:3], seasonal=copenhagen_auto_params[4:6], xreg=copenhagen_covid)
# Oslo
sarimax_oslo <- Arima(oslo_ts, order=oslo_auto_params[1:3], seasonal=oslo_auto_params[4:6], xreg=oslo_covid)Prediction
Here we predict for each city the number of passengers for the next 300 months (straight to 2045). To do so we make a forecast with our SARIMAX model for each country and we compare it with our forecast with the SARIMA model.
Saving our hyperparameters
Here we use a method to save our hyperparameters in a file so that we can use them later without running again our big loops. This helps us to get a more efficient code and to save time.
# Here we save all our parameters in a database file
save(paris_ts19_aic_params, paris_ts19_bic_params, paris_auto_params,
madrid_ts19_aic_params, madrid_ts19_bic_params, madrid_auto_params,
roma_ts19_aic_params, roma_ts19_bic_params, roma_auto_params,
copenhagen_ts19_aic_params, copenhagen_ts19_bic_params, copenhagen_auto_params,
oslo_ts19_aic_params, oslo_ts19_bic_params, oslo_auto_params,
file = "hyperparameters.RData")