library(forecast)
library(readxl)
library(tseries)
library(ggplot2)



All_Island_monthly_data <- read_excel("All Island monthly data.xlsx")
# View(All_Island_monthly_data)


All_Island_Data <- All_Island_monthly_data


#Translate data into time series
AllTS<-ts(All_Island_Data$Count,start = c(min(All_Island_Data$Year),1),
          end = c(max(All_Island_Data$Year),12), frequency = 12)
AllTS

class(AllTS)

plot(AllTS, 
     ylab = "Number of All Island Dengue Cases", 
     main = "AllTS - All Island Dengue Time Series", 
     xlab = "Time by Year/For Monthly Data",
    xaxt = "n")
# Custom x-tick values
axis(1, at = 2015:2024, labels = 2015:2024)

boxplot(AllTS, 
        main = "Boxplot of All Island Dengue Time Series", 
        ylab = "Case Count", 
        xlab = "Group-All Island Dengue Cases", 
        col = "lightgreen",
        outbg= "red", outpch = 21) # Changing outliers color to "red" color

adf.test(AllTS)

#log transform
AllTS_log<-log(AllTS)
AllTS_log

plot(AllTS_log,
     ylab = "Natural Log of Case Count", 
     main = "All Island Dengue Time Series \n After Log Transformation", 
     xlab = "Time by Year/For Monthly Data",
     xaxt = "n")
# Custom x-tick values
axis(1, at = 2015:2024, labels = 2015:2024)

boxplot(AllTS_log, 
        ylab = "Natural Log of Case Count", 
        main = "Boxplot of All Island Dengue Time Series \n After Log Transformation",
        xlab = "Group-All Island Dengue Cases (log)",
        col = "lightgreen") # Outliers suppressed in log transformed data.

###############################################################################################
###################################################################################

########################################################################################
########################### Divide Data Set #####################################
traning_end_year<-max(All_Island_Data$Year)-2 #80% of data (i.e. 7 years)
test_start_year<-max(All_Island_Data$Year)-1 

All_Train<-window(AllTS_log,end=c(traning_end_year,12))
All_Test<-window(AllTS_log, start=c(test_start_year,1))

plot(All_Train, 
     main = "All Island Dengue Time Series - Training Component", 
     ylab = "Natural Log of Case Count", 
     xlab = "Time by Year/For Monthly Data",
     xaxt = "n")
# Custom x-tick values
axis(1, at = 2015:2022, labels = 2015:2022)

Acf(All_Train, 
    main = "Autocorrelation Function Plot (ACF) \n Training Data Component", 
    ylab = "ACF - Correlation Values",
    xlab = "Lag Points")

Pacf(All_Train,
     main = "Partial Autocorrelation Function Plot (PACF) \n Training Data Component", 
     ylab = "PACF - Correlation Values",
     xlab = "Lag Points")

adf.test(All_Train)

diff(All_Train) %>% adf.test() # We should apply differencing to make data stationary

plot(diff(All_Train),
     xlab = "Time by Year/For Monthly Data",
     main = "First Order Differencing - Training Data Component",
     ylab = "Differenced Natural Log Values")

Acf(diff(All_Train),
    main = "Autocorrelation Function (ACF) \n First Order Differencing - Training Data Component",
    ylab = "ACF - Correlation Values",
    xlab = "Lag Points")      ############ Added

Pacf(diff(All_Train),
     main = "Partial Autocorrelation Function \n First Order Differencing - Training Data Component",
     ylab = "PACF - Correlation Values",
     xlab = "Lag Points")     ############ Added

#Automated model without differencing, allowed a contant term
auto.arima(All_Train,trace = TRUE, allowmean = TRUE, ic = "aicc")

#Automated model with manual 1st differencing
All_Auto<-auto.arima(All_Train, d = 1, trace = TRUE, allowmean = TRUE, ic = "aicc") 
# Constant term is allowed to be added to the arima model.

All_Auto # We get a SARIMA Model because there is seasonality in the original data

plot(All_Auto$residuals,
     main = "All_Auto Residual Plot - ARIMA Model(0,1,1)(0,0,2)[12]",
     xlab = "Time by Year/For Monthly Data",
     ylab = "Residual Value")       ############## Doesn't seem to have any pattern..

adf.test(All_Auto$residuals)   ############## To see if the residuals are stationary.. 

Acf(All_Auto$residuals, 
    main = "Autocorrelation Function Plot (ACF) of All_Auto \n ARIMA Model(0,1,1)(0,0,2)[12]",
    ylab = " ACF - Correlation Values",
    xlab = "Lag Points")

Pacf(All_Auto$residuals,
     main = "Partial Autocorrelation Function Plot (PACF) of All_Auto \n ARIMA Model(0,1,1)(0,0,2)[12]",
     ylab = "PACF - Correlation Values",
     xlab = "Lag Points")


#Acf and PA=acf cuts the blue line at lag 15, thus, we check p values for 20 lags
max_lag <- 20
lags <- 1:max_lag

# Perform Ljung-Box Q test for each lag value up to 20 lags of the All Auto Model
p_values <- sapply(lags, function(lag) {
  Box.test(All_Auto$residuals, lag = lag, type = "Ljung-Box")$p.value
})
p_values

plot(lags, p_values, type = "b", 
     xlab = "Lag Points", 
     ylab = "p-Value of Residuals", 
     main = "p-Values of Ljung-Box Test - ACF of All_Auto \n ARIMA Model(0,1,1)(0,0,2)[12]", 
     ylim = c(0,1))
abline(a=0.05, b= 0, col = "red", h= 0.05)
axis(2, at = 0.05, 0.05) # 0.05 on the y axis
text(x = 5, y = 0.08, 'p = 0.05', col = 'blue', cex = 1)
legend(x = "topright",          # Position
       legend = c("p-Value Threshold"),  # Legend texts
       lty = c(1),           # Line types
       col = c("red"),           # Line colors
       lwd = 2) 



#accuracy testing##
All_Auto_Forecast<-forecast(All_Auto, level = c(80,95), h=24)
All_Auto_Forecast




plot(All_Auto_Forecast, 
     ylab = "Natural Log of Case Count",
     xlab = "Time by Year/For Monthly Data",
     xaxt = "n"
)
axis(1, at = 2015:2024, labels = 2015:2024)
lines(All_Test,col="red")
legend("topleft", 
       legend = c("Forecast for Test Data", "Original Log Transformed Test Data"), 
       col = c("blue", "red"), 
       lty = 1, 
       cex = 0.7, 
       bty = "1", 
       inset = 0.02)

#Accuracy using MAPE
accuracy(All_Auto_Forecast,All_Test)



#Forecast for 2024-2025
Best_Model<-arima(AllTS_log, order = c(0, 1, 1), 
                  seasonal = list(order = c(0, 0, 2), period = 12), 
                  include.mean = TRUE)

Best_Model

All_Forecast<-forecast(Best_Model, level = c(80,95), h=12*2)
All_Forecast

#exponentiate the log values to original values
All_Forecast$x<-exp(All_Forecast$x)
All_Forecast$mean<-exp(All_Forecast$mean)
All_Forecast$upper<-exp(All_Forecast$upper)
All_Forecast$lower<-exp(All_Forecast$lower)

All_Forecast

autoplot(All_Forecast, 
         ylab = "Total Dengue Cases (Without Scaling)", 
         xlab = "Time by Year/For Monthly Data")+
  scale_x_continuous(breaks = seq(2015, 2026, by = 1))# without scaling

autoplot(All_Forecast, 
         ylab = "Total Dengue Cases (Scaled)", 
         xlab = "Time by Year/For Monthly Data")+
  scale_x_continuous(breaks = seq(2015, 2026, by = 1))+ # with scaling
  coord_cartesian(ylim = c(0, 50000))

autoplot(All_Forecast, 
         ylab = "Total Dengue Cases (Without Prediction Intervals)", 
         PI = FALSE, 
         xlab = "Time by Year/For Monthly Data")+
  scale_x_continuous(breaks = seq(2015, 2026, by = 1))




# We check residuals for Normal Distribution and outliers

adf.test(Best_Model$residuals)
plot(Best_Model$residuals,
     main = "Best_Model ADF Residual Plot - ARIMA Model(0,1,1)(0,0,2)[12]",
     xlab = "Time by Year/For Monthly Data",
     ylab = "Residual Value")


hist(Best_Model$residuals,
     col = "orange",
     xlab= "Residuals - Best_Model",
     main = "Best_Model ARIMA(0,1,1)(0,0,2)[12] Histogram of  Residuals",
     freq = FALSE)
lines(density(Best_Model$residuals))


################################################

shapiro.test(Best_Model$residuals) # Formally checking for Normal Distribution

boxplot(Best_Model$residuals, 
        ylab = "Residual Value", 
        main = "Boxplot of Residuals from the Best_Model ARIMA(0,1,1)(0,0,2)[12]", 
        xlab = "Group-Residuals of ARIMA Model(0,1,1)(0,0,2)[12]", 
        col = "orange",
        outbg= "red", outpch = 21)
