This vignette aims to illustrate how the ADLP
package
can be used to objectively combine multiple stochastic loss reserving
models such that the strengths offered by different models can be
utilised effectively.
To showcase the ADLP
package, we will import a test
claims dataset from SynthETIC
.
Note that some data manipulation is needed to transform the imported
object into a compatible format for ADLP
, however any
data.frame
with column 1 titled “origin” for accident
periods and column 2 titled “dev” for development periods will work. All
other columns can be constructed as required for model inputs.
library(ADLP)
if (!require(SynthETIC)) install.packages("SynthETIC")
#> Loading required package: SynthETIC
if (!require(tidyverse)) install.packages("tidyverse")
#> Loading required package: tidyverse
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
#> ✔ ggplot2 3.4.3 ✔ purrr 0.3.4
#> ✔ tibble 3.2.1 ✔ dplyr 1.1.2
#> ✔ tidyr 1.2.0 ✔ stringr 1.4.0
#> ✔ readr 2.1.2 ✔ forcats 0.5.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
library(tidyverse)
# Import dummy claims dataset
claims_dataset <- claims_dataset <- ADLP::test_claims_dataset
head(claims_dataset)
As the ADLP weightings are determined through testing on a validation set, we need to split the claims dataset into a training, validation and testing set.
# Included train/validation splitting function
train_val <- ADLP::train_val_split_method1(
df = claims_dataset,
tri.size = 40,
val_ratio = 0.3,
test = TRUE
)
train_data <- train_val$train
valid_data <- train_val$valid
insample_data <- rbind(train_data, valid_data)
test_data <- train_val$test
print("Number of observations in each row")
#> [1] "Number of observations in each row"
print(nrow(train_data))
#> [1] 575
print(nrow(valid_data))
#> [1] 245
print(nrow(test_data))
#> [1] 780
print("Approximation for val_ratio")
#> [1] "Approximation for val_ratio"
print(nrow(valid_data)/(nrow(insample_data)))
#> [1] 0.2987805
This example will construct an ADLP with three components. However, any number of base models will also function well.
While we use glm
style models in this example, the only
requirement for the base models is support for the S3 method ‘formula’,
and being able to be fitted on by the datasets of similar structure to
claims_dataset
. To demonstrate the veModels from the gamlss
package was used while proofing the ADLP
concept.
base_model1 <- glm(formula = claims~factor(dev),
family=gaussian(link = "identity"), data=train_data)
tau <- 1e-8
base_model2 <- glm(formula = (claims+tau)~calendar,
family=Gamma(link="inverse"), data=train_data)
base_model3 <- glm(formula = claims~factor(origin),
family=gaussian(link = "identity"), data=train_data)
base_model1_full <- update(base_model1, data = insample_data)
base_model2_full <- update(base_model2, data = insample_data)
base_model3_full <- update(base_model3, data = insample_data)
To also support desired outputs including densities and predictions,
the base components will also require calc_dens
,
calc_mu
, calc_cdf
and sim_fun
methods to be defined for each base component. See
?adlp_component
for more information.
################################################################################
# Normal distribution densities and functions
dens_normal <- function(y, model, newdata){
pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
mu <- pred_model$fit
sigma <- pred_model$residual.scale
return(dnorm(x=y, mean=mu, sd=sigma))
}
cdf_normal<-function(y, model, newdata){
pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
mu <- pred_model$fit
sigma <- pred_model$residual.scale
return(pnorm(q=y, mean=mu, sd=sigma))
}
mu_normal<-function(model, newdata){
mu <- predict(model, newdata=newdata, type="response")
mu <- pmax(mu, 0)
return(mu)
}
sim_normal<-function(model, newdata){
pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
mu <- pred_model$fit
sigma <- pred_model$residual.scale
sim <- rnorm(length(mu), mean=mu, sd=sigma)
sim <- pmax(sim, 0)
return(sim)
}
################################################################################
# Gamma distribution densities and functions
dens_gamma <- function(y, model, newdata){
pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
mu <- pred_model$fit
sigma <- pred_model$residual.scale
return(dgamma(x=y, shape = 1/sigma, scale=(mu*sigma)^2))
}
cdf_gamma <- function(y, model, newdata){
pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
mu <- pred_model$fit
sigma <- pred_model$residual.scale
return(pgamma(q=y, shape = 1/sigma, scale=(mu*sigma)^2))
}
mu_gamma <- function(model, newdata, tau){
mu <- predict(model, newdata=newdata, type="response")
mu <- pmax(mu - tau, 0)
return(mu)
}
sim_gamma <- function(model, newdata, tau){
pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
mu <- pred_model$fit
sigma <- pred_model$residual.scale
sim <- rgamma(length(mu), shape = 1/sigma, scale=(mu*sigma)^2)
sim <- pmax(sim - tau, 0)
return(sim)
}
The different user defined base models are then structured as a
adlp_component
object to be used in model fitting for
adlp
.
# Constructing ADLP component classes
base_component1 = adlp_component(
model_train = base_model1,
model_full = base_model1_full,
calc_dens = dens_normal,
calc_cdf = cdf_normal,
calc_mu = mu_normal,
sim_fun = sim_normal
)
base_component2 = adlp_component(
model_train = base_model2,
model_full = base_model2_full,
calc_dens = dens_gamma,
calc_cdf = cdf_gamma,
calc_mu = mu_gamma,
sim_fun = sim_gamma,
tau = tau
)
base_component3 = adlp_component(
model_train = base_model3,
model_full = base_model3_full,
calc_dens = dens_normal,
calc_cdf = cdf_normal,
calc_mu = mu_normal,
sim_fun = sim_normal
)
components <- adlp_components(
base1 = base_component1,
base2 = base_component2,
base3 = base_component3
)
adlp
objects are fitted using the validation data to
determine the best weighting to choose for each partition.
Note that different partitions can be used. fit1
uses no
partition, meaning that the weights are determined via standard linear
pooling. fit2
is an example of creating 3 partitions in the
validation data, with the accident periods being chosen to optimise the
desired weighting of each partition.
# Fitting ADLP class
fit1 <- adlp(
components_lst = components,
newdata = valid_data,
partition_func = ADLP::adlp_partition_none
)
fit2 <- adlp(
components_lst = components,
newdata = valid_data,
partition_func = ADLP::adlp_partition_ap,
tri.size = 40,
size = 3,
weights = c(3, 1, 2)
)
fit1
#> [1] "ADLP Components:"
#> [1] "base1" "base2" "base3"
#> [1] "ADLP weights:"
#> [[1]]
#> base1 base2 base3
#> 0.7853159 0.0000000 0.2146841
fit2
#> [1] "ADLP Components:"
#> [1] "base1" "base2" "base3"
#> [1] "ADLP weights:"
#> [[1]]
#> base1 base2 base3
#> 0.7838638 0.0000000 0.2161362
#>
#> [[2]]
#> base1 base2 base3
#> 0.5237283 0.0000000 0.4762717
#>
#> [[3]]
#> base1 base2 base3
#> 0.7853159 0.0000000 0.2146841
The ADLP
package supports calculation of log score,
CRPS, prediction and simulation from each of the ensemble models. Note
that the user-defined functions for each of the component models is
necessary to perform this calculation.
# Log Score
ensemble_logS <- adlp_logS(fit1, test_data, model = "full")
# Cumulative Ranked Probability Score
ensemble_CRPS <- adlp_CRPS(fit1, test_data, response_name = "claims", model = "full", sample_n = 1000)
# Predictions
ensemble_means <- predict(fit1, test_data)
# Simulations
ensemble_simulate <- adlp_simulate(100, fit1, test_data)
boxplot(ensemble_logS$dens_val, main = "Log Score")
boxplot(ensemble_CRPS$ensemble_crps, main = "Cumulative Ranked Probability Score")
boxplot(ensemble_means$ensemble_mu, main = "Predictions")
boxplot(ensemble_simulate$simulation, main = "Simulations")
The sample below shows how a user can define their own partition function to be used within model fitting.
# Defining own partition function
user_defined_partition <- function(df) {
return (list(
subset1 = df[(as.numeric(as.character(df$origin)) >= 1) & (as.numeric(as.character(df$origin)) <= 15), ],
subset2 = df[(as.numeric(as.character(df$origin)) >= 16) & (as.numeric(as.character(df$origin)) <= 40), ]
))
}
# Fitting ADLP class
fit3 <- adlp(
components_lst = components,
newdata = valid_data,
partition_func = user_defined_partition
)
fit3
#> [1] "ADLP Components:"
#> [1] "base1" "base2" "base3"
#> [1] "ADLP weights:"
#> [[1]]
#> base1 base2 base3
#> 0.6472695 0.0000000 0.3527305
#>
#> [[2]]
#> base1 base2 base3
#> 0.7853159 0.0000000 0.2146841
We can see that the ADLP ensembles perform better than all components on the in-sample and only slightly loses out to base model 1 on the out-of-sample data.
Note that this may also be due to model variance and we have not chosen to optimise the base models.
show_mse <- function(data) {
print("----------------------------")
print("Base models 1, 2, 3")
print(sum((predict(base_model1_full, data, type = "response") - data$claims)^2))
print(sum((predict(base_model2_full, data, type = "response") - data$claims)^2))
print(sum((predict(base_model3_full, data, type = "response") - data$claims)^2))
print("ADLP ensembles 1, 2, 3")
print(sum((predict(fit1, data)$ensemble_mu - data$claims)^2))
print(sum((predict(fit2, data)$ensemble_mu - data$claims)^2))
print(sum((predict(fit3, data)$ensemble_mu - data$claims)^2))
print("----------------------------")
}
show_mse(train_data)
#> [1] "----------------------------"
#> [1] "Base models 1, 2, 3"
#> [1] 9.148445e+13
#> [1] 1.212692e+14
#> [1] 1.156972e+14
#> [1] "ADLP ensembles 1, 2, 3"
#> [1] 9.043105e+13
#> [1] 9.043557e+13
#> [1] 9.056253e+13
#> [1] "----------------------------"
show_mse(valid_data)
#> [1] "----------------------------"
#> [1] "Base models 1, 2, 3"
#> [1] 4.634486e+13
#> [1] 5.964421e+13
#> [1] 5.31757e+13
#> [1] "ADLP ensembles 1, 2, 3"
#> [1] 4.610282e+13
#> [1] 4.610436e+13
#> [1] 4.640151e+13
#> [1] "----------------------------"
show_mse(test_data)
#> [1] "----------------------------"
#> [1] "Base models 1, 2, 3"
#> [1] 1.00074e+14
#> [1] 1.359745e+14
#> [1] 1.804106e+14
#> [1] "ADLP ensembles 1, 2, 3"
#> [1] 1.028518e+14
#> [1] 1.028976e+14
#> [1] 1.030806e+14
#> [1] "----------------------------"