Out-of-Sample R-squared¶
This post shows how to evaluate a predictor’s performance using out-of-sample \(R^2_{OS}\) in R. For more details, please refer to Campbell and Thompson (2008).
1 In-sample \(R^2\)¶
\[
R^2 = 1- \frac{\sum_{t=1}^{T}(r_t - \hat r_t)^2}{\sum_{t=1}^{T}(r_t - \bar r)^2}
\]
2 Out-of-sample \(R^2\)¶
The \(R^2_{OS}\) statistic is proposed by Campbell and Thompson (2008) and measures the proportional reduction in mean squared prediction error (MSPE) for the predictive regression forecast relative to the historical average benchmark:
\[
R^2_{OS} = 1- \frac{\sum_{t=1}^{T}(r_t - \hat r_t)^2}{\sum_{t=1}^{T}(r_t - \bar r_t)^2}
\]
where \(\hat r_t\) is the fitted value from a predictive regression estimated through period \(t-1\), and \(\hat r_t\) is the historical average return estimated through period \(t-1\) (i.e., \(\hat r_{t+1} =\frac{1}{t} \sum_{s=1}^{t}r_t\)). The \(R^2_{OS}\) lies in the range (\(-\infty\),1].
3 Calculate OOS R-squared using R¶
3.1 Prerequisite¶
library(data.table)
library(sandwich)
library(lmtest)
library(knitr)
library(fixest)
library(broom)
library(purrr)
library(multDM)
library(slider)
# remotes::install_github("franz-maikaefer/oosanalysis-R-library", ref = "9b4251b")
library(oosanalysis)
3.2 Data simulation¶
simulate_overlapping_data <- function(n_of_years = 100, delta_t = 1/12){
MU <- 0.08
THETA <- 0.75
SIGMA <- 0.16
n_of_periods <- n_of_years / delta_t
simulate_dt <- data.table(
t = seq(1, n_of_periods),
x_t = rnorm(n_of_periods, mean = 0, sd = 1),
r_t = NA_real_,
r_t_plus_1 = NA_real_
)
#< Set the return of the beginning period to the MU.
set(simulate_dt, i = 1L, j = "r_t", MU)
for (t in 1:(n_of_periods-1)) {
t <- as.integer(t)
set(simulate_dt, t, "r_t_plus_1", THETA * (MU - simulate_dt$r_t[t]) * delta_t + SIGMA * sqrt(delta_t) * rnorm(1))
set(simulate_dt, t + 1L, "r_t", simulate_dt$r_t_plus_1[t])
}
simulate_dt[, `:=`(
r_t_plus_3 = frollsum(r_t, n = 3, align = "left"),
r_t_plus_6 = frollsum(r_t, n = 6, align = "left"),
r_t_plus_12 = frollsum(r_t, n = 12, align = "left")
)]
return(simulate_dt[!is.na(r_t_plus_12)])
}
set.seed(42)
simulate_dt <- simulate_overlapping_data()
kable(head(simulate_dt), "pipe")
t | x_t | r_t | r_t_plus_1 | r_t_plus_3 | r_t_plus_6 | r_t_plus_12 |
---|---|---|---|---|---|---|
1 | 1.3709584 | 0.0800000 | -0.0344801 | 0.0543657 | 0.1392246 | 0.1188966 |
2 | -0.5646982 | -0.0344801 | 0.0088458 | -0.0062542 | 0.1045764 | 0.0885166 |
3 | 0.3631284 | 0.0088458 | 0.0193802 | 0.0495512 | 0.0290402 | 0.1400495 |
4 | 0.6328626 | 0.0193802 | 0.0213252 | 0.0848590 | 0.1118831 | 0.1849328 |
5 | 0.4042683 | 0.0213252 | 0.0441536 | 0.1108306 | 0.1128342 | 0.1644087 |
6 | -0.1061245 | 0.0441536 | 0.0453517 | -0.0205110 | 0.0688945 | 0.1369073 |
3.3 In-sample \(R^2\)¶
#< Fit the predictive regression
m <- feols(r_t_plus_1 ~ x_t,simulate_dt)
#< Extract in-sample R-squared from the model
r_is_1 <- r2(m)[["r2"]]
#< Compute fitted values
r_t_plus_1_hat <- predict(m)
#< Calculate in-sample R-squared manually
r_is_2 <- 1 - sum((simulate_dt$r_t_plus_1 - r_t_plus_1_hat)^2) / sum((simulate_dt$r_t_plus_1 - mean(simulate_dt$r_t_plus_1))^2)
#< Present results
kable(data.table(r_is_1 = r_is_1, r_is_2 = r_is_2))
r_is_1 | r_is_2 |
---|---|
0.000549 | 0.000549 |
3.4 Out-of-sample \(R^2\)¶
extract_r2 <- compose(\(x) x[["r2"]], r2)
predict_one_step_ahead <- function(object, x){
estimates <- coef(object)
y_hat <- x * estimates[[2]] + estimates[[1]]
return(y_hat)
}
r2_oos <- function(n.start = NULL, r.var.ahead = NULL, r.var = NULL, x.var = NULL, date.var = NULL, refit.window = c("recursive", "rolling"), data = NULL){
dt <- as.data.table(data)
dt <- dt[, .SD, .SDcols = c(date.var, r.var.ahead, r.var, x.var)]
N <- nrow(data)
fm <- as.formula(paste0(r.var.ahead, "~", x.var))
if (refit.window == "recursive"){
dt_test <- dt[,
{
slide_dt <- slide(.SD, .f = ~.x, .before = Inf, .after = -1)
results <- data.table(.SD, slide_dt)
results[n.start:.N]
}
]
}
if (refit.window == "rolling"){
dt_test <- dt[,
{
slide_dt <- slide(.SD, .f = ~.x, .before = n.start, .after = -1, .complete = T)
results <- data.table(.SD, slide_dt)
results[(n.start+1):.N]
}
]
}
#< Define functions for Clark and West (2007) test
cw.test <- function(e_null, e_alt, y_hat_null, y_hat_alt, h){
P <- length(e_null)
mspe.adj <- e_null^2 - (e_alt^2 - (y_hat_null - y_hat_alt)^2)
m <- lm(mspe.adj~1)
cw_iid <- tidy(m)$statistic
cw_nw <- coeftest(m, vcov. = NeweyWest(m, lag = h, prewhite = FALSE))[3]
data.table(cw_iid, cw_nw)
}
dt_test[
,
{
m = map(slide_dt, ~feols(fm, .x))
r_hat_ols = map2_dbl(m, get(x.var), ~predict_one_step_ahead(.x, .y))
r_hat_hist_mean = map_dbl(slide_dt, ~mean(.x[, .SD, .SDcols = r.var][[1]]))
r_true = get(r.var)
e_ols <- r_true - r_hat_ols
e_hist_mean <- r_true - r_hat_hist_mean
r2_os = 1 - sum((r_true - r_hat_ols)^2) / sum((r_true - r_hat_hist_mean)^2)
cw.stats = cw.test(e_hist_mean, e_ols, r_hat_hist_mean, r_hat_ols, 1)
dm.stats <- DM.test(r_hat_hist_mean, r_hat_ols, r_true, h = 1, c = TRUE, H1 = "less")$statistic[1]
# dm.stats2 <- DM.test(r_hat_hist_mean, r_hat_ols, r_true, h = 1, c = TRUE, H1 = "less")$statistic[1]
data.table(data = list(data.table(get(date.var), r_hat_ols, r_hat_hist_mean, r_true, e_ols, e_hist_mean)), r2_os = r2_os[1],
cw.stats, dm.stats)
}
]
}
oos_dt <- r2_oos(n.start = 501, r.var.ahead = "r_t_plus_1", r.var = "r_t", x.var = "x_t", refit.window = "recursive", data = simulate_dt)
kable(oos_dt[, .(r2_os, cw_iid, dm.stats)], "pipe")
r2_os | cw_iid | dm.stats |
---|---|---|
-0.005876 | 0.5118299 | -0.7765246 |