使用R实现贝叶斯结构模型,拟合和预测疾病发生率数据
贝叶斯结构时间序列(Bayesian Structural Time Series,BSTS)模型是一种将状态空间模型与贝叶斯推断相结合的时间序列分析方法。BSTS模型通过将时间序列分解为多个有实际意义的组件:趋势(Trend)、季节性(Seasonal)和回归(Regression)等,并使用贝叶斯方法来处理不确定性。该模型不要求数据满足平稳性假设,通过先验分布和数据更新来获得完整的后验分布,能够提供更丰富的预测信息。BSTS模型广泛应用于流行病学、经济预测、市场分析等领域,特别适合需要理解数据结构组成并进行中长期预测的场景。
Bootstrap是一种通过重复抽样来评估统计量的方法。其基本思想是,将原始样本看作总体,通过反复从中抽取样本来模拟从总体中抽样。通过观察这些模拟样本的统计量的变化,来推断总体统计量的抽样分布。
对于时间序列数据,我们往往需要量化模型预测的不确定性,如计算预测区间。传统的BSTS模型可以从后验分布中采样得到预测分布,但这依赖于MCMC结果的收敛性和代表性。为了更稳健地计算预测区间,可以考虑引入Bootstrap方法。
library(bsts)
library(tidyverse)
library(car)
library(forecast)
library(corrplot)
library(nortest)
library(boot)
library(ggplot2)
# 随机生成数据,实际使用中data更换成实际发病率数据
set.seed(123)
n <- 120 # 10年的月度数据
time <- 1:n
season <- sin(2 * pi * time/12) # 季节性组件
trend <- 0.1 * time # 线性趋势
mu <- 10 + trend + 2 * season # 真实均值
sigma <- 1.5 # 标准差
y <- rnorm(n, mu, sigma)
population <- rep(1000, n) # 假设人群基数
cases <- rpois(n, y * population/100) # 实际病例数
incidence <- cases/population * 100 # 发病率(%)
# 创建数据框
data <- data.frame(
time = seq(as.Date("2015-01-01"), as.Date("2024-12-01"), by = "month"),
month =as.numeric( rep(1:12, 10)),
year = as.numeric(rep(1:10, each=12)),
cases = as.numeric(cases),
population =as.numeric( population),
incidence = as.numeric(scale(incidence)),
season = as.numeric(season)
)
# 3.1 数据缺失值检查
print(paste("缺失值数量:", sum(is.na(data))))
[1] "缺失值数量: 0"
# 3.2 正态性检验
shapiro_test <- shapiro.test(data$incidence)
print(shapiro_test)
Shapiro-Wilk normality test
data: data$incidence
W = 0.98712, p-value = 0.3153
# 3.3 时间序列分解
ts_data <- ts(data$incidence, frequency = 12)
decomp <- decompose(ts_data)
plot(decomp)
# 3.4 自相关性检验
acf(data$incidence, main="自相关函数图")
pacf(data$incidence, main="偏自相关函数图")
# 3.5 季节性分析
seasonal_plot <- ggplot(data, aes(x = factor(month), y = incidence, group = year)) +
geom_line(aes(color = factor(year))) +
labs(title = "季节性图", x = "月份", y = "发病率") +
theme_minimal()
print(seasonal_plot)
# 3.6 VIF检验
# 创建线性模型用于VIF检验
lm_model <- lm(incidence ~ time + season, data = data)
vif_result <- vif(lm_model)
print(vif_result)
time season
1.005847 1.005847
# 3.7 Durbin-Watson检验(独立性假设)
dw_test <- durbinWatsonTest(lm_model)
print(dw_test)
lag Autocorrelation D-W Statistic p-value
1 0.01051613 1.945138 0.692
Alternative hypothesis: rho != 0
# 3.8. 残差独立性检验
residuals <- residuals(lm_model)
plot(residuals, type = "l", main = "残差时间序列图")
abline(h = 0, col = "red", lty = 2)
acf(residuals, main = "残差自相关函数图")
正态性检验(3.2)结果显示p>0.05,符合正态分布; 时间序列分解(3.3)结果显示,该数据具有明显的上升趋势;存在稳定的季节性模式;随机波动相对较小且均匀;所以模型选择应该同时考虑趋势和季节性组件; 自相关检验(3.4)结果显示:数据具有较强的持续性可能存在趋势成分,可能存在季节性模式; 季节性分析(3.5)显示:春季(3-4月)通常有较高的发病率,夏末秋初(8-9月)发病率相对较低,年底到年初(11-1月)发病率呈现波动; VIF检验结果(3.6):VIF值都远小于5(临界值)说明time和season变量之间几乎没有共线性; 独立性假设检验(3.7):p值 >0.05,不能拒绝原假设,说明残差可以认为是独立的; 残差图(3.8)显示残差值基本在-1到1之间波动,围绕0线(红色虚线)随机分布,没有明显的系统性模式或趋势,并且残差的独立性表现良好。
# 拟合模型,包含趋势、季节性和自回归组件
ss <- list()
ss <- AddLocalLinearTrend(ss, data$incidence)
ss <- AddSeasonal(ss, data$incidence, nseasons = 12)
ss <- AddAutoAr(ss, data$incidence, lags = 12)
# 设置先验分布并拟合模型, niter 表示迭代次数
model <- bsts(incidence ~ season,
state.specification = ss,
niter = 5000,
ping = 0,
data = data)
# 创建未来12个月的预测数据
last_date <- max(data$time)
future_dates <- seq(from = last_date, by = "month", length.out = 13)[-1]
# 创建新的预测数据框
newdata <- data.frame(
time = future_dates,
season = sin(2 * pi * (1:12)/12) )
# 进行预测
predictions <- predict(model, newdata = newdata, horizon = 12, burn = 384)
# 4.1 计算残差
fitted_values <- colMeans(model$state.contributions)
residuals <- data$incidence - rowSums(fitted_values)
# 调整绘图参数
par(mfrow=c(2,2), mar=c(4,4,2,1))
# 残差时间序列图
plot(residuals, type='l', main="Residual Time Series",
xlab="Time", ylab="Residuals")
abline(h=0, col="red", lty=2)
# 残差自相关图
acf(residuals, main="ACF of Residuals")
# 残差直方图
hist(residuals, main="Histogram of Residuals",
xlab="Residuals", probability=TRUE)
curve(dnorm(x, mean=mean(residuals), sd=sd(residuals)),
add=TRUE, col="red", lwd=2)
# Q-Q图
qqnorm(residuals)
qqline(residuals, col="red")
# 重置绘图参数
par(mfrow=c(1,1), mar=c(5,4,4,2)+0.1)
残差时间序列图表示残差在0线(红色虚线)附近波动,但存在一定的趋势性,表明模型可能还存在一些未被完全捕捉到的时间依赖性;ACF图说明残差中可能还存在一些未被模型充分解释的系统性模式;残差直方图显示整体形状接近正态分布,基本符合正态性假设;Q-Q图整体来看,与正态分布的偏离不是很严重。总体来说,模型表现尚可,但仍有改进空间。
# 获取原始数据的均值和标准差
original_mean <- mean(incidence)
original_sd <- sd(incidence)
# 将预测值转换回原始尺度
predictions_original <- predictions$distribution * original_sd + original_mean
# 计算评估指标
rmse <- sqrt(mean((incidence - colMeans(predictions_original))^2))
mae <- mean(abs(incidence - colMeans(predictions_original)))
# 计算拟合值和预测区间
fitted_values <- colMeans(model$state.contributions)
total_fit <- rowSums(fitted_values)
# 计算预测误差的标准差
sigma <- sd(residuals)
# 计算95%预测区间
lower_bound <- total_fit - 1.96 * sigma
upper_bound <- total_fit + 1.96 * sigma
# 转换回原始尺度
fitted_original <- total_fit * original_sd + original_mean
lower_bound_original <- lower_bound * original_sd + original_mean
upper_bound_original <- upper_bound * original_sd + original_mean
# 计算覆盖率
coverage <- mean(incidence >= lower_bound_original & incidence <= upper_bound_original, na.rm = TRUE)
print(paste("RMSE:", round(rmse, 4)))
[1] "RMSE: 7.4441"
print(paste("MAE:", round(mae, 4)))
[1] "MAE: 6.4178"
print(paste("覆盖率:", round(coverage, 4)))
[1] "覆盖率: 0.95"
MAE比RMSE小,说明预测误差比较稳定,没有特别极端的预测错误;覆盖率 = 0.95(95%)表明模型的预测区间设置得很合理,既不会过于保守也不会过于乐观。模型的预测性能不错,尤其是预测区间的准确性很高。虽然还存在一定的预测误差,但考虑到时间序列预测的复杂性,这个水平可以接受。 这个模型适合用于实际预测,但在使用预测结果时最好参考预测区间,而不是仅仅看点预测值。
pred_df <- data.frame(
date = future_dates,
mean = colMeans(predictions$distribution) * original_sd + original_mean,
lower = apply(predictions$distribution, 2, quantile, 0.025) * original_sd + original_mean,
upper = apply(predictions$distribution, 2, quantile, 0.975) * original_sd + original_mean
)
# 准备历史数据
hist_df <- data.frame(
date = data$time,
value = incidence
)
# 创建预测图
ggplot() +
geom_line(data = hist_df, aes(x = date, y = value),
color = "steelblue", size = 0.8) +
geom_point(data = hist_df, aes(x = date, y = value),
color = "steelblue", size = 2, alpha = 0.6) +
geom_ribbon(data = pred_df,
aes(x = date, ymin = lower, ymax = upper),
fill = "lightpink", alpha = 0.3) +
geom_line(data = pred_df,
aes(x = date, y = mean),
color = "red", size = 1) +
labs(title = "发病率预测结果",
subtitle = "包含95%预测区间",
x = "时间",
y = "发病率 (%)",
caption = "蓝色线:历史数据\n红色线:预测值\n粉色区域:95%预测区间") +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
axis.title = element_text(size = 10),
axis.text = element_text(size = 9),
legend.position = "none",
plot.caption = element_text(size = 8, hjust = 0)
)
# 获取模型残差
fitted_values <- colMeans(model$state.contributions)
total_fit <- rowSums(fitted_values)
residuals <- data$incidence - total_fit
# Shapiro-Wilk正态性检验
sw_test <- shapiro.test(residuals)
print(sw_test)
Shapiro-Wilk normality test
data: residuals
W = 0.98858, p-value = 0.4167
# 残差的正态性很好,残差确实适合用于Bootstrap方法
## Bootstrap预测区间 --------------------
set.seed(123)
n_boot <- 1000
# 创建存储矩阵
boot_predictions <- matrix(NA, nrow = n_boot, ncol = 12)
# 对每个预测时点进行bootstrap
for(i in 1:12) {
for(j in 1:n_boot) {
# 重抽样残差
boot_residuals <- sample(residuals, length(residuals), replace = TRUE)
# 计算这次重抽样的标准差
resid_sd <- sd(boot_residuals)
# 生成新的预测值
boot_predictions[j,i] <- rnorm(1, mean = pred_df$mean[i], sd = resid_sd)
}
}
# 计算bootstrap预测区间
boot_lower <- apply(boot_predictions, 2, quantile, probs = 0.025)
boot_upper <- apply(boot_predictions, 2, quantile, probs = 0.975)
# 创建包含bootstrap区间的数据框
boot_pred_df <- data.frame(
date = future_dates,
mean = pred_df$mean,
boot_lower = boot_lower,
boot_upper = boot_upper,
model_lower = pred_df$lower,
model_upper = pred_df$upper
)
## 可视化比较 --------------------
# 创建对比图
ggplot() +
# 历史数据
geom_line(data = hist_df, aes(x = date, y = value),
color = "steelblue", size = 0.8) +
geom_point(data = hist_df, aes(x = date, y = value),
color = "steelblue", size = 2, alpha = 0.6) +
# 模型预测区间
geom_ribbon(data = boot_pred_df,
aes(x = date, ymin = model_lower, ymax = model_upper),
fill = "lightpink", alpha = 0.3) +
# Bootstrap预测区间
geom_ribbon(data = boot_pred_df,
aes(x = date, ymin = boot_lower, ymax = boot_upper),
fill = "lightgreen", alpha = 0.3) +
# 预测线
geom_line(data = boot_pred_df,
aes(x = date, y = mean),
color = "red", size = 1) +
labs(title = "发病率预测结果对比",
subtitle = "包含模型预测区间和Bootstrap预测区间",
x = "时间",
y = "发病率 (%)",
caption = "蓝色线:历史数据\n红色线:预测值\n粉色区域:模型预测区间\n绿色区域:Bootstrap预测区间") +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
axis.title = element_text(size = 10),
axis.text = element_text(size = 9),
legend.position = "none",
plot.caption = element_text(size = 8, hjust = 0)
)
## 计算Bootstrap覆盖率 --------------------
# 使用最后12个月的实际数据来计算Bootstrap预测区间的覆盖率
actual_last_12 <- tail(incidence, 12)
boot_coverage <- mean(actual_last_12 >= boot_lower & actual_last_12 <= boot_upper)
print(paste("Bootstrap预测区间覆盖率:", round(boot_coverage, 4)))
[1] "Bootstrap预测区间覆盖率: 0.8333"
两种区间都基本捕捉到了趋势的走向。Bootstrap方法给出了更窄但风险更高的预测区间,原始模型更保守,但可靠性更高。 如果对预测准确性要求高,更安全的做法是使用原始模型的预测区间(粉色); 如果能够接受一定风险,且需要更精确的预测范围,可以考虑使用Bootstrap区间(绿色)。
# 定义交叉验证参数 ------------------
initial_window <- 36 # 初始训练窗口为3年
horizon <- 12 # 预测步长为1年
step_size <- 12 # 每次滑动1年
# 准备交叉验证 ------------------
# 计算可能的折数
n_folds <- floor((length(data$incidence) - initial_window - horizon) / step_size) + 1
# 存储每个折的预测结果
cv_results <- list()
cv_metrics <- data.frame(
fold = integer(),
rmse = numeric(),
mae = numeric(),
coverage = numeric()
)
# 执行滚动窗口交叉验证 ------------------
for(i in 1:n_folds) {
# 定义训练集和测试集的索引
train_end <- initial_window + (i-1) * step_size
test_end <- train_end + horizon
# 提取训练集和测试集
train_data <- data[1:train_end, ]
test_data <- data[(train_end+1):test_end, ]
# 构建模型
ss <- list()
ss <- AddLocalLinearTrend(ss, train_data$incidence)
ss <- AddSeasonal(ss, train_data$incidence, nseasons = 12)
ss <- AddAutoAr(ss, train_data$incidence, lags = 12)
# 拟合模型
model <- bsts(incidence ~ season,
state.specification = ss,
niter = 5000,
ping = 0,
data = train_data)
# 进行预测
preds <- predict(model, newdata = test_data, horizon = horizon)
# 存储预测结果
cv_results[[i]] <- list(
actual = test_data$incidence,
predicted = colMeans(preds$distribution),
lower = apply(preds$distribution, 2, quantile, 0.025),
upper = apply(preds$distribution, 2, quantile, 0.975)
)
# 计算评估指标
rmse <- sqrt(mean((test_data$incidence - cv_results[[i]]$predicted)^2))
mae <- mean(abs(test_data$incidence - cv_results[[i]]$predicted))
coverage <- mean(test_data$incidence >= cv_results[[i]]$lower &
test_data$incidence <= cv_results[[i]]$upper)
cv_metrics <- rbind(cv_metrics,
data.frame(fold = i, rmse = rmse, mae = mae, coverage = coverage))
}
# 可视化交叉验证结果 ------------------
# 评估指标的变化趋势
metrics_long <- cv_metrics %>%
gather(key = "metric", value = "value", -fold)
ggplot(metrics_long, aes(x = fold, y = value, color = metric)) +
geom_line() +
geom_point() +
facet_wrap(~metric, scales = "free_y", nrow = 3) +
labs(title = "交叉验证评估指标随折数的变化",
x = "折数",
y = "指标值") +
theme_minimal()
#预测结果可视化
# 创建预测结果的数据框
all_predictions <- data.frame()
for(i in 1:n_folds) {
fold_data <- data.frame(
fold = i,
time = data$time[(initial_window + (i-1) * step_size + 1):
(initial_window + (i-1) * step_size + horizon)],
actual = cv_results[[i]]$actual,
predicted = cv_results[[i]]$predicted,
lower = cv_results[[i]]$lower,
upper = cv_results[[i]]$upper
)
all_predictions <- rbind(all_predictions, fold_data)
}
# 绘制预测结果
ggplot(all_predictions, aes(x = time)) +
geom_line(aes(y = actual), color = "steelblue") +
geom_line(aes(y = predicted), color = "red") +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "pink", alpha = 0.3) +
facet_wrap(~fold, ncol = 2) +
labs(title = "各折交叉验证的预测结果",
x = "时间",
y = "发病率",
caption = "蓝线:实际值\n红线:预测值\n粉色区域:95%预测区间") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 汇总分析 ------------------
print("交叉验证评估指标汇总:")
[1] "交叉验证评估指标汇总:"
summary_metrics <- cv_metrics %>%
summarise(
avg_rmse = mean(rmse),
sd_rmse = sd(rmse),
avg_mae = mean(mae),
sd_mae = sd(mae),
avg_coverage = mean(coverage),
sd_coverage = sd(coverage)
)
print(summary_metrics)
avg_rmse sd_rmse avg_mae sd_mae avg_coverage sd_coverage
1 0.5905269 0.2464385 0.4820371 0.1951376 0.9761905 0.0406625
# 检验评估指标的稳定性
print("评估指标的变异系数(CV):")
[1] "评估指标的变异系数(CV):"
cv_stats <- data.frame(
metric = c("RMSE", "MAE", "Coverage"),
cv = c(
sd(cv_metrics$rmse) / mean(cv_metrics$rmse),
sd(cv_metrics$mae) / mean(cv_metrics$mae),
sd(cv_metrics$coverage) / mean(cv_metrics$coverage)
)
)
print(cv_stats)
metric cv
1 RMSE 0.41731972
2 MAE 0.40481866
3 Coverage 0.04165427
性能指标随折数的变化:覆盖率保持在很高水平,基本在92%-100%之间,说明模型预测区间的可靠性很稳定。MAE和RMSE:都呈现下降趋势,在第4折达到最低点,从第4折后略有回升但仍保持较低水平,说明模型预测准确性随时间推移有所提升。7个不同时间窗口的预测结果的预测区间的宽度适中。 在实际应用中,要注意定期更新模型以适应新的数据特征,并且考虑预测误差可能的波动范围。