(Written by Juan Manuel Morales, translated to English by PBA.)
In this lecture we will see how to fit Bayesian models in JAGS, Stan, and brms.
The JAGS software runs Markov chain Monte Carlo (MCMC) simulations for Bayesian models (Plummer, 2003). JAGS stands for Just Another Gibbs Sampler and is a successor of BUGS, Bayesian inference Using Gibbs Sampling (Lunn, Jackson, Best, Thomas, & Spiegelhalter, 2013; Lunn, Thomas, Best, & Spiegelhalter, 2000). JAGS is very similar to BUGS but has a few additional functions and can be faster. In addition, JAGS runs on Windows, Mac, and Linux.
We will work with the example of Yellowstone bison, which we modeled as lm(logN ~ loglagN + ppt_Jan, data = train). To fit this model in JAGS we have to do a few more things.
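Written out, the model we are going to code (the same structure as the lm() above, but with explicit priors) is

\[
\log N_t \sim \text{Normal}(\mu_t, \sigma^2), \qquad
\mu_t = b_0 + b_{\text{lag}} \, \log N_{t-1} + b_{\text{ppt}} \, \text{ppt\_Jan}_t ,
\]

with priors on \(b_0\), \(b_{\text{lag}}\), \(b_{\text{ppt}}\) and \(\sigma\) given in the code below.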
First, we have to write the model in the BUGS language and save the resulting text file in the working directory. We can do that directly from R using the function cat:
cat(file = "bison.bug",
"
model {
  # likelihood function
  for(i in 2:n){
    logN[i] ~ dnorm(mu[i], tau)
    loglagN[i] <- logN[i-1]
    mu[i] <- b0 + b_lag * loglagN[i] + b_ppt * ppt_Jan[i]
  }
  # priors
  b0 ~ dnorm(0, 0.1)
  b_lag ~ dnorm(0, 1)
  b_ppt ~ dnorm(0, 1)
  s ~ dexp(1)
  tau <- 1/(s*s)
}
")
One detail to keep in mind is that BUGS parameterizes the normal distribution with the precision (\(1/\sigma^2\)) rather than the standard deviation. Since we are used to thinking about the standard deviation, not the precision, we define the prior on a standard deviation s and then transform it to the precision tau.
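As a quick sanity check of that conversion (a sketch, not part of the model file): a prior standard deviation of 2 corresponds to a precision of 0.25, which is what dnorm() in the BUGS/JAGS language expects as its second argument.

s <- 2          # standard deviation
tau <- 1/(s*s)  # precision = 1/sigma^2, the second argument of dnorm() in BUGS/JAGS
tau             # 0.25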
Now we will load and format the data. In addition to formatting the data as we did before for lm(), we will generate some new data.frames: bb_wide and btrain.
library(forecast)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(mvtnorm)
bison = read.csv("https://raw.githubusercontent.com/pbadler/forecasting-course-short/master/data/YNP_bison_counts.csv", stringsAsFactors = FALSE)
bison = select(bison,c(year,count.mean)) # drop non-essential columns
names(bison) = c("year","N") # rename columns
bbison = bison # to use with JAGS
tmp = bison
tmp$year = tmp$year + 1
names(tmp)[2] = "lagN"
bison = full_join(bison,tmp)
bison = filter(bison,year > min(year) & year < max(year)) # drop incomplete observations
rm(tmp)
# add log transformed variables
bison = mutate(bison, logN = log(N))
bison = mutate(bison, loglagN = log(lagN))
# Set up weather data ----------------------------------------------------
weather = read.csv("https://raw.githubusercontent.com/pbadler/forecasting-course-short/master/data/YNP_prism.csv", stringsAsFactors = FALSE)
weather = weather %>% separate(Date,c("year","month"), sep = '-')
weather$year = as.numeric(weather$year)
weather$month = as.numeric(weather$month)
weather$clim_year = ifelse(weather$month < 9, weather$year, weather$year + 1)
weather$clim_month = ifelse(weather$month < 9, weather$month + 4, weather$month - 8)
head(weather)
# To prepare for a merge with the bison data, we need to arrange months horizontally
# rather than vertically. Here is how to do it for the precip data:
precip_wide = weather %>%
  select(c(clim_year, clim_month, ppt_in)) %>% # drop all the other climate variables
  spread(clim_month, ppt_in) # 'spread' is where the magic happens
# rename months (optional, but I think it makes the data frame easier to understand)
names(precip_wide)[2:13] = paste0("ppt_",c("Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug"))
head(precip_wide)
# aggregate by season
precip_wide = mutate(precip_wide, ppt_Fall = rowSums(precip_wide[,c("ppt_Sep","ppt_Oct","ppt_Nov")]))
precip_wide = mutate(precip_wide, ppt_Win = rowSums(precip_wide[,c("ppt_Dec","ppt_Jan","ppt_Feb")]))
precip_wide = mutate(precip_wide, ppt_Spr = rowSums(precip_wide[,c("ppt_Mar","ppt_Apr","ppt_May")]))
precip_wide = mutate(precip_wide, ppt_Sum = rowSums(precip_wide[,c("ppt_Jun","ppt_Jul","ppt_Aug")]))
head(precip_wide)
# merge with bison
bison_wide = left_join(bison,precip_wide,by=c("year" = "clim_year"))
bb_wide = left_join(bbison,precip_wide,by=c("year" = "clim_year"))
train = subset(bison_wide, year < 2012)
test = subset(bison_wide, year >= 2012)
btrain = subset(bb_wide, year < 2012)
Now we make a list with the data that we want to pass to JAGS, and a vector of the names of the parameters that we want to monitor, or save:
datos <- list(n = dim(btrain)[1],
              logN = log(btrain$N),
              ppt_Jan = btrain$ppt_Jan)
params <- c("b0", "b_lag", "b_ppt", "s")
We want the Markov chains to begin at distinct values in order to confirm that they converge to the same stationary distribution. To do that, we write a function to generate initial values for the chains (keep in mind that this has nothing to do with the priors).
inits <- function() list(b0 = runif(1, 0, 1),
                         b_lag = runif(1, 0, 1),
                         b_ppt = runif(1, 0, 1),
                         s = runif(1, 0, 10))
Now we can use the function jags to fit the model. Note that in addition to installing the R package jagsUI, you will also need to install the JAGS library itself.
library(jagsUI)
bison.sim <- jags(data = datos,
                  inits = inits,
                  parameters.to.save = params,
                  model.file = "bison.bug",
                  n.chains = 3,
                  n.iter = 1000,
                  n.burnin = 500,
                  n.thin = 1)
To see the results of the analysis we can ask R to print the output from the function jags, in this case with print(bison.sim). We see the name of the model, how many chains were simulated, and other details such as the execution time. Then there is a table with the list of parameters that we monitored, plus the deviance. The table also includes the mean, standard deviation, and quantiles estimated from the Markov chains. Next, there is a column (overlap0) that tells us whether the posterior overlaps zero or not, and another (f) that tells us what fraction of the posterior takes the same sign as the mean. Finally, there are two columns with important information about convergence: Rhat indicates whether the chains converged to a stable distribution, and n.eff estimates the effective number of samples from the posterior. Before doing anything with the results from JAGS, we have to check for convergence (Rhat \(\leq 1.1\)). We should also check that the effective sample size of the posterior (n.eff) is sufficient. If we find a value of Rhat \(> 1.1\), it means that the Markov chains for that parameter (or any unobserved quantity that we want to estimate) have not yet reached a stationary distribution. Sometimes all that is necessary is to run more MCMC iterations, but other times a lack of convergence indicates that there are problems with the structure of the model, or that the combination of data and priors is not informative enough to estimate the posteriors.
print(bison.sim)
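Besides scanning the printed table, we can pull the diagnostics out of the fitted object directly. This is a small sketch that assumes the jagsUI output stores per-parameter diagnostics in the Rhat and n.eff elements (as current versions of the package do):

# convergence diagnostics straight from the jagsUI object (assumed structure)
unlist(bison.sim$Rhat)   # should all be <= 1.1
unlist(bison.sim$n.eff)  # effective sample sizes for each monitored quantity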
We can look at some graphs of the chains and posteriors:
plot(bison.sim)
If we want to make predictions, we have to account for the uncertainty in the parameters. This uncertainty is represented by the joint posterior of our model. Let’s see how to make a Monte Carlo prediction using the joint posterior:
nsim = 1000
tot_time = dim(test)[1] + 1
nClim = matrix(NA, nsim, tot_time)
init_obs = test$loglagN[test$year == 2012]
nClim[, 1] = rnorm(nsim, init_obs, 0)  # start every simulation at the observed 2012 value
for(i in 1:nsim){
  idx <- sample.int(bison.sim$mcmc.info$n.samples, 1)  # pick one draw from the joint posterior
  for(j in 2:tot_time){
    best_guess = bison.sim$sims.list$b0[idx] +
      bison.sim$sims.list$b_lag[idx] * nClim[i, j-1] +
      bison.sim$sims.list$b_ppt[idx] * test$ppt_Jan[test$year == 2010 + j]
    nClim[i, j] = rnorm(1, best_guess, bison.sim$sims.list$s[idx])
  }
}
library(coda)
CI <- HPDinterval(as.mcmc(nClim))
mp <- colMeans(nClim)
plot(c(train$year, test$year), c(train$logN, test$logN),
ylim = c(6, 10), type = "l", xlab = "year", ylab = "log N")
lines(test$year, mp[2: ncol(nClim)], col = "red", lty = "solid")
lines(test$year, CI[2: ncol(nClim), 1], col = "red", lty = "dashed")
lines(test$year, CI[2: ncol(nClim), 2], col = "red", lty = "dashed")
accuracy(mp[2: tot_time], test$logN)
A very useful feature of models written in the BUGS language is that we can use them to make predictions at the same time we fit the model. That is because when we write something like logN[i] ~ dnorm(mu[i], tau) we are saying that the values of logN are sampled from a normal distribution. When JAGS finds values in logN[i], it uses those values to simulate the Markov chains of the parameters of that normal distribution. In contrast, if instead of values it finds NA (missing values), it generates a sample from the normal distribution based on the current values of the parameters in the Markov chains. Here is how to use this feature to make predictions about the future:
bison_wNA <- bb_wide
bison_wNA$N[bb_wide$year >= 2012] <- NA
datos <- list(n = dim(bison_wNA)[1],
              logN = log(bison_wNA$N),
              ppt_Jan = bison_wNA$ppt_Jan)
params <- c("b0", "b_lag", "b_ppt", "s", "logN")
bp.sim <- jags(data = datos,
               inits = inits,
               parameters.to.save = params,
               model.file = "bison.bug",
               n.chains = 3,
               n.iter = 1000,
               n.burnin = 500,
               n.thin = 1)
print(bp.sim)
plot(bb_wide$year, log(bb_wide$N),
ylim=c(6,10), type ="l",
xlab= "year", ylab = "log N")
lines(bison_wNA$year[43:48], bp.sim$mean$logN[43:48],
col = "red")
lines(bison_wNA$year[43:48], bp.sim$q2.5$logN[43:48],
col = "red", lty = "dashed")
lines(bison_wNA$year[43:48], bp.sim$q97.5$logN[43:48],
col = "red", lty = "dashed")
accuracy( as.numeric( bp.sim$mean$logN[43:48] ), test$logN)
Let’s fit the same model using Stan. The main difference from JAGS (among other things) is that Stan uses Hamiltonian Monte Carlo. It is a high-performance platform for statistical modeling.
First, we have to define the model in the Stan language:
cat(file = "bison.stan",
"
data {
  int<lower=1> n;
  vector[n] logN;
  vector[n] ppt_Jan;
}
parameters {
  real b0;
  real b_lag;
  real b_ppt;
  real<lower=0> sigma;
}
model {
  vector[n] mu;
  vector[n] loglagN;
  b0 ~ normal(0, 10);
  b_lag ~ normal(0, 1);
  b_ppt ~ normal(0, 1);
  sigma ~ exponential(1);
  for(i in 2:n){
    loglagN[i] = logN[i-1];
    mu[i] = b0 + b_lag * loglagN[i] + b_ppt * ppt_Jan[i];
  }
  logN[2:n] ~ normal(mu[2:n], sigma);
}
"
)
Now we prepare the data and call stan:
library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
bison_dat <- list(n = dim(btrain)[1],
                  logN = log(btrain$N),
                  ppt_Jan = btrain$ppt_Jan)
fit <- stan(file = 'bison.stan', data = bison_dat,
iter = 1000, thin = 1, chains = 3)
print(fit)
To make predictions using the joint posterior of this model, we can extract samples from the posterior and then do a Monte Carlo simulation as we have done before.
pos <- rstan::extract(fit, pars = c("b0", "b_lag", "b_ppt", "sigma"))
nsim = 1000
tot_time = dim(test)[1] + 1
nClim = matrix(NA, nsim, tot_time)
init_obs = test$loglagN[test$year == 2012]
nClim[, 1] = rnorm(nsim, init_obs, 0)  # start every simulation at the observed 2012 value
for(i in 1:nsim){
  idx <- sample.int(length(pos$b0), 1)  # pick one posterior draw
  for(j in 2:tot_time){
    best_guess = pos$b0[idx] +
      pos$b_lag[idx] * nClim[i, j-1] +
      pos$b_ppt[idx] * test$ppt_Jan[test$year == 2010 + j]
    nClim[i, j] = rnorm(1, best_guess, pos$sigma[idx])
  }
}
library(coda)
CI <- HPDinterval(as.mcmc(nClim))
mp <- colMeans(nClim)
plot(c(train$year, test$year), c(train$logN, test$logN), ylim = c(6, 10), type = "l", xlab = "year", ylab = "log N")
lines(test$year, mp[2: ncol(nClim)], col = "red", lty = "solid")
lines(test$year, CI[2: ncol(nClim), 1], col = "red", lty = "dashed")
lines(test$year, CI[2: ncol(nClim), 2], col = "red", lty = "dashed")
accuracy(mp[2: tot_time], test$logN)
Alternatively, we can make predictions at the same time we fit the model, using the “generated quantities” block in the Stan model:
cat(file = "bison1.stan",
"
data {
  int<lower=1> n_train;
  int<lower=1> n_test;
  vector[n_train] logN;
  vector[n_train + n_test] ppt_Jan;
}
parameters {
  real b0;
  real b_lag;
  real b_ppt;
  real<lower=0> sigma;
}
model {
  vector[n_train] mu;
  vector[n_train] loglagN;
  b0 ~ normal(0, 10);
  b_lag ~ normal(0, 1);
  b_ppt ~ normal(0, 1);
  sigma ~ exponential(1);
  for(i in 2:n_train){
    loglagN[i] = logN[i-1];
    mu[i] = b0 + b_lag * loglagN[i] + b_ppt * ppt_Jan[i];
  }
  logN[2:n_train] ~ normal(mu[2:n_train], sigma);
}
generated quantities {
  vector[n_test] logNpred;
  vector[n_test + 1] llagN;
  llagN[1] = logN[n_train];
  for(t in 1:n_test){
    logNpred[t] = normal_rng(b0 + b_lag * llagN[t] + b_ppt * ppt_Jan[n_train + t], sigma);
    llagN[t+1] = logNpred[t];
  }
}
"
)
bison1_dat <- list(n_train = dim(btrain)[1],
                   n_test = dim(test)[1],
                   logN = log(btrain$N),
                   ppt_Jan = bison_wNA$ppt_Jan)
fit <- stan(file = 'bison1.stan', data = bison1_dat,
iter = 1000, thin = 1, chains = 3)
print(fit)
fit_summary <- summary(fit)$summary
logNpred <- fit_summary[grepl("logNpred", rownames(fit_summary)),]
plot(bb_wide$year, log(bb_wide$N),
ylim=c(6,10), type ="l",
xlab= "year", ylab = "log N")
lines(test$year, logNpred[, 1],
col = "red")
lines(test$year, logNpred[, 4],
col = "red", lty = "dashed")
lines(test$year, logNpred[, 8],
col = "red", lty = "dashed")
accuracy( as.numeric( logNpred[,1] ), test$logN)
For typical regression models, we can use the brms package by Paul Bürkner. brms is an interface between R and Stan that allows us to write linear models with R formula syntax; brms then translates the model into Stan code.
library(brms)
# mb <- brm(logN ~ loglagN + ppt_Jan, data = train)
# make_stancode(logN ~ loglagN + ppt_Jan, data=train)
mb <- brm(logN ~ loglagN + ppt_Jan,
          family = gaussian(),
          prior = c(set_prior("normal(0, 1)", class = "b"),
                    set_prior("exponential(1)", class = "sigma")),
          chains = 3,
          iter = 1000,
          warmup = 500,
          data = train)
print(mb)
plot(mb)
pos <- posterior_samples(mb)
nsim = 1000
tot_time = dim(test)[1] + 1
nClim = matrix(NA, nsim, tot_time)
init_obs = test$loglagN[test$year == 2012]
nClim[, 1] = rnorm(nsim, init_obs, 0)
for(i in 1:nsim){
  idx <- sample.int(dim(pos)[1], 1)  # pick one row (one posterior draw) from pos
  for(j in 2:tot_time){
    best_guess = pos$b_Intercept[idx] +
      pos$b_loglagN[idx] * nClim[i, j-1] +
      pos$b_ppt_Jan[idx] * test$ppt_Jan[test$year == 2010 + j]
    nClim[i, j] = rnorm(1, best_guess, pos$sigma[idx])
  }
}
library(coda)
CI <- HPDinterval(as.mcmc(nClim))
mp <- colMeans(nClim)
plot(c(train$year, test$year), c(train$logN, test$logN), ylim = c(6, 10), type = "l", xlab = "year", ylab = "log N")
lines(test$year, mp[2: ncol(nClim)], col = "red", lty = "solid")
lines(test$year, CI[2: ncol(nClim), 1], col = "red", lty = "dashed")
lines(test$year, CI[2: ncol(nClim), 2], col = "red", lty = "dashed")
accuracy(mp[2: tot_time], test$logN)
We’ve seen how to implement Bayesian models in JAGS, Stan, and brms. In general, using these methods means more work for us and for the computer. Bayes requires us to write more code, to think about which priors to use, to check that the MCMC chains have converged and that the effective sample size is sufficient, and so on. But ultimately we get essentially the same results we would have gotten if we had simply fit lm(logN ~ loglagN + ppt_Jan, data = train) and then generated predictions with a Monte Carlo simulation using the variance-covariance matrix of the coefficients.
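For comparison, here is a minimal sketch of that frequentist route, assuming the same train object built earlier (the names m_lm, betas and sigma_lm are just illustrative):

# fit the linear model and draw coefficient sets from the estimated
# multivariate normal sampling distribution of the coefficients
m_lm     <- lm(logN ~ loglagN + ppt_Jan, data = train)
betas    <- rmvnorm(1000, mean = coef(m_lm), sigma = vcov(m_lm))  # mvtnorm was loaded above
sigma_lm <- summary(m_lm)$sigma  # residual standard deviation for the process noise
head(betas)  # each row is one plausible coefficient set to drive a Monte Carlo forecast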
In general, for simple models with a lot of data, Bayesian and frequentist approaches give similar results, unless the priors are informative. But Bayesian methods allow us to easily expand and analyze our models. In addition, we don’t have to assume that the covariance of the parameters follows a multivariate normal distribution. For example, let’s see what happens when we model the daily distance moved (in km) by an elk. We will fit a Gamma distribution to these data using both maximum likelihood (frequentist) and Bayesian approaches.
library(emdbook)
library(bbmle)
datos <- read.csv("https://raw.githubusercontent.com/pbadler/forecasting-course-short/master/data/elk_steps.csv", header = TRUE)
steps <- datos$steps[1:30]
gNLL <- function(shape, scale) {
  -sum(dgamma(steps,
              shape = shape,
              scale = scale,
              log = TRUE))
}
gm <- mean(steps)
cv <- var(steps)/mean(steps) # method-of-moments estimate of the gamma scale parameter
mt <- mle2(gNLL, start = list(shape = gm/cv, scale = cv)) # start at the moment estimates
summary(mt)
Now we do a Monte Carlo simulation to get predicted daily movement distances.
mu <- coef(mt)
sigma <- vcov(mt)
hist(steps, 20, freq = FALSE, main = "", xlim = c(0, 4), ylim = c(0, 1.5))
curve(dgamma(x, shape = mu[1], scale = mu[2]), add = TRUE, lwd = 2)
NE = 100
coef_samples = rmvnorm(NE,mean=mu,sigma=sigma)
for(i in 1:NE){
  curve(dgamma(x, shape = coef_samples[i,1], scale = coef_samples[i,2]),
        add = TRUE, col = alpha("black", 0.2))
}
Same thing with Bayes:
library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
cat(file = "gamma.stan",
"
data {
  int<lower=1> n;
  vector[n] y;
}
parameters {
  real<lower=0> shape;
  real<lower=0> rate;
}
model {
  shape ~ normal(0, 5);
  rate ~ normal(0, 5);
  y ~ gamma(shape, rate);
}
generated quantities {
  vector[n] y_pred;
  for(i in 1:n) y_pred[i] = gamma_rng(shape, rate);
}
"
)
gamma_dat <- list(n = length(steps),
y = steps)
fit <- stan(file = 'gamma.stan', data = gamma_dat,
iter = 5000, thin = 5, chains = 3)
print(fit)
sims <- extract(fit, pars = c("shape", "rate"))
hist(steps, 20, freq = FALSE, main = "", xlim = c(0, 4), ylim = c(0, 1.5))
NE = 100
for(i in 1:NE){
  idx <- sample.int(length(sims$shape), 1)
  curve(dgamma(x, shape = sims$shape[idx], rate = sims$rate[idx]),
        add = TRUE, col = alpha("black", 0.2))
}
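One parameterization detail before comparing the two fits: the maximum likelihood code above used shape and scale, while Stan's gamma distribution uses shape and rate. The two are related by scale = 1/rate, which is why the code below plots 1/sims$rate on the scale axis. A quick illustrative check (not part of the original script):

mean(1 / sims$rate)  # posterior mean of the scale parameter (scale = 1/rate)
coef(mt)["scale"]    # maximum likelihood estimate of the scale, for comparison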
In this case, the differences between the predictions of the two approaches reflect the fact that the likelihood surface of this model looks more like a banana than an ellipse.
bsurf = curve3d(gNLL(x, y), sys = "none",
                from = c(0.01, 0.01), to = c(6, 1.5),
                n = c(91, 91))
image(bsurf$x, bsurf$y, log(bsurf$z),
      xlab = "shape",
      ylab = "scale",
      col = gray((20:0)/20))
points(sims$shape, 1/sims$rate, pch = 19,
       col = alpha("red", 0.1))
points(rmvnorm(1000, mean = mu, sigma = sigma),
       pch = 19, col = alpha("blue", 0.1))