See this page for background and details of the dataset used in this example.
The analysis uses the R packages rstan (version 2.18.2), parallel, coda, readr, compiler, doParallel (with foreach and iterators), and gridExtra. As the rstan startup message recommends, set `options(mc.cores = parallel::detectCores())` for execution on a local multicore CPU with excess RAM, and `rstan_options(auto_write = TRUE)` to avoid recompilation of unchanged Stan programs; note that coda masks rstan's `traceplot`.
The model below is for logistic regression with a hierarchical shrinkage prior with a global scale parameter \(\tau\) and \(P\) local scale parameters \(\lambda\), where \(P\) is the number of penalized coefficients.
The linear regression model differs from the logistic regression model only in that the response variable `y` is specified as real, its distribution is gaussian, and extra lines related to the gaussian residual standard deviation `sigma` are included.
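In symbols, the prior implemented in the transformed parameters block below places, on each penalized coefficient \(\beta_j\),
\[
\beta_j \mid \lambda_j, \tau, c \;\sim\; \mathrm{N}\!\left(0,\; \tau^2 \tilde{\lambda}_j^2\right),
\qquad
\tilde{\lambda}_j^2 = \frac{c^2 \lambda_j^2}{c^2 + \tau^2 \lambda_j^2},
\]
where \(\tau\) and the \(\lambda_j\) have half-Student-t priors (with `nu_global` and `nu_local` degrees of freedom) and the squared slab scale \(c^2\) has a scaled inverse-gamma prior set by `slab_scale` and `slab_df`.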
// saved as hs.stan
// hierarchical shrinkage prior on regression coeffs
functions {
  // elementwise inverse of a vector
  vector inv_vec(vector x) {
    vector[dims(x)[1]] res;
    for (m in 1:dims(x)[1]) {
      res[m] = 1 / x[m];
    }
    return res;
  }
}
data {
  int<lower=0> with_penalized; // indicator for including penalized variables
  int N; // number of training observations
  int P; // number of columns in X matrix
  int U; // number of unpenalized columns in model matrix
  real<lower=0> scale_icept; // prior std for the intercept
  real<lower=0> scale_global; // scale for the half-t prior for tau
  real<lower=1> nu_global; // degrees of freedom for the half-t prior for tau
  real<lower=1> nu_local; // degrees of freedom for the half-t priors for lambdas
  real<lower=0> slab_scale; // slab scale for the regularized horseshoe
  real<lower=0> slab_df; // slab degrees of freedom for the regularized horseshoe
  // vector[N] y; // uncomment for linear regression
  int<lower=0, upper=1> y[N]; // comment out for linear regression
  matrix[N, P] X; // X matrix including intercept
}
parameters {
  vector[U] beta_u; // unpenalized regression parameters
  vector[P-U] z; // auxiliary variable
  // t priors are specified as mixtures: normal scaled by sqrt(~ inv_gamma)
  // real logsigma; // uncomment for linear regression (where is the prior specified?)
  real<lower=0> aux1_global;
  real<lower=0> aux2_global;
  vector<lower=0>[P-U] aux1_local;
  vector<lower=0>[P-U] aux2_local;
  real<lower=0> caux;
}
transformed parameters {
  vector[P-U] beta_p; // penalized regression parameters
  // real<lower=0> sigma; // noise std; uncomment for linear regression
  real<lower=0> tau; // global scale parameter
  vector<lower=0>[P-U] lambda; // local scale parameters
  vector<lower=0>[P-U] lambda_tilde; // 'truncated' local shrinkage parameters
  real<lower=0> c; // slab scale
  vector[N] theta; // linear predictor
  // sigma = exp(logsigma); // uncomment for linear regression
  lambda = aux1_local .* sqrt(aux2_local);
  tau = aux1_global * sqrt(aux2_global) * scale_global;
  c = slab_scale * sqrt(caux);
  lambda_tilde = sqrt(c^2 * square(lambda) ./ (c^2 + tau^2 * square(lambda)));
  beta_p = z .* lambda_tilde * tau;
  theta = X[, 1:U] * beta_u + with_penalized * X[, (U+1):P] * beta_p;
}
model {
  // half Student-t priors for local scale parameters lambda (nu = 1 corresponds to horseshoe)
  z ~ normal(0, 1);
  aux1_local ~ normal(0, 1);
  aux2_local ~ inv_gamma(0.5 * nu_local, 0.5 * nu_local);
  aux1_global ~ normal(0, 1);
  aux2_global ~ inv_gamma(0.5 * nu_global, 0.5 * nu_global);
  caux ~ inv_gamma(0.5 * slab_df, 0.5 * slab_df);
  for (j in 1:U) { // unpenalized coefficients including intercept
    beta_u[j] ~ normal(0, scale_icept);
  }
  y ~ bernoulli_logit(theta); // comment out for linear regression
  // sigma ~ inv_gamma(1, 1); // uncomment for linear regression
  // y ~ normal(theta, sigma); // uncomment for linear regression
}
generated quantities {
  vector[P-U] kappa; // shrinkage factors
  kappa = inv_vec(1.0 + lambda_tilde .* lambda_tilde);
}
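The fitting code below assumes that the Stan program has already been compiled into the model object `hshrinkreg.model`; a minimal sketch of that step (not shown in the original) is:
hshrinkreg.model <- stan_model(file="hs.stan") # compile once; reused by sampling() and vb() below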
## save as stanhelpers.R
summarize.params <- function(samples, pars, varnames) { # returns posterior means and 95% CIs
  post.params <- As.mcmc.list(object=samples, pars=pars)
  params.summary <- summary(post.params)
  params.summary <- data.frame(mean=params.summary$statistics[, 1],
                               params.summary$quantiles[, c(1, 5)])
  colnames(params.summary) <- c("mean", "centile2.5", "centile97.5")
  rownames(params.summary) <- varnames
  return(params.summary)
}
posterior.means <- function(samples, varnames) { # posterior means of named vars
  x.summary <- summary(As.mcmc.list(object=samples, pars=varnames))$statistics
  return(x.summary[, 1])
}
get.coefficients <- function(samples, u.names, p.names) { # post. means of regression coeffs
  beta.u <- posterior.means(samples, "beta_u")
  beta.p <- posterior.means(samples, "beta_p")
  names(beta.u) <- c("(Intercept)", u.names)
  names(beta.p) <- p.names
  return(list(unpenalized=beta.u, penalized=beta.p))
  ## need a function to get predictions of y given new x data and posterior samples of beta
}
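As the final comment in get.coefficients() notes, a prediction helper is still needed. A minimal sketch (hypothetical, not part of the original stanhelpers.R) using the posterior-mean coefficients, and assuming new data X.new laid out like X (intercept and unpenalized columns first, then penalized columns), might be:
predict.probs <- function(coefs, X.new) { # hypothetical helper, not in the original script
  eta <- as.vector(X.new %*% c(coefs$unpenalized, coefs$penalized)) # linear predictor
  1 / (1 + exp(-eta)) # inverse logit gives predicted probabilities for the logistic model
}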
load("../cleveland.Rdata") # loads data objects: y, X, N, U, P, fixed params
## MCMC samples
hs.samples.mc <- sampling(hshrinkreg.model, #data=xdata.list,
iter=1000, warmup=500,
chains=4, seed=123, open_progress=FALSE,
control=list(adapt_delta=0.8))
## Warning: There were 1 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
rstan::stan_trace(hs.samples.mc, pars=c("aux1_global", "aux2_global", "caux", "kappa"))
Trace plots and density plots of shrinkage parameters from MCMC sampling
rstan::stan_dens(hs.samples.mc, pars=c("aux1_global", "aux2_global", "caux", "kappa"))
Trace plots and density plots of shrinkage parameters from MCMC sampling
if(logistic) {
varnames <- c("tau", "lp__")
} else {
varnames <- c("sigma", "tau", "lp__")
}
params.summary.mc <- summarize.params(samples=hs.samples.mc,
pars=varnames,
varnames=varnames)
params.mean.mc <- params.summary.mc[, 1]
names(params.mean.mc) <- rownames(params.summary.mc)
pander(signif(params.summary.mc, 2))
|          | mean | centile2.5 | centile97.5 |
|----------|------|------------|-------------|
| **tau**  | 2.8  | 0.25       | 9.3         |
| **lp__** | -150 | -160       | -140        |
#varnames <- names(hs.samples.mc)
kappa.mean.mc <- posterior.means(hs.samples.mc, "kappa")
kappas.mc <- data.frame(kappa=as.numeric(kappa.mean.mc),
variable=p.varnames)
kappas.mc$variable <- factor(kappas.mc$variable, levels = kappas.mc$variable)
p1 <- ggplot(kappas.mc, aes(variable, kappa)) + geom_col() + coord_flip() +
ylim(0, 1)
beta_p.mean.mc <- posterior.means(hs.samples.mc, "beta_p")
betas.mc <- data.frame(beta=as.numeric(beta_p.mean.mc),
variable=p.varnames)
betas.mc$variable <- factor(betas.mc$variable, levels = betas.mc$variable)
p2 <- ggplot(betas.mc, aes(variable, beta)) + geom_col() + coord_flip()
grid.arrange(p1, p2, nrow=1)
Posterior means of regression parameters from MCMC sampling
beta.samples <- As.mcmc.list(hs.samples.mc, pars=c("beta_u", "beta_p"))
beta.samples <- rbind(beta.samples[[1]], beta.samples[[2]],
beta.samples[[3]], beta.samples[[4]])
beta.samples <- t(beta.samples) # P x S
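An equivalent shortcut, not used here but sometimes convenient, is to let rstan flatten the chains directly:
## same P x S matrix of posterior draws, with all chains combined in one step
beta.samples <- t(as.matrix(hs.samples.mc, pars=c("beta_u", "beta_p")))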
Variational Bayes is an approximation to full Bayesian inference in which the true posterior distribution is approximated by a simpler distribution, chosen to minimize the Kullback-Leibler divergence \(\mathrm{KL}(q \,\|\, p)\) between the approximating distribution \(q\) and the true posterior \(p\). Variational Bayes approximation is usually much faster than sampling the posterior by Markov chain Monte Carlo algorithms, and in many situations the approximation is good enough for practical purposes. The mean field approximation restricts \(q\) to be a separable (fully factorized) distribution. A special case of variational Bayes inference is to use the _maximum a posteriori_ estimates: this is equivalent to approximating the true posterior by a delta function in which all parameters are fixed at the posterior mode. However bad the mean field approximation may be, it will always be better than the maximum a posteriori approximation.
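In the mean field case the approximating family and the objective are (standard definitions, not specific to this model):
\[
q(\theta) = \prod_{j} q_j(\theta_j), \qquad
q^{*} = \arg\min_{q} \, \mathrm{KL}\!\left(q(\theta) \,\middle\|\, p(\theta \mid y)\right).
\]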
The code below fits a variational Bayes mean field approximation and generates samples from this approximate posterior distribution. The results are similar to those obtained with MCMC sampling.
## algorithm="fullrank" does not work well with penalized regression
sink("vb.out")
hs.samples.vb <- vb(hshrinkreg.model, # data=xdata.list,
iter=50000,
output_samples=2000,
init='random',
algorithm="meanfield" #algorithm="fullrank"
)
sink()
params.summary.vb <- summarize.params(samples=hs.samples.vb,
pars=varnames,
varnames=varnames)
params.mean.vb <- params.summary.vb[, 1]
names(params.mean.vb) <- rownames(params.summary.vb)
pander(signif(params.summary.vb, 2))
kappa.mean.vb <- posterior.means(hs.samples.vb, "kappa")
kappas.vb <- data.frame(kappa=as.numeric(kappa.mean.vb),
variable=p.varnames)
p1 <- ggplot(kappas.vb, aes(variable, kappa)) + geom_col() + coord_flip() +
ylim(0, 1)
beta_p.mean.vb <- posterior.means(hs.samples.vb, "beta_p")
betas.vb <- data.frame(beta=as.numeric(beta_p.mean.vb),
variable=p.varnames)
p2 <- ggplot(betas.vb, aes(variable, beta)) + geom_col() + coord_flip()
grid.arrange(p1, p2, nrow=1)
Posterior means of shrinkage coefficients and regression coefficients, obtained with variational mean field approximation
Although the ADVI algorithm generates the evidence lower bound (ELBO) automatically, the current release of Stan outputs this only to one significant figure, which is useless for model comparison.
To fix this, edit the file `advi.hpp` within the StanHeaders package directory. If the package is installed system-wide, this file will be in `/usr/local/lib/R/site-library/StanHeaders/include/src/stan/variational/`; otherwise it will be in the equivalent folder in your home directory. Edit line 414 of `advi.hpp` to change `setprecision(1)` to `setprecision(6)`.
Update: this fix has been accepted by the package developers and will be in the next release.
The ELBO is output only to the console, and this is not so easy to fix. To be able to plot the ELBO values, we have to resort to a hack: save the console output from each run to a file, tidy it up, and read it back into R as a data frame.
This R script will run the variational Bayes algorithm four times on the Stan model compiled from `hs.stan`, using the dataset object `xdata.list`, saving the posterior samples from each run to `vb.list` and the ELBO values from each run to `elbo.list`.
N.vbruns <- 4
vb.list <- vector("list", N.vbruns)
elbo.list <- vector("list", N.vbruns)
if(FALSE) {
for(i in 1:N.vbruns) {
sink(paste0("vb_", i, ".out"))
vb.list[[i]] <- vb(hshrinkreg.model, data=xdata.list, iter=5000,
output_samples=2000,
init='random',
algorithm="meanfield", #algorithm="fullrank",
elbo_samples=200, # default value of 100 is too low
eval_elbo=100,
tol_rel_obj=0.01
)
sink()
system(paste("./saveelbo.sh", i))
# elbo.list[[i]] <- read.table(file=paste0("vb.cleaned_", i, ".out"), header=TRUE)
}
}
To clean up the saved console output before reading the ELBO iterations into R as a data frame, the R script uses the bash script `saveelbo.sh`:
#!/bin/bash
i=$1
vboutfile=vb_$i.out
grep -A 1000 " iter " $vboutfile | sed 's/^ \+//' | sed 's/ \+/\t/g' | \
cut -f1-4 | tac | grep -A 1000 "^[0-9]" | tac > vb.cleaned_$i.out
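Once `saveelbo.sh` has been run for each output file, the cleaned ELBO traces can be read back into `elbo.list`; this is the same step as the commented-out `read.table` line inside the loop above.
for(i in 1:N.vbruns) {
  elbo.list[[i]] <- read.table(file=paste0("vb.cleaned_", i, ".out"), header=TRUE)
}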
This R code will use the saved object `elbo.list` to plot the trajectory of ELBO values in each run, and will print a correlation matrix for the model parameters so that we can see how well the runs agree.
elbo.min <- numeric(N.vbruns)
elbo.max <- numeric(N.vbruns)
for(i in 1:N.vbruns) {
elbo.min[i] <- min(elbo.list[[i]]$ELBO)
elbo.max[i] <- max(elbo.list[[i]]$ELBO)
}
elbo.plot <- ggplot(data=elbo.list[[1]], aes(x=iter, y=ELBO, group=1)) +
geom_line() + geom_point() +
labs(x="Iterations", y="Evidence lower bound") +
ylim(c(min(elbo.max-500), max(elbo.max))) +
theme_grey(base_size = 16)
for(i in 2:N.vbruns) {
elbo.plot <- elbo.plot +
geom_line(data=elbo.list[[i]], aes(x=iter, y=ELBO, group=i)) +
geom_point(data=elbo.list[[i]], aes(x=iter, y=ELBO, group=i))
}
elbo.plot
params.means <- NULL
for(i in 1:N.vbruns) {
  ## posterior means of the penalized regression coefficients beta_p from each run
  params.means <- cbind(params.means,
                        posterior.means(vb.list[[i]], "beta_p"))
}
print(data.frame(round(cor(params.means), 2), elbo.max))
Piironen and Vehtari (2015) describe an efficient approach to re-using the posterior distribution to select a subset of variables that maximizes predictive performance for the number of retained variables, based on forward selection of predictors by projection of the full linear predictor onto subspaces of variables.
For each model based on a subset of \(Q\) columns of the \(N \times P\) predictor matrix \(\boldsymbol{X}\), the \(S\) samples of fitted values are regressed on these columns of \(\boldsymbol{X}\), and the KL divergence of the distribution of the full-model fitted values from the distribution of the subspace fitted values is computed.
This procedure is used for forward selection: starting with a baseline model, variables are added one at a time so that at each step the variable whose addition gives the smallest KL divergence of the full model from the subspace model is chosen.
For linear regression, the subspace fitted values can be computed efficiently as a matrix product. For logistic regression, fitting at each step \(P - Q\) models that add one of the remaining variables to the \(Q\) variables chosen so far, on each of the \(S\) samples, is too slow. Instead we fit a single model regressing the full-model fitted values on the \(Q\) variables chosen so far (using `glm.fit` with `family=quasibinomial()`), and compute a score test for the remaining \(P - Q\) variables to determine which one will most improve the fit when added to the current model.
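For reference, the divergences computed in `getkl()` below are, for the Gaussian and Bernoulli cases respectively (averaging over observations and posterior samples),
\[
\mathrm{KL}_{\text{gauss}} = \tfrac{1}{2}\log\frac{\sigma_p^2}{\sigma^2},
\qquad \sigma_p^2 = \sigma^2 + \overline{(\mu - \mu_p)^2},
\]
\[
\mathrm{KL}_{\text{logit}} = \mu \log\frac{\mu}{\mu_p} + (1-\mu)\log\frac{1-\mu}{1-\mu_p},
\]
where \(\mu\) denotes a full-model fitted value and \(\mu_p\) the corresponding projected (subspace) fitted value.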
xlogy <- function(x, y) { # x * log(y), with 0 * log(0) taken as 0
  z <- numeric(length(x))
  z[x == 0] <- 0 # the limit of x * log(x) as x -> 0 from above
  z[x > 0] <- x[x > 0] * log(y[x > 0])
  return(z)
}
getkl <- function(yfit, yfitp, logistic, sigmasq=NULL) {
  ## estimate KL divergence between the full and projected model
  ## yfit and yfitp are both N x S matrices
  if(!logistic) {
    sigmasqp <- sigmasq + colMeans((yfit - yfitp)^2)
    kl <- mean(0.5 * log(sigmasqp/sigmasq))
  } else {
    kl <- mean( xlogy(yfit, yfit) - xlogy(yfit, yfitp) +
                xlogy(1 - yfit, 1 - yfit) - xlogy(1 - yfit, 1 - yfitp) )
  }
  return(kl)
}
getkl.compiled <- cmpfun(getkl)
score.logistic <- function(y, p, x) { # score test for each X variable not in null model
## y, p have N rows indexing individuals, S cols indexing posterior samples
## x has P rows and N columns
resids.samples <- (y - p) # N x S
score.samples <- x %*% resids.samples # P x S, sum over indivs before averaging over posterior
missinfo <- apply(score.samples, 1, var) # length P
score <- rowMeans(score.samples) # length P
compinfo <- rowSums(x^2 %*% (p * (1 - p))) # length P
Z <- score / sqrt((compinfo - missinfo)) # length P
return(Z)
}
score.logistic.compiled <- cmpfun(score.logistic)
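In symbols, score.logistic computes, for each candidate variable \(j\) and each posterior sample \(s\),
\[
U_{js} = \sum_{i=1}^{N} x_{ij}\,(\mu_{is} - \hat{\mu}_{is}), \qquad
Z_j = \frac{\bar{U}_j}{\sqrt{I_j - V_j}},
\]
where \(\mu_{is}\) are the full-model fitted values, \(\hat{\mu}_{is}\) the fitted values of the current null model, \(\bar{U}_j\) and \(V_j\) are the mean and variance of \(U_{js}\) over samples, and \(I_j = \sum_s \sum_i x_{ij}^2\, \hat{\mu}_{is}(1-\hat{\mu}_{is})\) is the accumulated complete information. The variable with the largest \(|Z_j|\) is the one added next (see choose.nextvariable below).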
choose.nextvariable <- function(fit, fitp, xleft) {
## fit is N x S matrix of fitted values given full model
## fitp is matrix of fitted values given null model
## xleft is model matrix of dimension N x nleft
Z <- score.logistic(fit, fitp, t(xleft))
return(which.max(abs(Z))) # returns variable as index column of xleft matrix
}
choose.nextvariable.compiled <- cmpfun(choose.nextvariable)
lm_proj <- function(fit, xp, sigmasq=NULL) {
## fit is N x S matrix of fitted values given full model
## sigmasq is residual variance if gaussian
## xp is model matrix of dimension N x P, including column of ones
## get KL divergence of full-model fitted values from sub-space fitted values
if(!logistic) {
## regress S samples of fitted values fit on subset of X variables xp
## to get projected coefficients wp
wp <- solve(t(xp) %*% xp, t(xp) %*% fit) # matrix of dimension Q x S
## predict from these projected coefficients
fitp <- xp %*% wp
kl <- getkl.compiled(fit, fitp, logistic=FALSE, sigmasq)
} else {
## this should be parallelized
## alternatively write the iteratively reweighted least squares algo to use
## a matrix of y values
## fit is an N x S matrix
## why does this use so much memory?
fitvalues <- function(s) {
glm.fit(x=xp, y=fit[, s],
family=quasibinomial())$fitted.value
}
fitp <- foreach(s=1:ncol(fit),
.combine=cbind,
.inorder=TRUE) %dopar% fitvalues(s=s)
# browser("fitp")
# fitp <- sapply(1:ncol(fit),
# Vectorize(function(s)
# glm.fit(x=xp, y=fit[, s],
# family=quasibinomial())$fitted.value))
kl <- getkl.compiled(fit, fitp, logistic=TRUE, sigmasq)
}
return(list(kl=kl, fitp=fitp))
}
lm_proj.compiled <- cmpfun(lm_proj)
get.projkl <- function(i, chosen, notchosen, fit, x, ind, sigmasq=NULL) {
  ## KL divergence when candidate variable notchosen[i] is added to the chosen set
  ind <- sort( c(chosen, notchosen[i]) )
  kl <- lm_proj.compiled(fit=fit, xp=x[, ind], sigmasq=sigmasq)
  return(kl)
}
get.projkl.compiled <- cmpfun(get.projkl)
lm_fprojsel <- function(w, x, U, sigmasq=NULL) {# forward selection using the projection
## U is number of unpenalized variables (always chosen) including intercept
## returns indices of variables chosen (in order of selection) and KL divergences.
fit <- x %*% w # matrix of dimension N x S: N individuals, S samples from posterior
if(logistic) {
fit <- (1 + exp(-fit))^-1
}
P = dim(x)[2]
chosen <- 1:U
## start from model with unpenalized variables only
notchosen <- setdiff(1:P, chosen) # notchosen is a vector of columns in x matrix
nleft <- length(notchosen)
kl <- rep(NA, P - U + 1)
klfit <- lm_proj.compiled(fit, x[, chosen], sigmasq)
kl[1] <- klfit$kl
kl.index <- 2
cat("KL of model with unpenalized variables only", kl[1], "\n")
## add variables one at a time up to 100
while(nleft > 0 & length(chosen) <= 100) {
cat("selecting penalized variable from", nleft, "remaining ... ")
## null model: all x variables chosen so far,
## score test for all nleft remaining variables
## return index of penalized variable with most extreme test statistic
## this value is the column of x[, notchosen]
nextvar <-
choose.nextvariable.compiled(fit=fit, fitp=klfit$fitp, xleft=x[, notchosen])
chosen <- c(chosen, notchosen[nextvar])
notchosen <- setdiff(1:P, chosen)
cat("calculating KL divergence of new model ... ")
## calculate KL divergence of new model and re-use the fitted values
klfit <- lm_proj.compiled(fit, x[, chosen], sigmasq) # fitted values will be re-used
kl[kl.index] <- klfit$kl
cat("variable", nextvar, "added, KL divergence ",
round(kl[kl.index], digits=4), "\n")
kl.index <- kl.index + 1
nleft <- length(notchosen)
}
return(list(chosen=chosen, kl=kl))
}
studyname <- "cleveland"
penalized.vartype <- "variables"
# cat("variable selection by forward selection using projection ...")
spath <- lm_fprojsel(w=beta.samples, x=X, U=U)
Warning: executing %dopar% sequentially: no parallel backend registered
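This warning appears because no parallel backend had been registered for `%dopar%`, so the projections inside `lm_proj` ran sequentially. Registering a backend before calling `lm_fprojsel` (a setup step the original does not show; the core count here is arbitrary) avoids it:
library(doParallel)
registerDoParallel(cores=parallel::detectCores() - 1) # backend for the %dopar% loop in lm_proj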
save(spath, file="spath.Rdata")
#cat("done\n")
## the first elements of spath$chosen are the U unpenalized columns, and spath$kl[1] is the KL divergence of the baseline model
beta.means <- get_posterior_mean(hs.samples.mc, pars=c("beta_u", "beta_p"))
beta.means <- beta.means[, ncol(beta.means)]
chosen <- spath$chosen[-(1:U)]
selected <- data.frame(variable=c("baseline", beta.names[chosen]),
kl=spath$kl[1:(1 + length(chosen))])
selected$rank <- 1:nrow(selected)
selected$beta <- c(1, beta.means[chosen])
print(head(selected))
sel.varnames <- as.character(selected$variable)
max.numlabels <- which(selected$kl / selected$kl[1] < 0.2)[1]
cat(max.numlabels, "variables needed to reduce KL divergence from projection to full model by 80%\n")
max.numlabels <- ifelse(max.numlabels > 7, 7, max.numlabels)
sel.varnames[c(1, max.numlabels:length(sel.varnames))] <- ""
sign.effect <- as.factor(selected$beta >= 0)
#postscript(paste0(studyname, "_projectiveselection.eps"))
p <- ggplot(data=selected, aes(rank-1, kl)) +
geom_point() +
scale_y_reverse(lim=c(max(selected$kl), 0), expand=c(0, 0)) +
xlim(c(0, max(selected$rank))) +
xlab(paste("Number of", penalized.vartype, "retained")) +
ylab("KL divergence from projection\nto full model (nat logs)") +
theme(axis.line.x=element_line()) +
geom_text(aes(label=sel.varnames, colour=sign.effect),
hjust=0, vjust=0,
position=position_nudge(x=0.02 * max(selected$rank), y=-0.01), size=6,
show.legend=FALSE) +
theme_grey(base_size = 18)
print(p)
Warning: Removed 1 rows containing missing values (geom_point).
Warning: Removed 2 rows containing missing values (geom_text).
Projection predictive forward selection of variables
#dev.off()
lastrow <- ifelse(nrow(selected) > 25, 25, nrow(selected))
selected.top <- selected[2:lastrow, ]
selected.top$variable <- factor(selected.top$variable,
levels=rev(as.character(selected.top$variable)))
postscript(paste0(studyname, "_topcoeffs.eps"))
p <- ggplot(data=selected.top, aes(x=variable, y=beta)) +
geom_bar(stat="identity") +
coord_flip() +
ylab("Coefficient when added in sequence") +
theme_grey(base_size = 18)
print(p)
dev.off()