Stan model for regression with hierarchical shrinkage prior

See this page for background and details of the dataset used in this example.

The analysis uses the R packages rstan (version 2.18.2, with StanHeaders), coda, readr, compiler, parallel, doParallel, foreach, iterators and gridExtra. On a local multicore machine, rstan recommends setting options(mc.cores = parallel::detectCores()) so that chains run in parallel, and rstan_options(auto_write = TRUE) to avoid recompiling unchanged Stan programs; note also that coda masks rstan's traceplot function.

The model below is for logistic regression with a hierarchical shrinkage (regularized horseshoe) prior, with a global scale parameter \(\tau\) and one local scale parameter \(\lambda_j\) for each penalized coefficient. In the Stan code, \(P\) is the total number of columns in the model matrix and \(U\) the number of unpenalized columns, so there are \(P - U\) penalized coefficients.

The linear regression model differs from the logistic regression model only in that the response variable y is declared as real, its distribution is Gaussian, and extra lines for the Gaussian residual standard deviation sigma are included; these alternatives appear as commented-out lines in the code below.
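
Written out in the parameterization used by the Stan code below, the prior on each penalized coefficient \(\beta_j\) is

\[
\beta_j = \tau \, \tilde{\lambda}_j \, z_j, \qquad z_j \sim \mathrm{N}(0, 1), \qquad
\tilde{\lambda}_j^2 = \frac{c^2 \lambda_j^2}{c^2 + \tau^2 \lambda_j^2},
\]

\[
\lambda_j \sim t^{+}_{\nu_{\mathrm{local}}}(0, 1), \qquad
\tau \sim t^{+}_{\nu_{\mathrm{global}}}(0, \mathrm{scale\_global}), \qquad
c^2 = \mathrm{slab\_scale}^2 \, c_{\mathrm{aux}}, \quad
c_{\mathrm{aux}} \sim \mathrm{Inv\text{-}Gamma}\!\left(\tfrac{\mathrm{slab\_df}}{2}, \tfrac{\mathrm{slab\_df}}{2}\right),
\]

where the half-t priors on \(\lambda_j\) and \(\tau\) are implemented in the code as half-normal variables scaled by the square root of inverse-gamma variables (the aux1 and aux2 parameters).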

Logistic regression model specified in Stan

// saved as hs.stan

// hierarchical shrinkage prior on regression coeffs 

functions {
  // elementwise inverse of a vector
  vector inv_vec(vector x) {
    vector[dims(x)[1]] res;
    for (m in 1:dims(x)[1]) {
      res[m] = 1 / x[m];
    }
    return res;
  }
}

data {
  int<lower=0> with_penalized; // indicator for including penalized variables
  int N; // number of training observations
  int P; // number of columns in X matrix
  int U; // number of unpenalized columns in model matrix
  real<lower=0> scale_icept; // prior std for the intercept
  real<lower=0> scale_global; // scale for the half-t prior for tau
  real<lower=1> nu_global; // degrees of freedom for the half-t prior for tau
  real<lower=1> nu_local; // degrees of freedom for the half-t priors for lambdas
  real<lower=0> slab_scale; // slab scale for the regularized horseshoe
  real<lower=0> slab_df; // slab degrees of freedom for the regularized horseshoe
  // vector[N] y; // uncomment for linear regression
  int<lower=0, upper=1> y[N]; // comment out for linear regression
  matrix[N, P] X; // X matrix including intercept
}

parameters {
  vector[U] beta_u; // unpenalized regression parameters
  vector[P-U] z; // auxiliary variable

  // half-t priors are specified as scale mixtures: half-normal scaled by sqrt(inv_gamma)
  // real logsigma; // uncomment for linear regression (the prior on sigma is in the model block)
  real<lower=0> aux1_global;
  real<lower=0> aux2_global;
  vector<lower=0>[P-U] aux1_local;
  vector<lower=0>[P-U] aux2_local;
  real<lower=0> caux;
}

transformed parameters {
  vector[P-U] beta_p; // penalized regression parameters
  // real<lower=0> sigma; // noise std; uncomment for linear regression
  real<lower=0> tau; // global scale parameter
  vector<lower=0>[P-U] lambda; // local scale parameters
  vector<lower=0>[P-U] lambda_tilde; // 'truncated' local shrinkage parameters
  real<lower=0> c; // slab scale
  vector[N] theta; // linear predictor

  // sigma = exp(logsigma); // uncomment for linear regression
  lambda = aux1_local .* sqrt(aux2_local);
  tau = aux1_global * sqrt(aux2_global) * scale_global;
  c = slab_scale * sqrt(caux);
  lambda_tilde = sqrt(c^2 * square(lambda) ./ (c^2 + tau^2 * square(lambda)));
  beta_p = z .* lambda_tilde * tau;
  theta = X[, 1:U] * beta_u + with_penalized * X[, (U+1):P] * beta_p;
}

model {
  // half Student-t priors for local scale parameters lambda (nu = 1 corresponds to the horseshoe)
  z ~ normal(0, 1);
  aux1_local ~ normal(0, 1);
  aux2_local ~ inv_gamma(0.5 * nu_local, 0.5 * nu_local);
  aux1_global ~ normal(0, 1);
  aux2_global ~ inv_gamma(0.5 * nu_global, 0.5 * nu_global);
  caux ~ inv_gamma(0.5 * slab_df, 0.5 * slab_df);

  for (j in 1:U) { // unpenalized coefficients including intercept
    beta_u[j] ~ normal(0, scale_icept);
  }

  y ~ bernoulli_logit(theta); // comment out for linear regression
  // sigma ~ inv_gamma(1, 1); // uncomment for linear regression
  // y ~ normal(theta, sigma); // uncomment for linear regression
}

generated quantities {
  vector[P-U] kappa; // per-variable shrinkage factors: kappa[j] = 1 / (1 + lambda_tilde[j]^2)
  kappa = inv_vec(1.0 + lambda_tilde .* lambda_tilde);
}
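
The sampling and vb calls later on this page use a compiled model object hshrinkreg.model and a data list xdata.list, neither of which is shown in the chunks here. A minimal sketch of how they could be set up is below; the hyperparameter values are illustrative placeholders, not the values used for the results on this page.

hshrinkreg.model <- rstan::stan_model("hs.stan") # compiled once; cached if rstan_options(auto_write = TRUE)
xdata.list <- list(with_penalized = 1, N = N, P = P, U = U,
                   scale_icept = 10,             # placeholder prior sd for unpenalized coefficients
                   scale_global = 0.01,          # placeholder scale of the half-t prior on tau
                   nu_global = 1, nu_local = 1,  # half-Cauchy global and local scales
                   slab_scale = 2, slab_df = 4,  # placeholder regularized-horseshoe slab
                   y = y, X = X)                 # y, X, N, U, P as loaded from cleveland.Rdata below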

Some useful R functions for processing the output of Stan

## save as stanhelpers.R

summarize.params <- function(samples, pars, varnames) { # returns posterior means and 95% CIs
    post.params <- As.mcmc.list(object=samples, pars)
    params.summary <- summary(post.params)
    params.summary <- data.frame(mean=params.summary$statistics[, 1],
                                 params.summary$quantiles[, c(1, 5)])
    colnames(params.summary) <- c("mean", "centile2.5", "centile97.5")
    rownames(params.summary) <- varnames
    return(params.summary)
}

posterior.means <- function(samples, varnames) { # posterior means of named vars
    x.summary <- summary(As.mcmc.list(object=samples, pars=varnames))$statistics
    return(x.summary[, 1])
}

get.coefficients <- function(samples, u.names, p.names) { # post. means of regression coeffs
    beta.u <- posterior.means(samples, "beta_u")
    beta.p <- posterior.means(samples, "beta_p")
    names(beta.u) <- c("(Intercept)", u.names)
    names(beta.p) <- p.names
    return(list(unpenalized=beta.u, penalized=beta.p))

    ## need a function to get predictions of y given new x data and  posterior samples of beta

}
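
As noted in the comment above, a function to generate predictions for new data from the posterior samples of the coefficients is still missing. A minimal sketch for the logistic case is given below; the function name and interface are illustrative and not part of the original stanhelpers.R. It expects the coefficient draws in the P x S layout (rows beta_u then beta_p) constructed further down this page as beta.samples.

predict.probs <- function(X.new, beta.samples) { # posterior mean predicted probabilities
    ## X.new: N.new x P model matrix with the same column order as X
    ## beta.samples: P x S matrix of posterior draws of c(beta_u, beta_p)
    eta <- X.new %*% beta.samples # N.new x S linear predictors
    return(rowMeans(plogis(eta))) # average over posterior draws
}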

Using MCMC sampling to generate the posterior distribution

load("../cleveland.Rdata") # loads data objects: y, X, N, U, P, fixed params

## MCMC samples
hs.samples.mc <- sampling(hshrinkreg.model, #data=xdata.list,
                          iter=1000, warmup=500,
                          chains=4, seed=123, open_progress=FALSE,
                          control=list(adapt_delta=0.8))
## Warning: There were 1 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
rstan::stan_trace(hs.samples.mc, pars=c("aux1_global", "aux2_global", "caux", "kappa"))
Trace plots of shrinkage parameters from MCMC sampling

rstan::stan_dens(hs.samples.mc, pars=c("aux1_global", "aux2_global", "caux", "kappa"))
Density plots of shrinkage parameters from MCMC sampling

if(logistic) {
    varnames <- c("tau", "lp__")
} else {
    varnames <- c("sigma", "tau", "lp__")
}
params.summary.mc <- summarize.params(samples=hs.samples.mc,
                                      pars=varnames,
                                      varnames=varnames)
params.mean.mc <- params.summary.mc[, 1]
names(params.mean.mc) <- rownames(params.summary.mc)
pander(signif(params.summary.mc, 2))
|          | mean | centile2.5 | centile97.5 |
|----------|-----:|-----------:|------------:|
| tau      |  2.8 |       0.25 |         9.3 |
| **lp__** | -150 |       -160 |        -140 |
#varnames <- names(hs.samples.mc)

kappa.mean.mc <- posterior.means(hs.samples.mc, "kappa")
kappas.mc <- data.frame(kappa=as.numeric(kappa.mean.mc),
                        variable=p.varnames)
kappas.mc$variable <- factor(kappas.mc$variable, levels = kappas.mc$variable)

p1 <- ggplot(kappas.mc, aes(variable, kappa)) + geom_col() + coord_flip() +
    ylim(0, 1)

beta_p.mean.mc <- posterior.means(hs.samples.mc, "beta_p")
betas.mc <- data.frame(beta=as.numeric(beta_p.mean.mc),
                    variable=p.varnames)
betas.mc$variable <- factor(betas.mc$variable, levels = betas.mc$variable)

p2 <- ggplot(betas.mc, aes(variable, beta)) + geom_col() + coord_flip()

grid.arrange(p1, p2, nrow=1)
Posterior means of regression parameters from MCMC sampling

beta.samples <- As.mcmc.list(hs.samples.mc, pars=c("beta_u", "beta_p")) # one mcmc object per chain
beta.samples <- rbind(beta.samples[[1]], beta.samples[[2]],
                      beta.samples[[3]], beta.samples[[4]]) # stack the 4 chains: S x P
beta.samples <- t(beta.samples) # P x S, rows ordered beta_u then beta_p

Variational Bayes as an alternative to MCMC

Variational Bayes is an approximation to full Bayesian inference in which the true posterior is approximated by a simpler distribution \(q\), chosen to minimize the Kullback-Leibler divergence \(KL(q \,\|\, p)\) of the approximating distribution from the true posterior \(p\). The variational Bayes approximation is usually much faster than sampling the posterior by Markov chain Monte Carlo, and in many situations the approximation is good enough for practical purposes. The mean field approximation restricts \(q\) to a separable (fully factorized) distribution. A special case of variational Bayes inference is to use the _maximum a posteriori_ estimates: this is equivalent to approximating the true posterior by a delta function in which all parameters are fixed at the posterior mode. However bad the mean field approximation may be, it will always be better than the maximum a posteriori approximation.
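
In symbols (a generic statement of the variational objective, not specific to this model): the mean field family takes \(q(\theta) = \prod_j q_j(\theta_j)\), and the fitted approximation is

\[
\hat{q} = \arg\min_{q} \; KL\big(q(\theta) \,\|\, p(\theta \mid y)\big)
        = \arg\max_{q} \; \mathrm{E}_{q}\big[\log p(y, \theta) - \log q(\theta)\big],
\]

where the right-hand side is the evidence lower bound (ELBO) that Stan's ADVI algorithm reports and that is monitored later on this page.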

The code below fits a variational Bayes mean field approximation and generates samples from this approximate posterior distribution. The results are similar to those obtained with MCMC sampling.

## algorithm="full rank" does not work well with penalized regression
sink("vb.out")
hs.samples.vb <- vb(hshrinkreg.model, # data=xdata.list,
                    iter=50000,
                    output_samples=2000,
                    init='random', 
                    algorithm="meanfield"  #algorithm="fullrank"
                    )
sink()

params.summary.vb <- summarize.params(samples=hs.samples.vb,
                                      pars=varnames,
                                      varnames=varnames)
params.mean.vb <- params.summary.vb[, 1]
names(params.mean.vb) <- rownames(params.summary.vb)
pander(signif(params.summary.vb, 2))

kappa.mean.vb <- posterior.means(hs.samples.vb, "kappa")
kappas.vb <- data.frame(kappa=as.numeric(kappa.mean.vb),
                    variable=p.varnames)
p1 <- ggplot(kappas.vb, aes(variable, kappa)) + geom_col() + coord_flip() +
    ylim(0, 1)

beta_p.mean.vb <- posterior.means(hs.samples.vb, "beta_p")
betas.vb <- data.frame(beta=as.numeric(beta_p.mean.vb),
                       variable=p.varnames)
p2 <- ggplot(betas.vb, aes(variable, beta)) + geom_col() + coord_flip()

grid.arrange(p1, p2, nrow=1)
Posterior means of shrinkage coefficients and regression coefficients, obtained with variational mean field approximation

Monitoring the evidence lower bound when running the variational Bayes algorithm

Although the ADVI algorithm generates the evidence lower bound (ELBO) automatically, the current release of Stan outputs this only to one significant figure, which is useless for model comparison.

To fix this, edit the file advi.hpp within the StanHeaders package directory. If the package is installed system-wide, the location of this file will be in this directory:

/usr/local/lib/R/site-library/StanHeaders/include/src/stan/variational/

Otherwise it will be in the equivalent folder in your home directory.

Edit line 414 of advi.hpp to change setprecision(1) to setprecision(6).

Update: this fix has been accepted by the package developers and will be included in the next release.

The ELBO is output only to the console, and this is not so easy to fix. To be able to plot the ELBO values, we have to resort to a hack: save the console output from each run to a file, tidy it up, and read it back into R as a data frame.

The R script below runs the variational Bayes algorithm four times on the compiled Stan model object hshrinkreg.model (from hs.stan) with the dataset object xdata.list, saving the posterior samples from each run to vb.list and the ELBO values from each run to elbo.list.

N.vbruns <- 4
vb.list <- vector("list", N.vbruns)
elbo.list <- vector("list", N.vbruns)

if(FALSE) { # set to TRUE to re-run the four variational Bayes fits
    for(i in 1:N.vbruns) {
        sink(paste0("vb_", i, ".out"))
        vb.list[[i]] <- vb(hshrinkreg.model, data=xdata.list, iter=5000,
                           output_samples=2000,
                           init='random', 
                           algorithm="meanfield",  #algorithm="fullrank",
                           elbo_samples=200, # default value of 100 is too low
                           eval_elbo=100,
                           tol_rel_obj=0.01
                           )
        sink()
        system(paste("./saveelbo.sh", i))
                                        # elbo.list[[i]] <- read.table(file=paste0("vb.cleaned_", i, ".out"), header=TRUE)
    }
}

To clean up the saved console output before reading the ELBO iterations into R as a data frame, the R script uses the bash script saveelbo.sh:

#!/bin/bash
## extract the ELBO iteration table from the saved console output of run $1
i=$1
vboutfile=vb_$i.out
grep -A 1000 " iter " $vboutfile | sed 's/^ \+//' | sed 's/ \+/\t/g' | \
    cut -f1-4 | tac | grep -A 1000 "^[0-9]" | tac > vb.cleaned_$i.out
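
After the shell script has run, the cleaned files can be read back into elbo.list, as in the commented-out line in the loop above; a minimal version is:

for(i in 1:N.vbruns) {
    elbo.list[[i]] <- read.table(file=paste0("vb.cleaned_", i, ".out"), header=TRUE)
}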

This R code uses the saved object elbo.list to plot the trajectory of ELBO values in each run, and prints a correlation matrix of the parameter estimates so we can see how well the runs agree.

elbo.min <- numeric(N.vbruns)
elbo.max <- numeric(N.vbruns)
for(i in 1:N.vbruns) {
    elbo.min[i] <- min(elbo.list[[i]]$ELBO)
    elbo.max[i] <- max(elbo.list[[i]]$ELBO)
}

elbo.plot <- ggplot(data=elbo.list[[1]], aes(x=iter, y=ELBO, group=1)) +
    geom_line() + geom_point() +
    labs(x="Iterations", y="Evidence lower bound") +
    ylim(c(min(elbo.max-500), max(elbo.max))) +
    theme_grey(base_size = 16) 

for(i in 2:N.vbruns) {
    elbo.plot <- elbo.plot +
        geom_line(data=elbo.list[[i]], aes(x=iter, y=ELBO, group=i)) +
        geom_point(data=elbo.list[[i]], aes(x=iter, y=ELBO, group=i))
}
elbo.plot

params.means <- NULL
for(i in 1:N.vbruns) {
    varnames <- c("beta_p") # compare posterior means of the penalized coefficients across runs
    params.means <- cbind(params.means,
                          posterior.means(samples=vb.list[[i]], varnames=varnames))
}
print(data.frame(round(cor(params.means), 2), elbo.max))

Projection predictive variable selection

Piironen and Vehtari (2015) describe an efficient approach to re-using the posterior distribution to select a subset of variables that maximizes predictive performance for a given number of retained variables, based on forward selection of predictors by projecting the full linear predictor onto subspaces of variables.

For each model based on a subset of \(Q\) columns of the \(N \times P\) predictor matrix \(\boldsymbol{X}\), the \(S\) samples of fitted values are regressed on these columns of \(X\), and the KL divergence of the distribution of the full-model fitted values from the distribution of sub-space fitted values is computed.

This procedure is used for forward selection, starting with a baseline model and adding variables one at a time: at each step, the variable whose addition gives the smallest KL divergence of the full-model fitted values from the subspace fitted values is chosen.

For linear regression, the sub-space fitted values can be computed efficiently as a matrix product. For logistic regression, fitting \(P - Q\) models at each step (one for each remaining variable added to the \(Q\) variables chosen so far) on each of the \(S\) samples is too slow. Instead we fit a single model regressing the full-model fitted values on the \(Q\) variables chosen so far (using glm.fit with family=quasibinomial()), and compute a score test for the remaining \(P - Q\) variables to determine which one will most improve the fit when added to the current model.
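
Written out for the two cases handled by the getkl function below, with \(\hat{y}_{ns}\) the full-model fitted value and \(\hat{y}^{p}_{ns}\) the projected fitted value for observation \(n\) in posterior draw \(s\), the divergences are

\[
\mathrm{KL}_{\text{gaussian}} = \frac{1}{S}\sum_{s=1}^{S} \frac{1}{2}\log\frac{\sigma_p^2(s)}{\sigma^2(s)},
\qquad
\sigma_p^2(s) = \sigma^2(s) + \frac{1}{N}\sum_{n=1}^{N}\big(\hat{y}_{ns} - \hat{y}^{p}_{ns}\big)^2,
\]

\[
\mathrm{KL}_{\text{logistic}} = \frac{1}{NS}\sum_{n,s}\left[\hat{y}_{ns}\log\frac{\hat{y}_{ns}}{\hat{y}^{p}_{ns}} + (1-\hat{y}_{ns})\log\frac{1-\hat{y}_{ns}}{1-\hat{y}^{p}_{ns}}\right],
\]

that is, the average Gaussian or Bernoulli KL divergence of the full-model fit from the projected fit.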

R functions for projection predictive variable selection

xlogy <- function(x, y) { # x*log(y) when x=0 as limit from above
    z <- numeric(length(x))
    z[x == 0] <- 0 # limit by L'Hopital's rule
    z[x > 0] <- x[x > 0] * log(y[x > 0])
    return(z)
}

getkl <- function(yfit, yfitp, logistic, sigmasq=NULL) {
    ## estimate KL divergence of the full model from the projected model
    ## yfit and yfitp are both N x S matrices of fitted values
    if(!logistic) {
        sigmasqp <- sigmasq + colMeans((yfit - yfitp)^2)
        kl <- mean(0.5 * log(sigmasqp/sigmasq))
    } else {
        kl <- mean( xlogy(yfit, yfit)         - xlogy(yfit, yfitp) +
                    xlogy(1 - yfit, 1 - yfit) - xlogy(1 - yfit, 1 - yfitp) )
    }
    return(kl)
}
getkl.compiled <- cmpfun(getkl)

score.logistic <- function(y, p, x) { # score test for each X variable not in null model
    ## y, p have N rows indexing individuals, S cols indexing posterior samples
    ## x has P rows and N columns
    resids.samples <- (y - p) # N x S
    score.samples <- x %*% resids.samples # P x S, sum over indivs before averaging over posterior
    missinfo <- apply(score.samples, 1, var) # length P
    score <- rowMeans(score.samples) # length P
    compinfo <- rowSums(x^2 %*% (p * (1 - p))) # length P
    Z <- score / sqrt((compinfo - missinfo)) # length P
    return(Z)
}
score.logistic.compiled <- cmpfun(score.logistic) 
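
In symbols, the statistic computed by score.logistic for each candidate variable \(j\), with \(\hat{y}_{ns}\) the full-model and \(p_{ns}\) the current sub-model fitted values, is

\[
U_j(s) = \sum_{n} x_{nj}\,\big(\hat{y}_{ns} - p_{ns}\big), \qquad
Z_j = \frac{\frac{1}{S}\sum_s U_j(s)}
           {\sqrt{\sum_{s}\sum_{n} x_{nj}^2\, p_{ns}(1 - p_{ns}) \;-\; \mathrm{Var}_s\big[U_j(s)\big]}},
\]

i.e. the score averaged over posterior draws, with the between-draw variance of the score treated as missing information and subtracted from the complete information; the variable with the largest \(|Z_j|\) is added next.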

choose.nextvariable <- function(fit, fitp, xleft) {
    ## fit is N x S matrix of fitted values given full model
    ## fitp is matrix of fitted values given null model
    ## xleft is model matrix of dimension N x nleft
    Z <- score.logistic(fit, fitp, t(xleft))
    return(which.max(abs(Z))) # returns variable as index column of xleft matrix
}
choose.nextvariable.compiled <- cmpfun(choose.nextvariable)

lm_proj <- function(fit, xp, sigmasq=NULL) {
    ## fit is N x S matrix of fitted values given full model
    ## sigmasq is residual variance if gaussian
    ## xp is model matrix of dimension N x P, including column of ones
    ## get KL divergence of full-model fitted values from sub-space fitted values 
    if(!logistic) {
        ## regress S samples of fitted values fit on subset of X variables xp 
        ## to get projected coefficients wp
        wp <- solve(t(xp) %*% xp, t(xp) %*% fit) # matrix of dimension Q x S
        ## predict from these projected coefficients
        fitp <- xp %*% wp
        kl <- getkl.compiled(fit, fitp, logistic=FALSE, sigmasq)
    } else {
        ## this should be parallelized
        ## alternatively write the iteratively reweighted least squares algo to use
        ## a matrix of y values
        ## fit is an N x S matrix
        ## why does this use so much memory? 
        fitvalues <- function(s) {
            glm.fit(x=xp, y=fit[, s],
                    family=quasibinomial())$fitted.values
        }
        fitp <- foreach(s=1:ncol(fit),
                    .combine=cbind,
                    .inorder=TRUE) %dopar% fitvalues(s=s)
  #       browser("fitp")
  #      fitp <- sapply(1:ncol(fit), 
  #                    Vectorize(function(s)
  #                        glm.fit(x=xp, y=fit[, s],
  #                                family=quasibinomial())$fitted.value))
        kl <- getkl.compiled(fit, fitp, logistic=TRUE, sigmasq)
    }
    return(list(kl=kl, fitp=fitp))
}
lm_proj.compiled <- cmpfun(lm_proj)

get.projkl <- function(i, chosen, notchosen, fit, x, sigmasq=NULL) {
    ## KL divergence (and projected fit) when the i-th remaining variable is added to those chosen so far
    ind <- sort( c(chosen, notchosen[i]) )
    kl <- lm_proj.compiled(fit=fit, xp=x[, ind], sigmasq=sigmasq)
    return(kl)
}
get.projkl.compiled <- cmpfun(get.projkl)

lm_fprojsel <- function(w, x, U, sigmasq=NULL) {# forward selection using the projection
    ## U is number of unpenalized variables (always chosen) including intercept
    ## returns indices of variables chosen (in order of selection) and KL divergences.
    fit <- x %*% w  # matrix of dimension N x S: N individuals, S samples from posterior
    if(logistic) {
        fit <- (1 + exp(-fit))^-1
    }
  P <- ncol(x)
    chosen <- 1:U
    ## start from model with unpenalized variables only
    notchosen <- setdiff(1:P, chosen) # notchosen is a vector of columns in x matrix
    nleft <- length(notchosen)
    kl <- rep(NA, P - U + 1) 
    klfit <- lm_proj.compiled(fit, x[, chosen], sigmasq)
    kl[1] <- klfit$kl
    kl.index <- 2
    cat("KL of model with unpenalized variables only", kl[1], "\n")
    
    ## add variables one at a time up to 100
    while(nleft > 0 & length(chosen) <= 100) {
        cat("selecting penalized variable from", nleft, "remaining ... ")
        ## null model: all x variables chosen so far,
        ## score test for all nleft remaining variables
        ## return index of penalized variable with most extreme test statistic
        ## this value is the column of x[, notchosen] 
        nextvar <-
            choose.nextvariable.compiled(fit=fit, fitp=klfit$fitp, xleft=x[, notchosen])
        chosen <- c(chosen, notchosen[nextvar])
        notchosen <- setdiff(1:P, chosen)
        
        cat("calculating KL divergence of new model ... ")
        ## calculate KL divergence of new model and re-use the fitted values
        klfit <- lm_proj.compiled(fit, x[, chosen], sigmasq) # fitted values will be re-used
        kl[kl.index] <- klfit$kl
        cat("variable", nextvar, "added, KL divergence ",
            round(kl[kl.index], digits=4), "\n")
        kl.index <- kl.index + 1
        nleft <- length(notchosen)
    }
    return(list(chosen=chosen, kl=kl))
}
studyname <- "cleveland"
penalized.vartype <- "variables"
# cat("variable selection by forward selection using projection ...")
spath <- lm_fprojsel(w=beta.samples, x=X, U=U)
## Warning: executing %dopar% sequentially: no parallel backend registered
save(spath, file="spath.Rdata")
#cat("done\n")
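
The warning above appears because no parallel backend was registered before the %dopar% loop inside lm_proj. To actually run the projections in parallel, register a backend first, for example (using the doParallel package loaded at the top of this page):

registerDoParallel(cores = parallel::detectCores()) # register a backend so %dopar% runs in parallel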

## the first entry of spath$kl is for the baseline model; the first U entries of spath$chosen are the unpenalized columns
beta.means <- get_posterior_mean(hs.samples.mc, pars=c("beta_u", "beta_p"))
beta.means <- beta.means[, ncol(beta.means)]
chosen <- spath$chosen[-(1:U)]
selected <- data.frame(variable=c("baseline", beta.names[chosen]),
                       kl=spath$kl[1:(1 + length(chosen))])
selected$rank <- 1:nrow(selected)
selected$beta <- c(1, beta.means[chosen])
print(head(selected))

sel.varnames <- as.character(selected$variable)
max.numlabels <- which(selected$kl / selected$kl[1] < 0.2)[1]
cat(max.numlabels, "variables needed to reduce KL divergence from projection to full model by 80%\n")
max.numlabels <- ifelse(max.numlabels > 7, 7, max.numlabels)

sel.varnames[c(1, max.numlabels:length(sel.varnames))] <- ""
sign.effect <- as.factor(selected$beta >= 0)
#postscript(paste0(studyname, "_projectiveselection.eps"))
p <- ggplot(data=selected, aes(rank-1, kl)) + 
    geom_point() + 
    scale_y_reverse(lim=c(max(selected$kl), 0), expand=c(0, 0)) +
    xlim(c(0, max(selected$rank))) + 
    xlab(paste("Number of", penalized.vartype, "retained")) +
    ylab("KL divergence from projection\nto full model (nat logs)") +
    theme(axis.line.x=element_line()) +
    geom_text(aes(label=sel.varnames, colour=sign.effect),
              hjust=0, vjust=0,
              position=position_nudge(x=0.02 * max(selected$rank), y=-0.01), size=6,
              show.legend=FALSE) +
    theme_grey(base_size = 18)
print(p)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_text).

Projection predictive forward selection of variables

#dev.off()

lastrow <- ifelse(nrow(selected) > 25, 25, nrow(selected))
selected.top <- selected[2:lastrow, ]
selected.top$variable <- factor(selected.top$variable,
                                levels=rev(as.character(selected.top$variable)))
postscript(paste0(studyname, "_topcoeffs.eps"))
p <- ggplot(data=selected.top, aes(x=variable, y=beta)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ylab("Coefficient when added in sequence") +
    theme_grey(base_size = 18)
print(p)
dev.off()