Bayesian logistic regression with hierarchical shrinkage prior using PyMC3

This Jupyter notebook demonstrates how to use PyMC3 for Bayesian logistic regression with a hierarchical shrinkage prior. The statistical approach and the dataset used for this example are described here.
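For reference, the shrinkage prior used below draws each penalized coefficient as (this is a sketch written out from the transformed-parameters block of the model cell, not a quote from the original description):

$$
\beta_j = z_j \,\tau\, \tilde{\lambda}_j, \qquad
\tilde{\lambda}_j = \sqrt{\frac{c^2 \lambda_j^2}{c^2 + \tau^2 \lambda_j^2}}, \qquad
z_j \sim \mathcal{N}(0, 1),
$$

where $\tau$ is the global shrinkage scale, $\lambda_j$ the local scale of coefficient $j$, and $c$ the slab width; the likelihood is Bernoulli with $p_i = \operatorname{logit}^{-1}(\alpha + x_i^\top \beta)$.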

Loading Python modules

In [7]:
import numpy as np
import pandas as pd
import theano.tensor as tt
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')

import os
os.environ['R_USER'] = '~/.local/lib/python3.6/site-packages/rpy2'

import rpy2 # unit tests: python -m 'rpy2.robjects.tests.__init__'
rpy2.__path__
print("rpy2 version", rpy2.__version__)
#import rpy2.situation
#for row in rpy2.situation.iter_info():
#    print(row)
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri, r
# from rpy2.robjects.conversion import localconverter

import pymc3 as pm
print('Running on PyMC3 v{}'.format(pm.__version__))
# Initialize random number generator
np.random.seed(123)

from scipy import optimize

def load_r_matrix_into_pandas_dataframe(r_matrix):
    '''
    Import a matrix from R saved as RData to a pandas data frame without losing the column names of the R matrix
    https://stackoverflow.com/q/45392308/395857
     - Input: R matrix object
     - Output: Pandas DataFrame
    '''
    numpy_matrix = pandas2ri.ri2py(r_matrix)  # rpy2 2.x API; rpy2 3.x replaced this with the conversion interface
    frame_column_names = r_matrix.colnames
    frame = pd.DataFrame(data=numpy_matrix, columns=list(frame_column_names))
    return frame
rpy2 version 2.9.5
Running on PyMC3 v3.4.1

Loading data saved as R objects

This uses the rpy2.robjects package.

In [58]:
robjects.r['load']("/mnt/homepage/stats/cleveland.Rdata")

y = robjects.r['y']
X = robjects.r['X']
scale_icept = np.asscalar(np.array(robjects.r['scale_icept']))   # scale of the normal prior on the intercept
scale_global = np.asscalar(np.array(robjects.r['scale_global'])) # scale of the global shrinkage prior (large effects unlikely)
nu_global = np.asscalar(np.array(robjects.r['nu_global']))       # dof of the t prior on the global scale (large value ~ gaussian)
nu_local = np.asscalar(np.array(robjects.r['nu_local']))         # dof of the t priors on the local scales (1 = half-Cauchy, i.e. horseshoe)
slab_scale = np.asscalar(np.array(robjects.r['slab_scale']))     # slab scale; larger value means less regularization of large coefficients
slab_df = np.asscalar(np.array(robjects.r['slab_df']))           # slab dof; large value ~ gaussian slab

U = np.asscalar(np.array(robjects.r['U']).astype(int)) # num unpenalized coeffs including intercept
P = np.asscalar(np.array(robjects.r['P']).astype(int)) # total num coeffs
D = P - U

x = load_r_matrix_into_pandas_dataframe(X)
N = x.shape[0]

#print(x.head())
print(np.round(x.describe(), 2))

xvars = x.iloc[:, U:]

data = {'x': xvars, 'y': y}
       (Intercept)     age     sex  chest_pain  resting_BP  cholesterol  \
count        297.0  297.00  297.00      297.00      297.00       297.00   
mean           1.0   -0.00   -0.00       -0.00       -0.00         0.00   
std            0.0    1.00    1.00        1.00        1.00         1.00   
min            1.0   -2.82   -1.44       -2.24       -2.05        -2.18   
25%            1.0   -0.72   -1.44       -0.16       -0.69        -0.78   
50%            1.0    0.16    0.69       -0.16       -0.09        -0.07   
75%            1.0    0.71    0.69        0.87        0.52         0.66   
max            1.0    2.48    0.69        0.87        3.20         3.73   

       fasting_blood_sugar     ECG  max_heart_rate  angina  ST_depression  \
count               297.00  297.00          297.00  297.00         297.00   
mean                  0.00   -0.00            0.00    0.00           0.00   
std                   1.00    1.00            1.00    1.00           1.00   
min                  -0.41   -1.00           -3.43   -0.70          -0.95   
25%                  -0.41   -1.00           -0.72   -0.70          -0.95   
50%                  -0.41    0.00            0.15   -0.70          -0.21   
75%                  -0.41    1.01            0.71    1.43           0.55   
max                   2.43    1.01            2.28    1.43           3.39   

       ST_slope  coronary_fluoroscopy  thallium_test  
count    297.00                297.00         297.00  
mean      -0.00                  0.00          -0.00  
std        1.00                  1.00           1.00  
min       -0.97                 -0.72          -0.89  
25%       -0.97                 -0.72          -0.89  
50%        0.64                 -0.72          -0.89  
75%        0.64                  0.34           1.17  
max        2.26                  2.47           1.17  

TODO: fix model to allow for unpenalized variables
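One possible way to do this (a sketch under assumptions, not part of the original notebook; the names x_unpen, beta_u and logistic_model_full are hypothetical) is to give the U unpenalized columns, including the intercept column of X, their own wide normal priors and add their contribution to the linear predictor, keeping the shrinkage prior only for the remaining D columns:

In [ ]:
# Sketch only: extend the model to include the U unpenalized columns.
# Assumes scale_icept is also an acceptable prior scale for the other
# unpenalized coefficients, which may not match the original intent.
x_unpen = x.iloc[:, :U]   # unpenalized columns, including the intercept column

with pm.Model() as logistic_model_full:
    # unpenalized coefficients: weakly informative normal priors
    beta_u = pm.Normal('beta_u', mu=0, sd=scale_icept, shape=U)

    # shrinkage prior on the D penalized coefficients (same construction as below)
    z = pm.Normal('z', mu=0, sd=1, shape=D)
    aux1_local = pm.Normal('aux1_local', mu=0, sd=1, shape=D)
    aux2_local = pm.InverseGamma('aux2_local', 0.5 * nu_local, 0.5 * nu_local, shape=D)
    aux1_global = pm.Normal('aux1_global', mu=0, sd=1)
    aux2_global = pm.InverseGamma('aux2_global', 0.5 * nu_global, 0.5 * nu_global)
    caux = pm.InverseGamma('caux', 0.5 * slab_df, 0.5 * slab_df)

    lambd = aux1_local * tt.sqrt(aux2_local)
    tau = aux1_global * tt.sqrt(aux2_global) * scale_global
    c = slab_scale * tt.sqrt(caux)
    lambd_tilde = tt.sqrt(c**2 * tt.sqr(lambd) / (c**2 + tau**2 * tt.sqr(lambd)))
    beta = z * lambd_tilde * tau

    # linear predictor now covers both blocks of the design matrix
    p = pm.math.invlogit(tt.dot(x_unpen, beta_u) + tt.dot(xvars, beta))
    pm.Bernoulli('likelihood', p, observed=y)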

MCMC sampling

In [10]:
with pm.Model() as logistic_model:
    # Priors on model parameters
    z = pm.Normal('z', mu=0, sd=1, shape=D)
    aux1_local = pm.Normal('aux1_local', mu=0, sd=1, shape=D)
    aux2_local = pm.InverseGamma('aux2_local', alpha=0.5 * nu_local, beta=0.5 * nu_local, shape=D)
    aux1_global = pm.Normal('aux1_global', mu=0, sd=1)
    aux2_global = pm.InverseGamma('aux2_global', 0.5 * nu_global, 0.5 * nu_global)
    caux  = pm.InverseGamma('caux', 0.5 * slab_df, 0.5 * slab_df)
    alpha = pm.Normal('alpha', mu=0, sd=scale_icept)

    # transformed parameters
    lambd = aux1_local * tt.sqrt(aux2_local)                  # local shrinkage scales
    tau = aux1_global * tt.sqrt(aux2_global) * scale_global   # global shrinkage scale
    c = slab_scale * tt.sqrt(caux)                            # slab width
    lambd_tilde = tt.sqrt(c**2 * tt.sqr(lambd) /
                          (c**2 + tau**2 * tt.sqr(lambd)))    # regularized local scales
    beta = z * lambd_tilde * tau
    p = pm.math.invlogit(alpha + tt.dot(xvars, beta))

    # likelihood and sampling
    likelihood = pm.Bernoulli('likelihood', p, observed=y)
    # note: `step` is not passed to pm.sample below, so PyMC3 auto-assigns NUTS (see log);
    # target_accept=0.8 is also the NUTS default
    step = pm.NUTS(target_accept=.8)
    #step = pm.HamiltonianMC()
    trace_logistic_model = pm.sample(1000,
                                     chains=4, init='advi+adapt_diag', tune=500)
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 161.45:   8%|▊         | 16971/200000 [00:12<02:19, 1308.11it/s]
Convergence archived at 17100
Interrupted at 17,099 [8%]: Average Loss = 216.57
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [alpha, caux_log__, aux2_global_log__, aux1_global, aux2_local_log__, aux1_local, z]
100%|██████████| 1500/1500 [00:31<00:00, 48.38it/s]
There were 7 divergences after tuning. Increase `target_accept` or reparameterize.
There were 1 divergences after tuning. Increase `target_accept` or reparameterize.
There were 19 divergences after tuning. Increase `target_accept` or reparameterize.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
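The warnings above report divergent transitions after tuning and a Gelman-Rubin statistic above 1.4, so these draws should not be trusted as-is. One option the warnings themselves suggest (a sketch, not executed in this notebook) is to re-run sampling with a higher target acceptance rate and a longer tuning phase, for example by passing the NUTS step explicitly:

In [ ]:
# Sketch (not run here): tighter step-size adaptation to reduce divergences.
with logistic_model:
    step = pm.NUTS(target_accept=0.99)
    trace_logistic_model = pm.sample(1000, step=step, chains=4, tune=2000)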

Density and trace plots of posterior distribution

In [53]:
print(pm.summary(trace_logistic_model))  # print explicitly: pm.summary is not the last expression in the cell
_ = pm.traceplot(trace_logistic_model, varnames=["z", "aux1_global", "aux2_global", "caux"])
In [71]:
z = trace_logistic_model['z']
aux1_local = trace_logistic_model['aux1_local']
aux2_local = trace_logistic_model['aux2_local']
aux1_global = trace_logistic_model['aux1_global']
aux2_global = trace_logistic_model['aux2_global']
caux = trace_logistic_model['caux']

c = slab_scale * np.sqrt(caux)                             # slab width per draw
tau = aux1_global * np.sqrt(aux2_global) * scale_global    # global scale per draw

lambd = aux1_local * np.sqrt(aux2_local)                   # local scales per draw

S = lambd.shape[0]   # number of posterior draws
P = lambd.shape[1]   # number of penalized coefficients (equals D)

lambd_tilde = np.empty([S, P])
beta = np.empty([S, P])

# reconstruct the beta draws from the stored auxiliary variables,
# applying the same transformation as in the model
for j in range(P):
    lambd_tilde[:, j] = np.sqrt(np.divide(c**2 * np.square(lambd[:, j]),
                                          c**2 + tau**2 * np.square(lambd[:, j])))
    beta[:, j] = z[:, j] * lambd_tilde[:, j] * tau

beta_means = np.mean(beta, axis=0)
#print(beta_means.round(2))

plt.rcdefaults()
fig, ax = plt.subplots()

varnames = list(x.columns.values)[U:]
y_pos = np.arange(len(varnames))

ax.barh(y_pos, beta_means, align='center',
        color='green', ecolor='black')
ax.set_xlabel('Posterior means of regression coefficients')
ax.set_yticks(y_pos)
ax.set_yticklabels(varnames)

plt.show()
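The per-coefficient loop above can also be written without a Python loop using NumPy broadcasting; a minimal equivalent sketch (the *_vec names are just for illustration):

In [ ]:
# Equivalent, loop-free reconstruction of the beta draws via broadcasting.
# c and tau have shape (S,) and lambd has shape (S, P); add a trailing axis so
# the per-draw scalars broadcast over the coefficient dimension.
c_col, tau_col = c[:, None], tau[:, None]
lambd_tilde_vec = np.sqrt(c_col**2 * lambd**2 / (c_col**2 + tau_col**2 * lambd**2))
beta_vec = z * lambd_tilde_vec * tau_col
assert np.allclose(beta_vec, beta)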