import numpy as np
import pandas as pd
import theano.tensor as tt
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
import os
# NOTE(review): R_USER conventionally points at the R user home directory,
# not a Python site-packages path — confirm this is intentional.
os.environ['R_USER'] = '~/.local/lib/python3.6/site-packages/rpy2'
import rpy2 # unit tests: python -m 'rpy2.robjects.tests.__init__'
# No-op expression; presumably left over from interactive inspection.
rpy2.__path__
print("rpy2 version", rpy2.__version__)
#import rpy2.situation
#for row in rpy2.situation.iter_info():
# print(row)
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri,r
# from rpy2.robjects.conversion import localconverter
import pymc3 as pm
print('Running on PyMC3 v{}'.format(pm.__version__))
# Initialize random number generator
np.random.seed(123)
from scipy import optimize
def load_r_matrix_into_pandas_dataframe(r_matrix):
    '''
    Convert an R matrix (loaded via rpy2) into a pandas DataFrame while
    preserving the column names carried by the R object.
    https://stackoverflow.com/q/45392308/395857
    - Input: R matrix object
    - Output: Pandas DataFrame

    NOTE(review): pandas2ri.ri2py was removed in rpy2 3.x (a conversion
    context is used there instead) — confirm the installed rpy2 version.
    '''
    values = pandas2ri.ri2py(r_matrix)
    column_labels = list(r_matrix.colnames)
    return pd.DataFrame(data=values, columns=column_labels)
# This uses the rpy2.robjects package
# Load the response, design matrix and horseshoe hyper-parameters that were
# saved from R into the embedded R session, then pull them into Python.
robjects.r['load']("/mnt/homepage/stats/cleveland.Rdata")
y = robjects.r['y']
X = robjects.r['X']
# np.asscalar() is deprecated and was removed in NumPy 1.23;
# ndarray.item() is the long-standing, backward-compatible equivalent.
scale_icept = np.array(robjects.r['scale_icept']).item()    # informative prior on intercept
scale_global = np.array(robjects.r['scale_global']).item()  # large effects unlikely
nu_global = np.array(robjects.r['nu_global']).item()        # gaussian prior on global scale param
nu_local = np.array(robjects.r['nu_local']).item()          # 1 for half-Cauchy, horseshoe+
slab_scale = np.array(robjects.r['slab_scale']).item()      # larger value specifies less regularization
slab_df = np.array(robjects.r['slab_df']).item()            # large value specifies gaussian prior on slab
U = int(np.array(robjects.r['U']).item())                   # num unpenalized coeffs including intercept
P = int(np.array(robjects.r['P']).item())                   # total num coeffs
D = P - U                                                   # num penalized coeffs
x = load_r_matrix_into_pandas_dataframe(X)
N = x.shape[0]                                              # num observations
#print(x.head())
print(np.round(x.describe(), 2))
xvars = x.iloc[:, U:]           # penalized predictors only
data = {'x': xvars, 'y': y}
# TODO: fix model to allow for unpenalized variables
# Regularized-horseshoe ("Finnish horseshoe") logistic regression.  The
# half-t priors on the local/global shrinkage scales are built from a
# Normal x InverseGamma decomposition, so the sampler only sees `z` and
# the auxiliary variables; `beta` is a deterministic transform of them.
with pm.Model() as logistic_model:
    # Priors on model parameters
    z = pm.Normal('z', mu=0, sd=1, shape=D)  # unscaled coefficients, one per penalized predictor
    aux1_local = pm.Normal('aux1_local', mu=0, sd=1, shape=D)
    aux2_local = pm.InverseGamma('aux2_local', alpha=0.5 * nu_local, beta=0.5 * nu_local, shape=D)
    aux1_global = pm.Normal('aux1_global', mu=0, sd=1)
    aux2_global = pm.InverseGamma('aux2_global', 0.5 * nu_global, 0.5 * nu_global)
    caux = pm.InverseGamma('caux', 0.5 * slab_df, 0.5 * slab_df)
    alpha = pm.Normal('alpha', mu=0, sd=scale_icept)  # intercept
    # transformed parameters
    # aux1 * sqrt(aux2) yields a (half-)Student-t scale from the decomposition above.
    lambd = aux1_local * np.sqrt(aux2_local)                  # local shrinkage scales
    tau = aux1_global * np.sqrt(aux2_global) * scale_global   # global shrinkage scale
    c = slab_scale * np.sqrt(caux)                            # slab width
    # NOTE(review): np.sqrt/np.divide applied to PyMC3 random variables
    # relies on NumPy deferring to the underlying Theano tensor; tt.sqrt
    # and plain division would be the explicit forms — confirm this runs
    # on the installed Theano version.
    lambd_tilde = np.sqrt(np.divide(c**2 * np.square(lambd),
                                    c**2 + tau**2 * np.square(lambd))
                          )
    beta = z * lambd_tilde * tau  # regularized-horseshoe coefficients
    p = pm.math.invlogit(alpha + tt.dot(xvars, beta))
    # likelihood and sampling
    likelihood = pm.Bernoulli('likelihood', p, observed=y)
    # NOTE(review): `step` is constructed but never passed to pm.sample(),
    # so sampling uses the step method chosen by the init procedure, not
    # this NUTS instance — pass step=step if that was the intent.
    step = pm.NUTS(target_accept=.8)
    #step = pm.HamiltonianMC()
    trace_logistic_model = pm.sample(1000,
                                     chains=4, init='advi+adapt_diag', tune=500)
# Posterior summary table (means, sds, intervals, effective sample sizes).
# NOTE(review): the returned DataFrame is only displayed automatically in a
# notebook; wrap in print(...) when running as a plain script.
pm.summary(trace_logistic_model)
# Trace plots restricted to the shrinkage-related variables.
_ = pm.traceplot(trace_logistic_model, varnames=["z", "aux1_global", "aux2_global", "caux"])
# Reconstruct the regularized-horseshoe coefficients from the posterior
# draws, mirroring the transformed-parameter block of the model.
z = trace_logistic_model['z']                      # draws, presumably (S, D) — see model shapes
aux1_local = trace_logistic_model['aux1_local']
aux2_local = trace_logistic_model['aux2_local']
aux1_global = trace_logistic_model['aux1_global']  # per-draw scalars
aux2_global = trace_logistic_model['aux2_global']
caux = trace_logistic_model['caux']
c = slab_scale * np.sqrt(caux)                             # slab width per draw
tau = aux1_global * np.sqrt(aux2_global) * scale_global    # global shrinkage per draw
lambd = aux1_local * np.sqrt(aux2_local)                   # local shrinkage per draw/coefficient
# Broadcast the per-draw scalars across the coefficient axis instead of
# looping over columns.  This also fixes the original loop's clobbering of
# the module-level `P` (total number of coefficients) with the penalized
# count D, and avoids the redundant module-level `S`.
c2 = np.square(c)[:, None]
tau2 = np.square(tau)[:, None]
lambd2 = np.square(lambd)
lambd_tilde = np.sqrt(c2 * lambd2 / (c2 + tau2 * lambd2))
beta = z * lambd_tilde * tau[:, None]
beta_means = np.mean(beta, axis=0)
#print(beta_means.round(2))
# Horizontal bar chart of the posterior means of the penalized coefficients.
plt.rcdefaults()
fig, ax = plt.subplots()
varnames = list(x.columns.values)[U:]   # labels for the penalized predictors only
y_pos = np.arange(len(varnames))
ax.barh(y_pos, beta_means,
        color='green', ecolor='black', align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(varnames)
ax.set_xlabel('Posterior means of regression coefficients')
plt.show()