# Bayesian Sparse Logistic Regression with Spike-and-Slab Priors (using Edward)

In [1]:
# Render matplotlib figures as interactive widgets inside the notebook.
%matplotlib notebook

In [37]:
import numpy as np
import tensorflow as tf
import edward as ed
import matplotlib.pyplot as plt
import seaborn as sns

from edward.models import (Bernoulli, Laplace, Normal,
Empirical, PointMass)

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

In [3]:
# Global display configuration: plot styling and compact array printing.
plt.style.use('seaborn-notebook')
sns.set_context('notebook')

np.set_printoptions(precision=2, edgeitems=3, linewidth=80, suppress=True)

In [4]:
# Report the TensorFlow version (bare last expression is the cell output).
'TensorFlow version: ' + tf.__version__

Out[4]:
'TensorFlow version: 1.4.0'
###### Constants¶
In [5]:
# Sampler settings used throughout the notebook.
T = 10000  # number of HMC samples to keep per chain
P = 1000   # number of posterior (predictive) samples to draw

seed = 8888
rng = np.random.RandomState(seed)  # shared RNG for reproducibility


#### Toy Dataset ("Moons")¶

In [6]:
# Generate the two-moons toy dataset and hold out 20% for evaluation.
features, labels = make_moons(500, random_state=rng)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=.2, random_state=rng)

In [7]:
# Number of training examples (N) and input features (D).
N, D = X_train.shape

In [8]:
# Visualize the held-out points, colored by their class label.
fig, ax = plt.subplots(figsize=(7, 5))

scatter = ax.scatter(*X_test.T, c=y_test, cmap='coolwarm')
fig.colorbar(scatter, ax=ax)

ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')

plt.show()


### Gaussian Prior¶

#### Model specification¶

In [9]:
# Bayesian logistic regression with independent standard-normal priors
# on the weights and bias.
X = tf.placeholder(tf.float32, [N, D])

w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
b = Normal(loc=tf.zeros([]), scale=tf.ones([]))

# Bernoulli likelihood over class labels given the linear predictor.
logits = ed.dot(X, w) + b
y = Bernoulli(logits=logits)


#### Inference¶

##### Maximum a posteriori (MAP)¶
In [10]:
# Point-mass (delta) approximating distributions for MAP estimation,
# initialized from random-normal draws.
qw_init = tf.random_normal([D])
qb_init = tf.random_normal([])
qw = PointMass(params=tf.Variable(qw_init))
qb = PointMass(params=tf.Variable(qb_init))

In [11]:
# Run MAP estimation of w and b given the training data.
latent_vars = {w: qw, b: qb}
observations = {X: X_train, y: y_train}
inference = ed.MAP(latent_vars, data=observations)
inference.run()

1000/1000 [100%] ██████████████████████████████ Elapsed: 2s | Loss: 113.618

In [12]:
# MAP point estimate of the weights.
w_map = qw.eval()
w_map

Out[12]:
array([ 1.15, -4.49], dtype=float32)
In [13]:
# MAP point estimate of the bias.
b_map = qb.eval()
b_map

Out[13]:
0.44861546
##### Hybrid Monte Carlo¶
In [14]:
# Empirical approximations that will hold all T HMC samples.
w_states = tf.Variable(tf.random_normal([T, D]))
b_states = tf.Variable(tf.random_normal([T]))
qw = Empirical(params=w_states)
qb = Empirical(params=b_states)

In [15]:
# Hamiltonian Monte Carlo over w and b, with a manual update loop so
# progress can be printed every 10 iterations.
inference = ed.HMC({w: qw, b: qb}, data={X: X_train, y: y_train})
inference.initialize(n_print=10, step_size=0.15)

tf.global_variables_initializer().run()

for _ in range(inference.n_iter):
    info_dict = inference.update()
    inference.print_progress(info_dict)

inference.finalize()

10000/10000 [100%] ██████████████████████████████ Elapsed: 21s | Acceptance Rate: 0.903

In [16]:
# Shared axis limits for the weight-posterior plots below, so the
# Gaussian, Laplace, and spike-and-slab posteriors are comparable.
w1_min, w1_max = -.2, 2
w2_min, w2_max = -8, 1.


#### Evaluation¶

In [17]:
# Joint and marginal views of P posterior samples of (w_1, w_2).
posterior_w = qw.sample(sample_shape=P).eval()
g = sns.JointGrid(*posterior_w.T,
                  xlim=(w1_min, w1_max),
                  ylim=(w2_min, w2_max))
g = g.plot_joint(plt.scatter, s=3.**2, alpha=.4)
g = g.plot_joint(sns.kdeplot, zorder=0, n_levels=6)
g = g.plot_marginals(sns.distplot, kde=True)
g = g.set_axis_labels('$w_1$', '$w_2$')


### Laplace Prior¶

#### Model specification¶

In [18]:
# Same logistic regression, now with sparsity-inducing Laplace
# (double-exponential) priors on the weights and bias.
X = tf.placeholder(tf.float32, [N, D])

w = Laplace(loc=tf.zeros(D), scale=tf.ones(D))
b = Laplace(loc=tf.zeros([]), scale=tf.ones([]))

logits = ed.dot(X, w) + b
y = Bernoulli(logits=logits)


#### Inference¶

##### Maximum a posteriori (MAP)¶
In [19]:
# Delta distributions over w and b for MAP under the Laplace prior.
init_w = tf.random_normal([D])
init_b = tf.random_normal([])
qw = PointMass(params=tf.Variable(init_w))
qb = PointMass(params=tf.Variable(init_b))

In [20]:
# MAP estimation under the Laplace prior.
inference = ed.MAP({w: qw, b: qb},
                   data={X: X_train, y: y_train})
inference.run()

1000/1000 [100%] ██████████████████████████████ Elapsed: 1s | Loss: 106.517

In [21]:
# MAP point estimate of the weights under the Laplace prior.
w_map = qw.eval()
w_map

Out[21]:
array([ 1.17, -5.56], dtype=float32)
In [22]:
# MAP point estimate of the bias under the Laplace prior.
b_map = qb.eval()
b_map

Out[22]:
0.68323755
##### Hybrid Monte Carlo¶
In [23]:
# T-sample empirical approximations for the HMC chain.
qw = Empirical(params=tf.Variable(tf.random_normal(shape=[T, D])))
qb = Empirical(params=tf.Variable(tf.random_normal(shape=[T])))

In [24]:
# HMC under the Laplace prior; same settings as the Gaussian case.
inference = ed.HMC({w: qw, b: qb}, data={X: X_train, y: y_train})
inference.initialize(n_print=10, step_size=0.15)

tf.global_variables_initializer().run()

for step in range(inference.n_iter):
    progress = inference.update()
    inference.print_progress(progress)

inference.finalize()

10000/10000 [100%] ██████████████████████████████ Elapsed: 23s | Acceptance Rate: 0.910


#### Evaluation¶

In [25]:
# Posterior of (w_1, w_2) under the Laplace prior, drawn on the same
# axis limits as the Gaussian case for comparison.
laplace_w = qw.sample(sample_shape=P).eval()
g = sns.JointGrid(*laplace_w.T,
                  xlim=(w1_min, w1_max),
                  ylim=(w2_min, w2_max))
g = g.plot_joint(plt.scatter, s=3.**2, alpha=.4)
g = g.plot_joint(sns.kdeplot, zorder=0, n_levels=6)
g = g.plot_marginals(sns.distplot, kde=True)
g = g.set_axis_labels('$w_1$', '$w_2$')


### Spike-and-Slab Prior¶

#### Model specification¶

In [26]:
# Spike-and-slab prior: each weight is the product of a binary
# inclusion indicator (the "spike" at zero) and a Gaussian coefficient
# (the "slab").
X = tf.placeholder(tf.float32, [N, D])

# Prior inclusion probability 1/2 per feature; the mask is sampled
# again in the evaluation cells below.
w_mask = Bernoulli(probs=.5 * tf.ones(D))
w_raw = Normal(loc=tf.zeros(D), scale=tf.ones(D))
w = tf.cast(w_mask, tf.float32) * w_raw

b_raw = Normal(loc=tf.zeros([]), scale=tf.ones([]))
b = b_raw

# BUG FIX: the original cell built the likelihood from `w` and `b`
# while defining only `w_raw`/`b_raw`, silently reusing the `w`/`b`
# tensors left over from the Laplace model above (a hidden-state bug),
# and never constructed the spike component `w_mask` at all.
y = Bernoulli(logits=ed.dot(X, w) + b)


#### Inference¶

##### Maximum a posteriori (MAP)¶
In [27]:
# Point masses for MAP over the slab coefficients and bias.
raw_w_init = tf.random_normal([D])
raw_b_init = tf.random_normal([])
qw = PointMass(params=tf.Variable(raw_w_init))
qb = PointMass(params=tf.Variable(raw_b_init))

In [28]:
# MAP over the raw (slab) variables only; the inclusion mask is not
# part of the inference.
latent = {w_raw: qw, b_raw: qb}
inference = ed.MAP(latent, data={X: X_train, y: y_train})
inference.run()

1000/1000 [100%] ██████████████████████████████ Elapsed: 2s | Loss: 286.974

In [29]:
# MAP point estimate of the slab (raw) weights.
w_map = qw.eval()
w_map

Out[29]:
array([ 1.3 , -3.53], dtype=float32)
In [30]:
# MAP point estimate of the bias.
b_map = qb.eval()
b_map

Out[30]:
-0.046046846
##### Hybrid Monte Carlo¶
In [31]:
# Empirical sample stores for the HMC chain over the raw variables.
hmc_w_states = tf.Variable(tf.random_normal([T, D]))
hmc_b_states = tf.Variable(tf.random_normal([T]))
qw = Empirical(params=hmc_w_states)
qb = Empirical(params=hmc_b_states)

In [32]:
# HMC over the slab coefficients and bias; only the raw variables are
# inferred here.
inference = ed.HMC({w_raw: qw, b_raw: qb},
                   data={X: X_train, y: y_train})
inference.initialize(n_print=10, step_size=0.15)

tf.global_variables_initializer().run()

for _ in range(inference.n_iter):
    diagnostics = inference.update()
    inference.print_progress(diagnostics)

inference.finalize()

10000/10000 [100%] ██████████████████████████████ Elapsed: 32s | Acceptance Rate: 0.416


#### Evaluation¶

In [33]:
# Draw prior samples of the inclusion mask and posterior samples of the
# slab coefficients, then combine them into effective weight samples.
# The mask is cast to bool so np.packbits in the mixture cell below
# accepts it (packbits requires bool/uint8 input).
w_mask_samples = w_mask.sample(sample_shape=P).eval().astype(bool)
w_raw_samples = qw.sample(sample_shape=P).eval()

# BUG FIX: `w_samples`, used by the plotting cells below, was never
# defined anywhere in the original notebook.
w_samples = w_mask_samples * w_raw_samples

In [34]:
# Scatter the combined spike-and-slab weight samples with marginals.
# NOTE(review): `w_samples` is not defined in the original notebook —
# presumably it should be w_mask_samples * w_raw_samples; confirm.
g = sns.JointGrid(*w_samples.T,
                  xlim=(w1_min, w1_max),
                  ylim=(w2_min, w2_max))
g = g.plot_joint(plt.scatter, s=3.**2, alpha=.4)
g = g.plot_marginals(sns.distplot, kde=True)
g = g.set_axis_labels('$w_1$', '$w_2$')

In [35]:
# Encode each row of the binary mask as a single integer mixture id.
# np.packbits packs the D mask bits into the HIGH bits of a byte, so a
# right shift by (8 - D) recovers the D-bit code.  The original
# hard-coded a shift of 6, which is only correct for D == 2; this form
# generalizes to any D <= 8.
mixtures = np.right_shift(np.packbits(w_mask_samples, axis=1),
                          8 - D).flatten()

In [36]:
# Side-by-side views of the spike-and-slab posterior samples: raw
# scatter, and the same points colored by which mask configuration
# (mixture component) produced them.
fig, (ax_zoom, ax_mix) = plt.subplots(ncols=2, figsize=(9, 4))

fig.suptitle('Posterior samples')

ax_zoom.scatter(*w_samples.T, s=4.**2, alpha=.6)
ax_mix.scatter(*w_samples.T, c=mixtures, s=4.**2, alpha=.6, cmap='Blues')

ax_zoom.set_title('Zoomed out')
ax_zoom.set_xlabel('$w_1$')
ax_zoom.set_ylabel('$w_2$')

ax_mix.set_title('Colored by mixture')
ax_mix.set_xlabel('$w_1$')

plt.show()