variational_autoencoder.ipynb (Source)

Preamble

In [1]:
%matplotlib notebook
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

from keras import backend as K

from keras.layers import (Input, InputLayer, Dense, Lambda, Layer, 
                          Add, Multiply)
from keras.models import Model, Sequential
from keras.datasets import mnist
Using TensorFlow backend.
In [3]:
import pandas as pd

from matplotlib.ticker import FormatStrFormatter
from keras.utils.vis_utils import model_to_dot, plot_model
from IPython.display import SVG

Notebook Configuration

In [4]:
np.set_printoptions(precision=2,
                    edgeitems=3,
                    linewidth=80,
                    suppress=True)
In [5]:
'TensorFlow version: ' + K.tf.__version__
Out[5]:
'TensorFlow version: 1.4.1'

Dataset (MNIST)

In [6]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = np.expand_dims(x_train, axis=-1) / 255.
x_test = np.expand_dims(x_test, axis=-1) / 255.
In [7]:
img_rows, img_cols, img_chns = x_train.shape[1:]

Constant definitions

In [8]:
original_dim = img_rows * img_cols
intermediate_dim = 256
latent_dim = 2
batch_size = 100
epochs = 50
epsilon_std = 1.0

Model specification

Encoder

Inference network

In [9]:
x = Input(shape=(original_dim,), name='x')
h = Dense(intermediate_dim, activation='relu', 
          name='hidden_enc')(x)
z_mu = Dense(latent_dim, name='mu')(h)
z_log_var = Dense(latent_dim, name='log_var')(h)
z_sigma = Lambda(lambda t: K.exp(.5*t), name='sigma')(z_log_var)
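
The Lambda layer converts the predicted log-variance into a standard deviation, sigma = exp(log_var / 2); parameterizing the log-variance keeps sigma strictly positive without constraining the output of the Dense layer. A quick NumPy illustration with hypothetical values (not part of the model):

log_var = np.array([0., 1.])
sigma = np.exp(.5 * log_var)  # [1.0, ~1.65] -- positive even for negative log_var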
In [10]:
m = Model(inputs=x, outputs=[z_mu, z_log_var])
In [11]:
SVG(model_to_dot(m, show_shapes=False)
    .create(prog='dot', format='svg'))
Out[11]:
[SVG diagram: x (InputLayer) → hidden_enc (Dense) → mu (Dense), log_var (Dense)]
In [12]:
plot_model(
    model=m, show_shapes=False,
    to_file='../../images/vae/inference_network.svg'
)

Reparameterization with Merge Layers

In [13]:
z_mu = Input(shape=(latent_dim,), name='mu')
z_sigma = Input(shape=(latent_dim,), name='sigma')
eps = Input(shape=(latent_dim,), name='eps')
z_eps = Multiply(name='z_eps')([z_sigma, eps])
z = Add(name='z')([z_mu, z_eps])
In [14]:
m = Model(inputs=[eps, z_mu, z_sigma], outputs=z)
In [15]:
SVG(model_to_dot(m, show_shapes=False)
    .create(prog='dot', format='svg'))
Out[15]:
[SVG diagram: sigma (InputLayer), eps (InputLayer) → z_eps (Multiply); mu (InputLayer), z_eps → z (Add)]
In [16]:
plot_model(
    model=m, show_shapes=False,
    to_file='../../images/vae/reparameterization.svg'
)
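
The Multiply and Add merge layers above implement the reparameterization trick, z = mu + sigma * eps with eps ~ N(0, I): the sample z is differentiable with respect to mu and sigma, while all randomness is isolated in the auxiliary input eps. A minimal NumPy sketch of the same computation, with hypothetical values:

mu = np.array([[0.5, -1.0]])          # (batch_size, latent_dim)
sigma = np.array([[1.2, 0.3]])
eps = np.random.randn(1, latent_dim)  # eps ~ N(0, I)
z_sample = mu + sigma * eps           # equivalent to Add([mu, Multiply([sigma, eps])])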

Simplified architecture visualization

In [17]:
x = Input(shape=(original_dim,), name='x')
h = Dense(intermediate_dim, activation='relu', 
          name='hidden_enc')(x)
z_mu = Dense(latent_dim, name='mu')(h)
z_log_var = Dense(latent_dim, name='log_var')(h)
z_sigma = Lambda(lambda t: K.exp(.5*t), name='sigma')(z_log_var)

eps = Input(shape=(latent_dim,), name='eps')
z_eps = Multiply(name='z_eps')([z_sigma, eps])
z = Add(name='z')([z_mu, z_eps])
In [18]:
encoder = Model(inputs=[x, eps], outputs=z)
In [19]:
SVG(model_to_dot(encoder, show_shapes=False)
    .create(prog='dot', format='svg'))
Out[19]:
[SVG diagram: x → hidden_enc → mu, log_var; log_var → sigma (Lambda); sigma, eps → z_eps (Multiply); mu, z_eps → z (Add)]
In [20]:
plot_model(
    model=encoder, show_shapes=False,
    to_file='../../images/vae/encoder.svg'
)
In [21]:
plot_model(
    model=encoder, show_shapes=True,
    to_file='../../images/vae/encoder_shapes.svg'
)

Full architecture visualization with auxiliary layers

In [22]:
class KLDivergenceLayer(Layer):

    """ Identity transform layer that adds KL divergence
    to the final model loss.
    """

    def __init__(self, *args, **kwargs):
        self.is_placeholder = True
        super(KLDivergenceLayer, self).__init__(*args, **kwargs)

    def call(self, inputs):

        mu, log_var = inputs

        kl_batch = - .5 * K.sum(1 + log_var -
                                K.square(mu) -
                                K.exp(log_var), axis=-1)

        self.add_loss(K.mean(kl_batch), inputs=inputs)

        return inputs
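
The kl_batch expression is the closed-form KL divergence between the approximate posterior q(z|x) = N(mu, diag(sigma^2)) and the standard-normal prior, KL = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2), summed over the latent dimensions and averaged over the batch before being attached to the model loss via add_loss. A small NumPy sanity check of the same formula (hypothetical values):

mu = np.array([0.3, -0.7])
log_var = np.array([0.1, -0.2])
kl = -.5 * np.sum(1 + log_var - mu**2 - np.exp(log_var))
# kl is a non-negative scalar; it is 0 exactly when mu == 0 and log_var == 0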
In [23]:
x = Input(shape=(original_dim,), name='x')
h = Dense(intermediate_dim, activation='relu', 
          name='hidden_enc')(x)
z_mu = Dense(latent_dim, name='mu')(h)
z_log_var = Dense(latent_dim, name='log_var')(h)
z_mu, z_log_var = KLDivergenceLayer(name='kl')([z_mu, z_log_var])
z_sigma = Lambda(lambda t: K.exp(.5*t), name='sigma')(z_log_var)

eps = Input(tensor=K.random_normal(shape=(K.shape(x)[0], 
                                          latent_dim)), name='eps')
z_eps = Multiply(name='z_eps')([z_sigma, eps])
z = Add(name='z')([z_mu, z_eps])
In [24]:
encoder = Model(inputs=[x, eps], outputs=z)
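
Because eps is constructed with tensor=K.random_normal(...), Keras treats it as an auxiliary input that is already wired to a noise source whose leading dimension follows the batch size of x, so no value has to be fed for it; the fit call further below passes only x_train. A hedged sketch of the resulting behaviour (not executed in this notebook):

x_batch = x_test[:7].reshape(-1, original_dim)  # (7, 784)
z_a = encoder.predict(x_batch)                  # (7, latent_dim), fresh noise drawn internally
z_b = encoder.predict(x_batch)                  # differs from z_a: this encoder is stochastic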
In [25]:
SVG(model_to_dot(encoder, show_shapes=False)
    .create(prog='dot', format='svg'))
Out[25]:
[SVG diagram: x → hidden_enc → mu, log_var → kl (KLDivergenceLayer) → sigma (Lambda); sigma, eps → z_eps (Multiply); mu, z_eps → z (Add)]
In [26]:
plot_model(
    model=encoder, show_shapes=False,
    to_file='../../images/vae/encoder_full.svg'
)
In [27]:
plot_model(
    model=encoder, show_shapes=True,
    to_file='../../images/vae/encoder_full_shapes.svg'
)

Decoder

In [28]:
decoder = Sequential([
  Dense(intermediate_dim, input_dim=latent_dim, 
        activation='relu', name='hidden_dec'),
  Dense(original_dim, activation='sigmoid', name='x_pred')
], name='decoder')
In [29]:
# equivalent to above. the InputLayer is specified explicitly
# only so that the input gets a name in the architecture diagram
decoder = Sequential([
    InputLayer(input_shape=(latent_dim,), name='z'),
    Dense(intermediate_dim, activation='relu', name='hidden_dec'),
    Dense(original_dim, activation='sigmoid', name='x_pred')
], name='decoder')
In [30]:
SVG(model_to_dot(decoder, show_shapes=False)
    .create(prog='dot', format='svg'))
Out[30]:
[SVG diagram: z (InputLayer) → hidden_dec (Dense) → x_pred (Dense)]
In [31]:
plot_model(
    model=decoder, show_shapes=False,
    to_file='../../images/vae/decoder.svg'
)

Specifying the VAE

In [32]:
x_pred = decoder(z)
In [33]:
# again, equivalent to above. the decoder is spelled out layer by layer
# here only so the final end-to-end VAE diagram shows its internals;
# otherwise the Sequential model is collapsed into a single node
h_dec = Dense(intermediate_dim, activation='relu', 
              name='hidden_dec')(z)
x_pred = Dense(original_dim, activation='sigmoid', 
               name='x_pred')(h_dec)
In [34]:
vae = Model(inputs=[x, eps], outputs=x_pred)
In [35]:
SVG(model_to_dot(vae, show_shapes=True)
    .create(prog='dot', format='svg'))
Out[35]:
[SVG diagram with shapes: x (None, 784) → hidden_enc (None, 256) → mu, log_var (None, 2) → kl → sigma; sigma, eps → z_eps; mu, z_eps → z (None, 2) → hidden_dec (None, 256) → x_pred (None, 784)]
In [36]:
plot_model(
    model=vae, show_shapes=False,
    to_file='../../images/vae/vae_full.svg'
)
In [37]:
plot_model(
    model=vae, show_shapes=True,
    to_file='../../images/vae/vae_full_shapes.svg'
)

Putting it all together

In [38]:
def nll(y_true, y_pred):
    """ Negative log likelihood (Bernoulli). """

    # keras.losses.binary_crossentropy gives the mean
    # over the last axis. we require the sum
    return K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)
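
K.binary_crossentropy returns an elementwise cross-entropy of shape (batch_size, original_dim); summing over the last axis yields the per-example Bernoulli negative log-likelihood, whereas the built-in keras.losses.binary_crossentropy would average over the 784 pixels and thereby shrink the reconstruction term by a factor of 784 relative to the KL term. A small NumPy illustration of the sum-versus-mean difference (hypothetical values):

y_true = np.array([[0., 1., 1., 0.]])
y_pred = np.array([[.1, .9, .8, .2]])
ce = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
nll_sum = ce.sum(axis=-1)   # what the custom loss computes
ce_mean = ce.mean(axis=-1)  # what the built-in loss would give (smaller by the number of pixels)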
In [39]:
x = Input(shape=(original_dim,))
h = Dense(intermediate_dim, activation='relu')(x)

z_mu = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

z_mu, z_log_var = KLDivergenceLayer()([z_mu, z_log_var])
z_sigma = Lambda(lambda t: K.exp(.5*t))(z_log_var)

eps = Input(tensor=K.random_normal(shape=(K.shape(x)[0], 
                                          latent_dim)))
z_eps = Multiply()([z_sigma, eps])
z = Add()([z_mu, z_eps])

decoder = Sequential([
    Dense(intermediate_dim, input_dim=latent_dim, activation='relu'),
    Dense(original_dim, activation='sigmoid')
])

x_pred = decoder(z)
In [40]:
vae = Model(inputs=[x, eps], outputs=x_pred, name='vae')
vae.compile(optimizer='rmsprop', loss=nll)
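
The compiled loss covers only the reconstruction term; the KL term is already attached to the graph by KLDivergenceLayer.add_loss, so Keras sums the two automatically and the reported loss is the negative ELBO. Equivalently, both terms could be folded into a single loss function; a minimal sketch of that alternative formulation, closing over the z_mu and z_log_var tensors defined above and assuming the KLDivergenceLayer is omitted so the KL term is not counted twice (not used in this notebook):

def vae_loss(y_true, y_pred):
    # per-example Bernoulli reconstruction NLL ...
    rec = K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)
    # ... plus the closed-form KL to the standard-normal prior
    kl = -.5 * K.sum(1 + z_log_var - K.square(z_mu) - K.exp(z_log_var), axis=-1)
    return rec + kl

# vae.compile(optimizer='rmsprop', loss=vae_loss)  # hypothetical alternative to the setup above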

Model fitting

In [41]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, original_dim) / 255.
x_test = x_test.reshape(-1, original_dim) / 255.
In [42]:
hist = vae.fit(
    x_train,
    x_train,
    shuffle=True,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_test, x_test)
)
Train on 60000 samples, validate on 10000 samples
Epoch 1/50
60000/60000 [==============================] - 3s 45us/step - loss: 189.9761 - val_loss: 172.5720
Epoch 2/50
60000/60000 [==============================] - 2s 36us/step - loss: 170.1135 - val_loss: 167.8894
Epoch 3/50
60000/60000 [==============================] - 2s 32us/step - loss: 166.0852 - val_loss: 164.7543
Epoch 4/50
60000/60000 [==============================] - 2s 33us/step - loss: 163.3691 - val_loss: 162.7478
Epoch 5/50
60000/60000 [==============================] - 2s 33us/step - loss: 161.3272 - val_loss: 160.5001
Epoch 6/50
60000/60000 [==============================] - 2s 32us/step - loss: 159.6904 - val_loss: 159.3923
Epoch 7/50
60000/60000 [==============================] - 2s 32us/step - loss: 158.3870 - val_loss: 158.0175
Epoch 8/50
60000/60000 [==============================] - 2s 32us/step - loss: 157.3856 - val_loss: 157.3784
Epoch 9/50
60000/60000 [==============================] - 2s 33us/step - loss: 156.6052 - val_loss: 156.3889
Epoch 10/50
60000/60000 [==============================] - 2s 33us/step - loss: 155.9176 - val_loss: 155.8414
Epoch 11/50
60000/60000 [==============================] - 2s 33us/step - loss: 155.4029 - val_loss: 155.6303
Epoch 12/50
60000/60000 [==============================] - 2s 35us/step - loss: 154.9231 - val_loss: 155.0976
Epoch 13/50
60000/60000 [==============================] - 2s 34us/step - loss: 154.5098 - val_loss: 154.9601
Epoch 14/50
60000/60000 [==============================] - 2s 34us/step - loss: 154.1307 - val_loss: 154.5180
Epoch 15/50
60000/60000 [==============================] - 2s 33us/step - loss: 153.8076 - val_loss: 154.1228
Epoch 16/50
60000/60000 [==============================] - 2s 34us/step - loss: 153.5146 - val_loss: 154.0472
Epoch 17/50
60000/60000 [==============================] - 2s 32us/step - loss: 153.2379 - val_loss: 154.1220
Epoch 18/50
60000/60000 [==============================] - 2s 33us/step - loss: 152.9888 - val_loss: 153.6126
Epoch 19/50
60000/60000 [==============================] - 2s 33us/step - loss: 152.7369 - val_loss: 153.4778
Epoch 20/50
60000/60000 [==============================] - 2s 32us/step - loss: 152.5146 - val_loss: 153.3696
Epoch 21/50
60000/60000 [==============================] - 2s 33us/step - loss: 152.2750 - val_loss: 153.3662
Epoch 22/50
60000/60000 [==============================] - 2s 33us/step - loss: 152.1104 - val_loss: 153.1072
Epoch 23/50
60000/60000 [==============================] - 2s 32us/step - loss: 151.9265 - val_loss: 152.9527
Epoch 24/50
60000/60000 [==============================] - 2s 33us/step - loss: 151.7205 - val_loss: 152.6068
Epoch 25/50
60000/60000 [==============================] - 2s 32us/step - loss: 151.5694 - val_loss: 152.4197
Epoch 26/50
60000/60000 [==============================] - 2s 32us/step - loss: 151.3990 - val_loss: 152.7037
Epoch 27/50
60000/60000 [==============================] - 2s 32us/step - loss: 151.2282 - val_loss: 152.5272
Epoch 28/50
60000/60000 [==============================] - 2s 33us/step - loss: 151.0731 - val_loss: 152.4991
Epoch 29/50
60000/60000 [==============================] - 2s 35us/step - loss: 150.9353 - val_loss: 152.5627
Epoch 30/50
60000/60000 [==============================] - 2s 35us/step - loss: 150.7897 - val_loss: 152.1692
Epoch 31/50
60000/60000 [==============================] - 2s 34us/step - loss: 150.6560 - val_loss: 152.1399
Epoch 32/50
60000/60000 [==============================] - 2s 32us/step - loss: 150.4971 - val_loss: 151.9153
Epoch 33/50
60000/60000 [==============================] - 2s 33us/step - loss: 150.3984 - val_loss: 151.9858
Epoch 34/50
60000/60000 [==============================] - 2s 34us/step - loss: 150.2646 - val_loss: 151.9476
Epoch 35/50
60000/60000 [==============================] - 2s 33us/step - loss: 150.1343 - val_loss: 151.9988
Epoch 36/50
60000/60000 [==============================] - 2s 33us/step - loss: 150.0046 - val_loss: 151.5923
Epoch 37/50
60000/60000 [==============================] - 2s 32us/step - loss: 149.8885 - val_loss: 151.9278
Epoch 38/50
60000/60000 [==============================] - 2s 33us/step - loss: 149.7350 - val_loss: 151.7532
Epoch 39/50
60000/60000 [==============================] - 2s 32us/step - loss: 149.5951 - val_loss: 151.8567
Epoch 40/50
60000/60000 [==============================] - 2s 32us/step - loss: 149.5323 - val_loss: 151.1956
Epoch 41/50
60000/60000 [==============================] - 2s 32us/step - loss: 149.4150 - val_loss: 151.0924
Epoch 42/50
60000/60000 [==============================] - 2s 32us/step - loss: 149.2937 - val_loss: 151.5380
Epoch 43/50
60000/60000 [==============================] - 2s 32us/step - loss: 149.1658 - val_loss: 151.2329
Epoch 44/50
60000/60000 [==============================] - 2s 32us/step - loss: 149.0978 - val_loss: 151.1590
Epoch 45/50
60000/60000 [==============================] - 2s 33us/step - loss: 148.9758 - val_loss: 151.3259
Epoch 46/50
60000/60000 [==============================] - 2s 32us/step - loss: 148.8757 - val_loss: 151.6150
Epoch 47/50
60000/60000 [==============================] - 2s 32us/step - loss: 148.7731 - val_loss: 151.1791
Epoch 48/50
60000/60000 [==============================] - 2s 32us/step - loss: 148.6903 - val_loss: 151.3282
Epoch 49/50
60000/60000 [==============================] - 2s 34us/step - loss: 148.6127 - val_loss: 151.0146
Epoch 50/50
60000/60000 [==============================] - 2s 33us/step - loss: 148.5373 - val_loss: 151.0519

Model Evaluation

In [43]:
golden_size = lambda width: (width, 2. * width / (1 + np.sqrt(5)))

NELBO

In [44]:
fig, ax = plt.subplots(figsize=golden_size(6))

hist_df = pd.DataFrame(hist.history)
hist_df.plot(ax=ax)

ax.set_ylabel('NELBO')
ax.set_xlabel('# epochs')

ax.set_ylim(.99*hist_df[1:].values.min(), 
            1.1*hist_df[1:].values.max())

plt.savefig('../../images/vae/nelbo.svg', format='svg')
plt.show()

Observed space manifold

In [45]:
# display a 2D manifold of the images
n = 15  # figure with 15x15 images
quantile_min = 0.01
quantile_max = 0.99

# linearly spaced coordinates on the unit square are transformed
# through the inverse CDF (ppf) of the standard Gaussian to produce
# values of the latent variables z, since the prior over the latent
# space is Gaussian

z1 = norm.ppf(np.linspace(quantile_min, quantile_max, n))
z2 = norm.ppf(np.linspace(quantile_max, quantile_min, n))
z_grid = np.dstack(np.meshgrid(z1, z2))
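
norm.ppf maps the equally spaced quantiles back to latent coordinates, so the grid spans roughly ±2.33 standard deviations of the prior (the 1% and 99% quantiles of a standard Gaussian). A quick check of the endpoints (illustrative only):

norm.ppf([quantile_min, .5, quantile_max])  # approx. [-2.33, 0.00, 2.33]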
In [46]:
x_pred_grid = decoder.predict(z_grid.reshape(n*n, latent_dim)) \
                     .reshape(n, n, img_rows, img_cols)
In [47]:
fig, ax = plt.subplots(figsize=(5, 5))

ax.imshow(np.block(list(map(list, x_pred_grid))), cmap='gray')

ax.set_xticks(np.arange(0, n*img_rows, img_rows) + .5 * img_rows)
ax.set_xticklabels(map('{:.2f}'.format, z1), rotation=90)

ax.set_yticks(np.arange(0, n*img_cols, img_cols) + .5 * img_cols)
ax.set_yticklabels(map('{:.2f}'.format, z2))

ax.set_xlabel('$z_1$')
ax.set_ylabel('$z_2$')

plt.savefig('../../images/vae/result_manifold.png')
plt.show()
In [48]:
# deterministic test time encoder
test_encoder = Model(x, z_mu)

# display a 2D plot of the digit classes in the latent space
z_test = test_encoder.predict(x_test, batch_size=batch_size)
In [49]:
fig, ax = plt.subplots(figsize=(6, 5))

cbar = ax.scatter(z_test[:, 0], z_test[:, 1], c=y_test,
                   alpha=.4, s=3**2, cmap='viridis')
fig.colorbar(cbar, ax=ax)

ax.set_xlim(2.*norm.ppf((quantile_min, quantile_max)))
ax.set_ylim(2.*norm.ppf((quantile_min, quantile_max)))

ax.set_xlabel('$z_1$')
ax.set_ylabel('$z_2$')

plt.savefig('../../images/vae/result_latent_space.png')
plt.show()
In [50]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4.5))

ax1.imshow(np.block(list(map(list, x_pred_grid))), cmap='gray')

ax1.set_xticks(np.arange(0, n*img_rows, img_rows) + .5 * img_rows)
ax1.set_xticklabels(map('{:.2f}'.format, z1), rotation=90)

ax1.set_yticks(np.arange(0, n*img_cols, img_cols) + .5 * img_cols)
ax1.set_yticklabels(map('{:.2f}'.format, z2))

ax1.set_xlabel('$z_1$')
ax1.set_ylabel('$z_2$')

cbar = ax2.scatter(z_test[:, 0], z_test[:, 1], c=y_test,
                   alpha=.4, s=3**2, cmap='viridis')
fig.colorbar(cbar, ax=ax2)

ax2.set_xlim(2.*norm.ppf((quantile_min, quantile_max)))
ax2.set_ylim(2.*norm.ppf((quantile_min, quantile_max)))

ax2.set_xlabel('$z_1$')
ax2.set_ylabel('$z_2$')

plt.savefig('../../images/vae/result_combined.png')
plt.show()