A Variational Autoencoder (VAE) is a generative model that learns a compressed, latent representation of the input data and uses this representation to generate new data samples. Unlike traditional autoencoders, VAEs impose a specific structure on the latent space, encouraging the encoded representations to follow a smooth, continuous distribution, typically a standard Gaussian. This makes it possible to draw random samples from the latent space and decode them into new, diverse data points. VAEs achieve this by learning an encoder that maps input data to the parameters of a Gaussian distribution in the latent space (a mean and a variance, predicted as a log-variance in practice) and a decoder that maps latent-space samples back to the original data space.
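For reference, this is standard VAE material rather than anything specific to the implementation below: training minimizes the negative evidence lower bound, a reconstruction term plus a KL divergence that pulls the encoder's Gaussian toward the standard normal prior. For a diagonal Gaussian with mean $\mu$ and variance $\sigma^2$ over a $d$-dimensional latent space, the KL term has the closed form used later in the loss:

\[
\mathcal{L}(x) = \mathbb{E}_{q_\phi(z \mid x)}\big[-\log p_\theta(x \mid z)\big] + D_{\mathrm{KL}}\big(q_\phi(z \mid x) \,\|\, \mathcal{N}(0, I)\big),
\qquad
D_{\mathrm{KL}} = -\tfrac{1}{2} \sum_{j=1}^{d} \big(1 + \log \sigma_j^2 - \mu_j^2 - \sigma_j^2\big).
\]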
Imagine you have a dataset of handwritten digits (like MNIST). A VAE would learn a latent space in which each digit image is represented by a point, with similar digits clustered together. To generate a new digit, the VAE would sample a point from the latent space (for instance, a point between the clusters representing "3" and "8") and decode it into an image. The result could look like a hybrid between a 3 and an 8, demonstrating that the VAE creates new data rather than merely copying existing examples; a concrete interpolation sketch follows the code listing below. The latent dimensions might capture features such as roundness, slant, or the presence of loops.
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Lambda, Layer
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
import numpy as np
# Define the encoder network
def build_encoder(input_dim, latent_dim):
    inputs = Input(shape=(input_dim,))
    x = Dense(256, activation='relu')(inputs)
    # Two heads: the mean and log-variance of the approximate posterior q(z|x)
    z_mean = Dense(latent_dim)(x)
    z_log_var = Dense(latent_dim)(x)
    return Model(inputs, [z_mean, z_log_var])
# Define the sampling layer (reparameterization trick)
class Sampling(Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        # Reparameterization trick: z = mu + sigma * epsilon with epsilon ~ N(0, I),
        # so gradients can flow through z_mean and z_log_var despite the randomness
        epsilon = K.random_normal(shape=K.shape(z_mean))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon
# Define the decoder network
def build_decoder(latent_dim, output_dim):
    latent_inputs = Input(shape=(latent_dim,))
    x = Dense(256, activation='relu')(latent_inputs)
    outputs = Dense(output_dim, activation='sigmoid')(x)
    return Model(latent_inputs, outputs)
# Build the VAE model
def build_vae(input_dim, latent_dim):
    encoder = build_encoder(input_dim, latent_dim)
    decoder = build_decoder(latent_dim, input_dim)
    sampler = Sampling()
    inputs = Input(shape=(input_dim,))
    z_mean, z_log_var = encoder(inputs)
    z = sampler([z_mean, z_log_var])
    outputs = decoder(z)
    vae = Model(inputs, outputs)
    # KL divergence between the approximate posterior and the standard normal prior
    kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    # Average over the batch and rescale: the compiled binary cross-entropy below is a
    # per-pixel mean, so dividing the KL term by input_dim keeps the two terms comparable
    vae.add_loss(K.mean(kl_loss) / input_dim)
    return vae, encoder, decoder
# Hyperparameters
input_dim = 28 * 28
latent_dim = 2
epochs = 50
batch_size = 128
# Load and preprocess MNIST dataset
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))
# Build and compile the VAE
vae, encoder, decoder = build_vae(input_dim, latent_dim)
vae.compile(optimizer='adam', loss='binary_crossentropy')
# Train the VAE
vae.fit(x_train, x_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, x_test))
# Generate new samples from the latent space
n = 15 # figure with 15x15 digits
digit_size = 28
figure = np.zeros((digit_size * n, digit_size * n))
grid_x = np.linspace(-1.5, 1.5, n)
grid_y = np.linspace(-1.5, 1.5, n)
for i, yi in enumerate(grid_y):
    for j, xi in enumerate(grid_x):
        # Decode one latent point per grid cell
        z_sample = np.array([[xi, yi]])
        x_decoded = decoder.predict(z_sample, verbose=0)
        digit = x_decoded[0].reshape(digit_size, digit_size)
        figure[i * digit_size: (i + 1) * digit_size,
               j * digit_size: (j + 1) * digit_size] = digit
plt.figure(figsize=(10, 10))
plt.imshow(figure, cmap='Greys_r')
plt.show()
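To make the earlier "hybrid between a 3 and an 8" intuition concrete, here is a minimal follow-up sketch that reuses the trained encoder and decoder along with the x_test and y_test arrays loaded above. It encodes one test "3" and one test "8", then decodes points along the straight line between their latent means. The helper names (idx_3, idx_8, n_steps, alphas) are illustrative choices, not part of the original script.

# Interpolate between an encoded "3" and an encoded "8" in latent space
idx_3 = np.where(y_test == 3)[0][0]
idx_8 = np.where(y_test == 8)[0][0]
z_mean_3, _ = encoder.predict(x_test[idx_3:idx_3 + 1], verbose=0)
z_mean_8, _ = encoder.predict(x_test[idx_8:idx_8 + 1], verbose=0)
n_steps = 10
alphas = np.linspace(0.0, 1.0, n_steps)
plt.figure(figsize=(n_steps, 1.5))
for k, alpha in enumerate(alphas):
    # Linear interpolation between the two latent means
    z = (1.0 - alpha) * z_mean_3 + alpha * z_mean_8
    digit = decoder.predict(z, verbose=0)[0].reshape(digit_size, digit_size)
    plt.subplot(1, n_steps, k + 1)
    plt.imshow(digit, cmap='Greys_r')
    plt.axis('off')
plt.show()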