Applying CNN Based AutoEncoder (CAE) on MNIST Data

AutoEncoder

Autoencoder

Principal Component Analysis (PCA) are often used to extract orthognal, independent variables for a given coveraiance matrix. It is effectively Singlar Value Deposition (SVD) in linear algebra and it is so powerful and elegant that usually deemed as the crown drews of linear algebra. However, the obvious limition of SVD is the linear transformation assumption. With nonlinear transforms for generalization purpose, Hinton and Salakhutdinov extended the method in data dimensionaility reduction using neural networks. It shown better results than pure PCA method.

In this project, I will use MNIST hand-writing digits dataset and Tensorflow to train an autoencoder (encoder and decoder). Because the flow tries to first compress input data into a smaller dimension, then to regenerate an output that closely matches input. It does not need supervised data (label, bounding box etc.). Autoencoder belongs to the category of unsupervised learning. The final output dimensionality is the same as the input, but the internal dimension can be much smaller. This dimensionality reduction can be deemed as a generalized PCA, it can be very useful to provide insight to data and for efficient storing/comparing/retrieving data. In a sense, I feel that word embedding (covered by one of my other blogs) is a special kind of autoencoder.

In [1]:
# Step 1: Prepare environments
import sys
import time
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE-DEVICES"]=""    # Change to '0' to use tf.device("/gpu:0")

from tensorflow.examples.tutorials.mnist import input_data
%matplotlib inline

%load_ext autoreload
%autoreload 2

MNIST = input_data.read_data_sets("MNIST_data", one_hot=True)
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
In [2]:
# Define functions for weights and bias initialization
def weight_variable(shape):
  initial = tf.truncated_normal(shape, stddev=1/math.sqrt(shape[2]))
  return tf.Variable(initial)

def bias_variable(shape):
  initial = tf.zeros(shape)
  return tf.Variable(initial)

# Define CNN and max_pool operations
def conv2d(x, W):
  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
  return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='SAME')
In [3]:
# In this CNN Based AutoEncoder, I will compress 784-dimension images to 128 (4 x 4 x 8) dimensions
# The latent code can be used for many purpose
class AutoEncoder(object):
    
    """
    AutoEncoder class for MNIST
    """
    
    def __init__(self, MNIST):
        self.MNIST = MNIST
        # Input, output placeholders
        self.x_input = tf.placeholder(tf.float32, shape=[None, 784])
        #y_ = tf.placeholder(tf.float32, shape=[None, 10])
        #self.mean_img = np.mean(self.MNIST.train.images, axis=0)
        self.mean_img = np.zeros((784))
        # Shapes: bookkeeping dimensions
        self.shapes = []
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def __build_graph(self):
        # First cnn layer
        W_conv1 = weight_variable([3, 3, 1, 32])
        b_conv1 = bias_variable([32])

        self.x_image = tf.reshape(self.x_input, [-1,28,28,1])
        self.shapes.append(self.x_image.get_shape().as_list())
        # shape[0]
        print("x_image shape: %s" % self.shapes[-1])

        h_conv1 = tf.nn.relu(tf.nn.conv2d(self.x_image, W_conv1, strides=[1, 2, 2, 1], padding='SAME') + b_conv1)
        self.shapes.append(h_conv1.get_shape().as_list())
        #h_pool1 = max_pool_2x2(h_conv1)
        # shape[1]
        print("h_conv1 dimension: %s" % self.shapes[-1])

        # 2nd CNN layer
        # 64 features with 5x5 filter
        W_conv2 = weight_variable([3, 3, 32, 16])
        b_conv2 = bias_variable([16])

        h_conv2 = tf.nn.relu(tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='SAME') + b_conv2)
        self.shapes.append(h_conv2.get_shape().as_list())
        # shape[2]
        print("h_conv2 dimension: %s" % self.shapes[-1])
        #h_pool2 = max_pool_2x2(h_conv2)

        # 3rd CNN layer
        # 32 features with 5x5 filter
        W_conv3 = weight_variable([3, 3, 16, 8])
        b_conv3 = bias_variable([8])

        self.laten_code = tf.nn.relu(tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 2, 2, 1], padding='SAME') + b_conv3)
        self.shapes.append(self.laten_code.get_shape().as_list())
        # shape[3]
        print("h_conv3 dimension: %s" % self.shapes[-1])

        #h_pool3 = max_pool_2x2(laten_code)


        # 3rd Deconv layer
        W_dconv3 = W_conv3
        b_dconv3 = bias_variable([16])
        h_dconv3 = tf.nn.relu(tf.nn.conv2d_transpose(self.laten_code, W_dconv3, 
                                                tf.stack([tf.shape(self.x_input)[0], self.shapes[2][1], 
                                                          self.shapes[2][2], self.shapes[2][3]]),
                                                strides=[1, 2, 2, 1], padding='SAME' ) + b_dconv3)
        print("h_dconv3 dimension: %s" % h_dconv3.get_shape().as_list())

        # 2nd Deconv layer
        W_dconv2 = W_conv2
        b_dconv2 = bias_variable([32])
        h_dconv2 = tf.nn.relu(tf.nn.conv2d_transpose(h_dconv3, W_dconv2,
                                                tf.stack([tf.shape(self.x_input)[0], self.shapes[1][1], 
                                                          self.shapes[1][2], self.shapes[1][3]]),
                                                strides=[1, 2, 2, 1], padding='SAME' ) + b_dconv2)

        print("h_dconv2 dimension: %s" % h_dconv2.get_shape())

        # 1st Deconv layer to reconstruct the image
        W_dconv1 = W_conv1
        b_dconv1 = bias_variable([1])
        self.reconstructed_img = tf.nn.relu(tf.nn.conv2d_transpose(h_dconv2, W_dconv1,
                                                tf.stack([tf.shape(self.x_input)[0], self.shapes[0][1], 
                                                          self.shapes[0][2], self.shapes[0][3]]),
                                                strides=[1, 2, 2, 1], padding='SAME' ) + b_dconv1)
        print("reconstructed_img dimension: %s" % self.reconstructed_img.get_shape())


        # Define the cost function
        self.cost = tf.reduce_sum(tf.square(self.reconstructed_img - self.x_image))
    
        #return {'input': x_input, 'laten_code': laten_code, 'reconstructed_img': reconstructed_img, 'cost': cost}
        
    def train(self, n_epochs=10, learning_rate=0.01, batch_size=100):
        self.__build_graph()
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter('./graphs', sess.graph)
            
            # Initialize variables
            sess.run(tf.global_variables_initializer())
            # Training
            for epoch_i in range(n_epochs):
                for batch_i in range(self.MNIST.train.num_examples // batch_size):
                    batch_xs, _ = self.MNIST.train.next_batch(batch_size)
                    train_imgs = np.array([img - self.mean_img for img in batch_xs])
                    sess.run(self.optimizer, feed_dict={self.x_input: train_imgs})
                print(epoch_i, sess.run(self.cost, feed_dict={self.x_input: train_imgs}))
                saver.save(sess, 'checkpoints/checkpoint', global_step=self.global_step)
            writer.close()

                
    def test(self):
        # Plot example reconstructions
        n_examples = 10
        imgs, raw_labels = self.MNIST.test.next_batch(n_examples)
        labels = [np.nonzero(label)[0][0] for label in raw_labels]
        test_imgs = np.array([img - self.mean_img for img in imgs])
        with tf.Session() as sess:
            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            # Check whether we have check points from training
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
            if ckpt and ckpt.model_checkpoint_path:
                pass
            else:
                train()
            initial_step = self.global_step.eval()
            saver.restore(sess, ckpt.model_checkpoint_path)
            restored_imgs = sess.run(self.reconstructed_img, feed_dict={self.x_input: test_imgs})
            
        fig, axs = plt.subplots(2, n_examples, figsize=(10, 2))
        for i in range(n_examples):
            axs[0][i].set_title(labels[i])
            axs[0][i].imshow(
                np.reshape(imgs[i, :], (28, 28)), cmap=plt.get_cmap('gray'))
            axs[0][i].set_axis_off()
            axs[1][i].imshow(
                np.reshape(
                    np.reshape(restored_imgs[i, ...], (784,)) + self.mean_img,
                    (28, 28)), cmap=plt.get_cmap('gray'))
            axs[1][i].set_axis_off()
        plt.show()
In [4]:
# Create CAE and train it with MNIST data
cae = AutoEncoder(MNIST=MNIST)
cae.train(n_epochs=20)
x_image shape: [None, 28, 28, 1]
h_conv1 dimension: [None, 14, 14, 32]
h_conv2 dimension: [None, 7, 7, 16]
h_conv3 dimension: [None, 4, 4, 8]
h_dconv3 dimension: [None, None, None, 16]
h_dconv2 dimension: (?, ?, ?, 32)
reconstructed_img dimension: (?, ?, ?, ?)
(0, 5696.1021)
(1, 4914.9907)
(2, 4715.1685)
(3, 4630.8564)
(4, 4587.5801)
(5, 4545.1592)
(6, 4513.7646)
(7, 4248.1909)
(8, 4197.1558)
(9, 3786.3115)
(10, 3818.7854)
(11, 3735.7927)
(12, 3750.4924)
(13, 3894.1965)
(14, 3990.6133)
(15, 973.03845)
(16, 932.56592)
(17, 832.84894)
(18, 726.32324)
(19, 710.25317)
In [5]:
# Plot randomly selected MNIST test images and reconstructed images
cae.test()
INFO:tensorflow:Restoring parameters from checkpoints/checkpoint-0
In [ ]:
 

Published: July 20 2017

  • category:
blog comments powered by Disqus