Commit eec79382 authored by Evan Lezar, committed by Martin Wicke

Fix bug in transformer model: Changed the examples to specify the output size (#155)

* Changed the examples to specify the output size instead of the downsample_factor.

This is required by PR #57

* Address flake8 errors.

* Update readme and parameter descriptions.
parent 76f567df
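The user-visible change in this commit: transformer() now takes an explicit (height, width) tuple instead of a downsample_factor. A minimal sketch of the new call site, using the variable names from the cluttered-MNIST example in the diff below:

out_size = (40, 40)  # explicit (height, width); replaces downsample_factor=1
h_trans = transformer(x_tensor, h_fc_loc2, out_size)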
@@ -28,13 +28,8 @@ transformer(U, theta, downsample_factor=1)
     theta: float
         The output of the
         localisation network should be [num_batch, 6].
-    downsample_factor : float
-        A value of 1 will keep the original size of the image
-        Values larger than 1 will downsample the image.
-        Values below 1 will upsample the image
-        example image: height = 100, width = 200
-        downsample_factor = 2
-        output image will then be 50, 100
+    out_size: tuple of two ints
+        The size of the output of the network

 #### Notes
@@ -11,13 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+# =============================================================================
 import tensorflow as tf
 from spatial_transformer import transformer
-from scipy import ndimage
 import numpy as np
-import matplotlib.pyplot as plt
-from tf_utils import conv2d, linear, weight_variable, bias_variable, dense_to_one_hot
+from tf_utils import weight_variable, bias_variable, dense_to_one_hot

 # %% Load data
 mnist_cluttered = np.load('./data/mnist_sequence1_sample_5distortions5x5.npz')
@@ -48,13 +46,15 @@ y = tf.placeholder(tf.float32, [None, 10])
 # dimension should not change size.
 x_tensor = tf.reshape(x, [-1, 40, 40, 1])

-# %% We'll setup the two-layer localisation network to figure out the parameters for an affine transformation of the input
+# %% We'll setup the two-layer localisation network to figure out the
+# %% parameters for an affine transformation of the input

 # %% Create variables for fully connected layer
 W_fc_loc1 = weight_variable([1600, 20])
 b_fc_loc1 = bias_variable([20])

 W_fc_loc2 = weight_variable([20, 6])
-initial = np.array([[1.,0, 0],[0,1.,0]]) # Use identity transformation as starting point
+# Use identity transformation as starting point
+initial = np.array([[1., 0, 0], [0, 1., 0]])
 initial = initial.astype('float32')
 initial = initial.flatten()
 b_fc_loc2 = tf.Variable(initial_value=initial, name='b_fc_loc2')
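Why the identity initialisation above matters: seeding b_fc_loc2 with the flattened identity transform means the localisation network initially outputs theta close to [1, 0, 0, 0, 1, 0] (the bias dominates while the learned weights are still small), so the transformer starts training by sampling the input unchanged rather than from an arbitrary grid. A standalone NumPy sketch of that starting point:

import numpy as np

# Flattened 2x3 affine identity: x' = 1*x + 0*y + 0, y' = 0*x + 1*y + 0
identity_theta = np.array([[1., 0, 0], [0, 1., 0]], dtype='float32').flatten()
print(identity_theta)  # [ 1.  0.  0.  0.  1.  0.]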
@@ -67,8 +67,10 @@ h_fc_loc1_drop = tf.nn.dropout(h_fc_loc1, keep_prob)
 # %% Second layer
 h_fc_loc2 = tf.nn.tanh(tf.matmul(h_fc_loc1_drop, W_fc_loc2) + b_fc_loc2)

-# %% We'll create a spatial transformer module to identify discriminative patches
-h_trans = transformer(x_tensor, h_fc_loc2, downsample_factor=1)
+# %% We'll create a spatial transformer module to identify discriminative
+# %% patches
+out_size = (40, 40)
+h_trans = transformer(x_tensor, h_fc_loc2, out_size)

 # %% We'll setup the first convolutional layer
 # Weight matrix is [height x width x input_channels x output_channels]
@@ -140,7 +142,7 @@ iter_per_epoch = 100
 n_epochs = 500
 train_size = 10000

-indices = np.linspace(0,10000 - 1,iter_per_epoch)
+indices = np.linspace(0, 10000 - 1, iter_per_epoch)
 indices = indices.astype('int')

 for epoch_i in range(n_epochs):
@@ -160,13 +162,12 @@ for epoch_i in range(n_epochs):
         sess.run(optimizer, feed_dict={
             x: batch_xs, y: batch_ys, keep_prob: 0.8})

-    print('Accuracy: ' + str(sess.run(accuracy,
+    print('Accuracy (%d): ' % epoch_i + str(sess.run(accuracy,
                                       feed_dict={
                                           x: X_valid,
                                           y: Y_valid,
                                           keep_prob: 1.0
                                       })))
-    #theta = sess.run(h_fc_loc2, feed_dict={
-    #    x: batch_xs, keep_prob: 1.0})
-    #print(theta[0])
+    # theta = sess.run(h_fc_loc2, feed_dict={
+    #     x: batch_xs, keep_prob: 1.0})
+    # print(theta[0])
@@ -17,22 +17,25 @@ from spatial_transformer import transformer
 from scipy import ndimage
 import numpy as np
 import matplotlib.pyplot as plt
-from tf_utils import conv2d, linear, weight_variable, bias_variable

 # %% Create a batch of three images (1600 x 1200)
-# %% Image retrieved from https://raw.githubusercontent.com/skaae/transformer_network/master/cat.jpg
+# %% Image retrieved from:
+# %% https://raw.githubusercontent.com/skaae/transformer_network/master/cat.jpg
 im = ndimage.imread('cat.jpg')
 im = im / 255.
 im = im.reshape(1, 1200, 1600, 3)
 im = im.astype('float32')

+# %% Let the output size of the transformer be half the image size.
+out_size = (600, 800)
+
 # %% Simulate batch
 batch = np.append(im, im, axis=0)
 batch = np.append(batch, im, axis=0)
 num_batch = 3

 x = tf.placeholder(tf.float32, [None, 1200, 1600, 3])
-x = tf.cast(batch,'float32')
+x = tf.cast(batch, 'float32')

 # %% Create localisation network and convolutional layer
 with tf.variable_scope('spatial_transformer_0'):
@@ -42,13 +45,13 @@ with tf.variable_scope('spatial_transformer_0'):
     W_fc1 = tf.Variable(tf.zeros([1200 * 1600 * 3, n_fc]), name='W_fc1')

     # %% Zoom into the image
-    initial = np.array([[0.5,0, 0],[0,0.5,0]])
+    initial = np.array([[0.5, 0, 0], [0, 0.5, 0]])
     initial = initial.astype('float32')
     initial = initial.flatten()

     b_fc1 = tf.Variable(initial_value=initial, name='b_fc1')
-    h_fc1 = tf.matmul(tf.zeros([num_batch ,1200 * 1600 * 3]), W_fc1) + b_fc1
-    h_trans = transformer(x, h_fc1, downsample_factor=2)
+    h_fc1 = tf.matmul(tf.zeros([num_batch, 1200 * 1600 * 3]), W_fc1) + b_fc1
+    h_trans = transformer(x, h_fc1, out_size)

 # %% Run session
 sess = tf.Session()
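Note how the two concerns separate in the new API: the 0.5 scale baked into b_fc1 is what actually zooms into the cat image, while out_size = (600, 800) only fixes how many points the output grid samples. The old downsample_factor=2 conflated both. A hedged sketch of the correspondence, reusing the names from the example above:

# Before: one argument implied both the transform and the grid size.
# h_trans = transformer(x, h_fc1, downsample_factor=2)
# After: theta (biased here to scale 0.5) decides what is sampled;
# out_size decides the resolution it is sampled at.
# h_trans = transformer(x, h_fc1, (600, 800))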
@@ -14,6 +14,7 @@
 # ==============================================================================
 import tensorflow as tf

+
 def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
     """Spatial Transformer Layer
@@ -28,8 +29,8 @@ def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
     theta: float
         The output of the
         localisation network should be [num_batch, 6].
-    out_size: tuple of two floats
-        The size of the output of the network
+    out_size: tuple of two ints
+        The size of the output of the network (height, width)

     References
     ----------
@@ -51,10 +52,11 @@ def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
     def _repeat(x, n_repeats):
         with tf.variable_scope('_repeat'):
-            rep = tf.transpose(tf.expand_dims(tf.ones(shape=tf.pack([n_repeats,])),1),[1,0])
+            rep = tf.transpose(
+                tf.expand_dims(tf.ones(shape=tf.pack([n_repeats, ])), 1), [1, 0])
             rep = tf.cast(rep, 'int32')
-            x = tf.matmul(tf.reshape(x,(-1, 1)), rep)
-            return tf.reshape(x,[-1])
+            x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
+            return tf.reshape(x, [-1])

     def _interpolate(im, x, y, out_size):
         with tf.variable_scope('_interpolate'):
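What _repeat computes, sketched standalone in NumPy for clarity (the TensorFlow code above does the same via a matmul of a column vector against a row of ones):

import numpy as np

def repeat(x, n_repeats):
    # Outer product against a row of ones, then flatten:
    # repeat([0, 1], 3) -> [0, 0, 0, 1, 1, 1]
    x = np.reshape(np.asarray(x), (-1, 1))
    rep = np.ones((1, n_repeats), dtype=x.dtype)
    return np.reshape(x * rep, [-1])

print(repeat([0, 1], 3))  # [0 0 0 1 1 1]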
@@ -98,8 +100,9 @@ def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
             idx_c = base_y0 + x1
             idx_d = base_y1 + x1

-            # use indices to lookup pixels in the flat image and restore channels dim
-            im_flat = tf.reshape(im,tf.pack([-1, channels]))
+            # use indices to lookup pixels in the flat image and restore
+            # channels dim
+            im_flat = tf.reshape(im, tf.pack([-1, channels]))
             im_flat = tf.cast(im_flat, 'float32')
             Ia = tf.gather(im_flat, idx_a)
             Ib = tf.gather(im_flat, idx_b)
@@ -111,10 +114,10 @@ def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
             x1_f = tf.cast(x1, 'float32')
             y0_f = tf.cast(y0, 'float32')
             y1_f = tf.cast(y1, 'float32')
-            wa = tf.expand_dims(((x1_f-x) * (y1_f-y)),1)
-            wb = tf.expand_dims(((x1_f-x) * (y-y0_f)),1)
-            wc = tf.expand_dims(((x-x0_f) * (y1_f-y)),1)
-            wd = tf.expand_dims(((x-x0_f) * (y-y0_f)),1)
+            wa = tf.expand_dims(((x1_f-x) * (y1_f-y)), 1)
+            wb = tf.expand_dims(((x1_f-x) * (y-y0_f)), 1)
+            wc = tf.expand_dims(((x-x0_f) * (y1_f-y)), 1)
+            wd = tf.expand_dims(((x-x0_f) * (y-y0_f)), 1)
             output = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])
             return output
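The four weights above are standard bilinear interpolation: each is the area of the sub-rectangle opposite the corresponding corner pixel, and they always sum to 1. A quick standalone check with concrete numbers:

# Sample point (x, y) between integer corners (x0, y0) and (x1, y1).
x, y = 2.25, 7.6
x0, y0 = int(x), int(y)   # 2, 7
x1, y1 = x0 + 1, y0 + 1   # 3, 8
wa = (x1 - x) * (y1 - y)  # 0.75 * 0.4 = 0.30, weight of pixel (x0, y0)
wb = (x1 - x) * (y - y0)  # 0.75 * 0.6 = 0.45, weight of pixel (x0, y1)
wc = (x - x0) * (y1 - y)  # 0.25 * 0.4 = 0.10, weight of pixel (x1, y0)
wd = (x - x0) * (y - y0)  # 0.25 * 0.6 = 0.15, weight of pixel (x1, y1)
assert abs(wa + wb + wc + wd - 1.0) < 1e-9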
@@ -126,12 +129,12 @@ def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
         # ones = np.ones(np.prod(x_t.shape))
         # grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
         x_t = tf.matmul(tf.ones(shape=tf.pack([height, 1])),
-                        tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width),1),[1,0]))
-        y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height),1),
+                        tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
+        y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
                         tf.ones(shape=tf.pack([1, width])))

-        x_t_flat = tf.reshape(x_t,(1, -1))
-        y_t_flat = tf.reshape(y_t,(1, -1))
+        x_t_flat = tf.reshape(x_t, (1, -1))
+        y_t_flat = tf.reshape(y_t, (1, -1))

         ones = tf.ones_like(x_t_flat)
         grid = tf.concat(0, [x_t_flat, y_t_flat, ones])
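For intuition, here is the NumPy equivalent hinted at in the commented lines above: _meshgrid produces a 3 x (height*width) matrix of homogeneous target coordinates in the normalised range [-1, 1], which the 2x3 affine theta is then multiplied against:

import numpy as np

height, width = 2, 3
x_t, y_t = np.meshgrid(np.linspace(-1, 1, width), np.linspace(-1, 1, height))
grid = np.vstack([x_t.flatten(), y_t.flatten(), np.ones(height * width)])
print(grid)
# [[-1.  0.  1. -1.  0.  1.]
#  [-1. -1. -1.  1.  1.  1.]
#  [ 1.  1.  1.  1.  1.  1.]]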
@@ -152,29 +155,31 @@ def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
         out_height = out_size[0]
         out_width = out_size[1]
         grid = _meshgrid(out_height, out_width)
-        grid = tf.expand_dims(grid,0)
-        grid = tf.reshape(grid,[-1])
-        grid = tf.tile(grid,tf.pack([num_batch]))
-        grid = tf.reshape(grid,tf.pack([num_batch, 3, -1]))
+        grid = tf.expand_dims(grid, 0)
+        grid = tf.reshape(grid, [-1])
+        grid = tf.tile(grid, tf.pack([num_batch]))
+        grid = tf.reshape(grid, tf.pack([num_batch, 3, -1]))

         # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s)
         T_g = tf.batch_matmul(theta, grid)
-        x_s = tf.slice(T_g, [0,0,0], [-1,1,-1])
-        y_s = tf.slice(T_g, [0,1,0], [-1,1,-1])
-        x_s_flat = tf.reshape(x_s,[-1])
-        y_s_flat = tf.reshape(y_s,[-1])
+        x_s = tf.slice(T_g, [0, 0, 0], [-1, 1, -1])
+        y_s = tf.slice(T_g, [0, 1, 0], [-1, 1, -1])
+        x_s_flat = tf.reshape(x_s, [-1])
+        y_s_flat = tf.reshape(y_s, [-1])

         input_transformed = _interpolate(
             input_dim, x_s_flat, y_s_flat,
             out_size)

-        output = tf.reshape(input_transformed, tf.pack([num_batch, out_height, out_width, num_channels]))
+        output = tf.reshape(
+            input_transformed, tf.pack([num_batch, out_height, out_width, num_channels]))
         return output

     with tf.variable_scope(name):
         output = _transform(theta, U, out_size)
         return output

 def batch_transformer(U, thetas, out_size, name='BatchSpatialTransformer'):
     """Batch Spatial Transformer Layer

@@ -196,4 +201,3 @@ def batch_transformer(U, thetas, out_size, name='BatchSpatialTransformer'):
         indices = [[i]*num_transforms for i in xrange(num_batch)]
         input_repeated = tf.gather(U, tf.reshape(indices, [-1]))
         return transformer(input_repeated, thetas, out_size)
-
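A hypothetical usage sketch for batch_transformer, with shapes inferred from the gather above (Python 2 / TF 0.x era API, matching the xrange and tf.pack calls in this file): every input image is repeated once per transform, so several affine crops can be pulled from each image in a single call:

num_batch, num_transforms = 2, 3
U = tf.placeholder(tf.float32, [num_batch, 40, 40, 1])
# One flattened 2x3 affine matrix per (image, transform) pair.
thetas = tf.placeholder(tf.float32, [num_batch, num_transforms, 6])
patches = batch_transformer(U, thetas, (20, 20))
# patches has shape [num_batch * num_transforms, 20, 20, 1]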