Initial tf-slim checkin (#349)

a5c4fd06 · nathansilberman · Martin Wicke · a5594334 · a5c4fd06 · a5c4fd06
Commit a5c4fd06 authored Aug 26, 2016 by nathansilberman Committed by Martin Wicke Aug 26, 2016
4 changed files
--- a/slim/models/vgg_preprocessing.py
+++ b/slim/models/vgg_preprocessing.py
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides utilities to preprocess images.
+The preprocessing steps for VGG were introduced in the following technical
+report:
+  Very Deep Convolutional Networks For Large-Scale Image Recognition
+  Karen Simonyan and Andrew Zisserman
+  arXiv technical report, 2015
+  PDF: http://arxiv.org/pdf/1409.1556.pdf
+  ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
+  CC-BY-4.0
+More information can be obtained from the VGG website:
+www.robots.ox.ac.uk/~vgg/research/very_deep/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+from tensorflow.python.ops import control_flow_ops
+slim = tf.contrib.slim
+_R_MEAN = 123.68
+_G_MEAN = 116.78
+_B_MEAN = 103.94
+_RESIZE_SIDE_MIN = 256
+_RESIZE_SIDE_MAX = 512
+def _crop(image, offset_height, offset_width, crop_height, crop_width):
+  """Crops the given image using the provided offsets and sizes.
+  Note that the method doesn't assume we know the input image size but it does
+  assume we know the input image rank.
+  Args:
+    image: an image of shape [height, width, channels].
+    offset_height: a scalar tensor indicating the height offset.
+    offset_width: a scalar tensor indicating the width offset.
+    crop_height: the height of the cropped image.
+    crop_width: the width of the cropped image.
+  Returns:
+    the cropped (and resized) image.
+  Raises:
+    InvalidArgumentError: if the rank is not 3 or if the image dimensions are
+      less than the crop size.
+  """
+  original_shape = tf.shape(image)
+  rank_assertion = tf.Assert(
+      tf.equal(tf.rank(image), 3),
+      ['Rank of image must be equal to 3.'])
+  cropped_shape = control_flow_ops.with_dependencies(
+      [rank_assertion],
+      tf.pack([crop_height, crop_width, original_shape[2]]))
+  size_assertion = tf.Assert(
+      tf.logical_and(
+          tf.greater_equal(original_shape[0], crop_height),
+          tf.greater_equal(original_shape[1], crop_width)),
+      ['Crop size greater than the image size.'])
+  offsets = tf.to_int32(tf.pack([offset_height, offset_width, 0]))
+  # Use tf.slice instead of crop_to_bounding box as it accepts tensors to
+  # define the crop size.
+  image = control_flow_ops.with_dependencies(
+      [size_assertion],
+      tf.slice(image, offsets, cropped_shape))
+  return tf.reshape(image, cropped_shape)
+def _random_crop(image_list, crop_height, crop_width):
+  """Crops the given list of images.
+  The function applies the same crop to each image in the list. This can be
+  effectively applied when there are multiple image inputs of the same
+  dimension such as:
+    image, depths, normals = _random_crop([image, depths, normals], 120, 150)
+  Args:
+    image_list: a list of image tensors of the same dimension but possibly
+      varying channel.
+    crop_height: the new height.
+    crop_width: the new width.
+  Returns:
+    the image_list with cropped images.
+  Raises:
+    ValueError: if there are multiple image inputs provided with different size
+      or the images are smaller than the crop dimensions.
+  """
+  if not image_list:
+    raise ValueError('Empty image_list.')
+  # Compute the rank assertions.
+  rank_assertions = []
+  for i in range(len(image_list)):
+    image_rank = tf.rank(image_list[i])
+    rank_assert = tf.Assert(
+        tf.equal(image_rank, 3),
+        ['Wrong rank for tensor  %s [expected] [actual]',
+         image_list[i].name, 3, image_rank])
+    rank_assertions.append(rank_assert)
+  image_shape = control_flow_ops.with_dependencies(
+      [rank_assertions[0]],
+      tf.shape(image_list[0]))
+  image_height = image_shape[0]
+  image_width = image_shape[1]
+  crop_size_assert = tf.Assert(
+      tf.logical_and(
+          tf.greater_equal(image_height, crop_height),
+          tf.greater_equal(image_width, crop_width)),
+      ['Crop size greater than the image size.'])
+  asserts = [rank_assertions[0], crop_size_assert]
+  for i in range(1, len(image_list)):
+    image = image_list[i]
+    asserts.append(rank_assertions[i])
+    shape = control_flow_ops.with_dependencies([rank_assertions[i]],
+                                               tf.shape(image))
+    height = shape[0]
+    width = shape[1]
+    height_assert = tf.Assert(
+        tf.equal(height, image_height),
+        ['Wrong height for tensor %s [expected][actual]',
+         image.name, height, image_height])
+    width_assert = tf.Assert(
+        tf.equal(width, image_width),
+        ['Wrong width for tensor %s [expected][actual]',
+         image.name, width, image_width])
+    asserts.extend([height_assert, width_assert])
+  # Create a random bounding box.
+  #
+  # Use tf.random_uniform and not numpy.random.rand as doing the former would
+  # generate random numbers at graph eval time, unlike the latter which
+  # generates random numbers at graph definition time.
+  max_offset_height = control_flow_ops.with_dependencies(
+      asserts, tf.reshape(image_height - crop_height + 1, []))
+  max_offset_width = control_flow_ops.with_dependencies(
+      asserts, tf.reshape(image_width - crop_width + 1, []))
+  offset_height = tf.random_uniform(
+      [], maxval=max_offset_height, dtype=tf.int32)
+  offset_width = tf.random_uniform(
+      [], maxval=max_offset_width, dtype=tf.int32)
+  return [_crop(image, offset_height, offset_width,
+                crop_height, crop_width) for image in image_list]
+def _central_crop(image_list, crop_height, crop_width):
+  """Performs central crops of the given image list.
+  Args:
+    image_list: a list of image tensors of the same dimension but possibly
+      varying channel.
+    crop_height: the height of the image following the crop.
+    crop_width: the width of the image following the crop.
+  Returns:
+    the list of cropped images.
+  """
+  outputs = []
+  for image in image_list:
+    image_height = tf.shape(image)[0]
+    image_width = tf.shape(image)[1]
+    offset_height = (image_height - crop_height) / 2
+    offset_width = (image_width - crop_width) / 2
+    outputs.append(_crop(image, offset_height, offset_width,
+                         crop_height, crop_width))
+  return outputs
+def _mean_image_subtraction(image, means):
+  """Subtracts the given means from each image channel.
+  For example:
+    means = [123.68, 116.779, 103.939]
+    image = _mean_image_subtraction(image, means)
+  Note that the rank of `image` must be known.
+  Args:
+    image: a tensor of size [height, width, C].
+    means: a C-vector of values to subtract from each channel.
+  Returns:
+    the centered image.
+  Raises:
+    ValueError: If the rank of `image` is unknown, if `image` has a rank other
+      than three or if the number of channels in `image` doesn't match the
+      number of values in `means`.
+  """
+  if image.get_shape().ndims != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+  num_channels = image.get_shape().as_list()[-1]
+  if len(means) != num_channels:
+    raise ValueError('len(means) must match the number of channels')
+  channels = tf.split(2, num_channels, image)
+  for i in range(num_channels):
+    channels[i] -= means[i]
+  return tf.concat(2, channels)
+def _smallest_size_at_least(height, width, smallest_side):
+  """Computes new shape with the smallest side equal to `smallest_side`.
+  Computes new shape with the smallest side equal to `smallest_side` while
+  preserving the original aspect ratio.
+  Args:
+    height: an int32 scalar tensor indicating the current height.
+    width: an int32 scalar tensor indicating the current width.
+    smallest_side: A python integer or scalar `Tensor` indicating the size of
+      the smallest side after resize.
+  Returns:
+    new_height: an int32 scalar tensor indicating the new height.
+    new_width: and int32 scalar tensor indicating the new width.
+  """
+  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
+  height = tf.to_float(height)
+  width = tf.to_float(width)
+  smallest_side = tf.to_float(smallest_side)
+  scale = tf.cond(tf.greater(height, width),
+                  lambda: smallest_side / width,
+                  lambda: smallest_side / height)
+  new_height = tf.to_int32(height * scale)
+  new_width = tf.to_int32(width * scale)
+  return new_height, new_width
+def _aspect_preserving_resize(image, smallest_side):
+  """Resize images preserving the original aspect ratio.
+  Args:
+    image: A 3-D image `Tensor`.
+    smallest_side: A python integer or scalar `Tensor` indicating the size of
+      the smallest side after resize.
+  Returns:
+    resized_image: A 3-D tensor containing the resized image.
+  """
+  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
+  shape = tf.shape(image)
+  height = shape[0]
+  width = shape[1]
+  new_height, new_width = _smallest_size_at_least(height, width, smallest_side)
+  image = tf.expand_dims(image, 0)
+  resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
+                                           align_corners=False)
+  resized_image = tf.squeeze(resized_image)
+  resized_image.set_shape([None, None, 3])
+  return resized_image
+def preprocess_for_train(image,
+                         output_height,
+                         output_width,
+                         resize_side_min=_RESIZE_SIDE_MIN,
+                         resize_side_max=_RESIZE_SIDE_MAX):
+  """Preprocesses the given image for training.
+  Note that the actual resizing scale is sampled from
+    [`resize_size_min`, `resize_size_max`].
+  Args:
+    image: A `Tensor` representing an image of arbitrary size.
+    output_height: The height of the image after preprocessing.
+    output_width: The width of the image after preprocessing.
+    resize_side_min: The lower bound for the smallest side of the image for
+      aspect-preserving resizing.
+    resize_side_max: The upper bound for the smallest side of the image for
+      aspect-preserving resizing.
+  Returns:
+    A preprocessed image.
+  """
+  resize_side = tf.random_uniform(
+      [], minval=resize_side_min, maxval=resize_side_max+1, dtype=tf.int32)
+  image = _aspect_preserving_resize(image, resize_side)
+  image = _random_crop([image], output_height, output_width)[0]
+  image.set_shape([output_height, output_width, 3])
+  image = tf.to_float(image)
+  image = tf.image.random_flip_left_right(image)
+  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
+def preprocess_for_eval(image, output_height, output_width, resize_side):
+  """Preprocesses the given image for evaluation.
+  Args:
+    image: A `Tensor` representing an image of arbitrary size.
+    output_height: The height of the image after preprocessing.
+    output_width: The width of the image after preprocessing.
+    resize_side: The smallest side of the image for aspect-preserving resizing.
+  Returns:
+    A preprocessed image.
+  """
+  image = _aspect_preserving_resize(image, resize_side)
+  image = _central_crop([image], output_height, output_width)[0]
+  image.set_shape([output_height, output_width, 3])
+  image = tf.to_float(image)
+  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
+def preprocess_image(image, output_height, output_width, is_training=False,
+                     resize_side_min=_RESIZE_SIDE_MIN,
+                     resize_side_max=_RESIZE_SIDE_MAX):
+  """Preprocesses the given image.
+  Args:
+    image: A `Tensor` representing an image of arbitrary size.
+    output_height: The height of the image after preprocessing.
+    output_width: The width of the image after preprocessing.
+    is_training: `True` if we're preprocessing the image for training and
+      `False` otherwise.
+    resize_side_min: The lower bound for the smallest side of the image for
+      aspect-preserving resizing. If `is_training` is `False`, then this value
+      is used for rescaling.
+    resize_side_max: The upper bound for the smallest side of the image for
+      aspect-preserving resizing. If `is_training` is `False`, this value is
+      ignored. Otherwise, the resize side is sampled from
+        [resize_size_min, resize_size_max].
+  Returns:
+    A preprocessed image.
+  """
+  if is_training:
+    return preprocess_for_train(image, output_height, output_width,
+                                resize_side_min, resize_side_max)
+  else:
+    return preprocess_for_eval(image, output_height, output_width,
+                               resize_side_min)
--- a/slim/nets/lenet.py
+++ b/slim/nets/lenet.py
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains a variant of the LeNet model definition."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+slim = tf.contrib.slim
+def lenet(images, num_classes=10, is_training=False,
+          dropout_keep_prob=0.5,
+          prediction_fn=slim.softmax,
+          scope='LeNet'):
+  """Creates a variant of the LeNet model.
+  Note that since the output is a set of 'logits', the values fall in the
+  interval of (-infinity, infinity). Consequently, to convert the outputs to a
+  probability distribution over the characters, one will need to convert them
+  using the softmax function:
+        logits = mnist.Mnist(images, is_training=False)
+        probabilities = tf.nn.softmax(logits)
+        predictions = tf.argmax(logits, 1)
+  Args:
+    images: A batch of `Tensors` of size [batch_size, height, width, channels].
+    num_classes: the number of classes in the dataset.
+    is_training: specifies whether or not we're currently training the model.
+      This variable will determine the behaviour of the dropout layer.
+    dropout_keep_prob: the percentage of activation values that are retained.
+    prediction_fn: a function to get predictions out of logits.
+    scope: Optional variable_scope.
+  Returns:
+    logits: the pre-softmax activations, a tensor of size
+      [batch_size, `num_classes`]
+    end_points: a dictionary from components of the network to the corresponding
+      activation.
+  """
+  end_points = {}
+  with tf.variable_scope(scope, 'LeNet', [images, num_classes]):
+    net = slim.conv2d(images, 32, [5, 5], scope='conv1')
+    net = slim.max_pool2d(net, [2, 2], 2, scope='pool1')
+    net = slim.conv2d(net, 64, [5, 5], scope='conv2')
+    net = slim.max_pool2d(net, [2, 2], 2, scope='pool2')
+    net = slim.flatten(net)
+    end_points['Flatten'] = net
+    net = slim.fully_connected(net, 1024, scope='fc3')
+    net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
+                       scope='dropout3')
+    logits = slim.fully_connected(net, num_classes, activation_fn=None,
+                                  scope='fc4')
+  end_points['Logits'] = logits
+  end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+  return logits, end_points
+lenet.default_image_size = 28
+def lenet_arg_scope(weight_decay=0.0):
+  """Defines the default lenet argument scope.
+  Args:
+    weight_decay: The weight decay to use for regularizing the model.
+  Returns:
+    An `arg_scope` to use for the inception v3 model.
+  """
+  with slim.arg_scope(
+      [slim.conv2d, slim.fully_connected],
+      weights_regularizer=slim.l2_regularizer(weight_decay),
+      weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
+      activation_fn=tf.nn.relu) as sc:
+    return sc
--- a/slim/scripts/train_lenet_on_mnist.sh
+++ b/slim/scripts/train_lenet_on_mnist.sh
+#!/bin/bash
+#
+# Before running this script, make sure you've followed the instructions for
+# downloading and converting the MNIST dataset.
+# See slim/datasets/download_and_convert_mnist.py.
+#
+# Usage:
+# ./slim/scripts/train_lenet_on_mnist.sh
+# Compile the training and evaluation binaries
+bazel build slim:train
+bazel build slim:eval
+# Where the checkpoint and logs will be saved to.
+TRAIN_DIR=/tmp/lenet-model
+# Where the dataset was saved to.
+DATASET_DIR=/tmp/mnist
+# Run training.
+./bazel-bin/slim/train \
+  --train_dir=${TRAIN_DIR} \
+  --dataset_name=mnist \
+  --dataset_split_name=train \
+  --dataset_dir=${DATASET_DIR} \
+  --model_name=lenet \
+  --preprocessing_name=lenet \
+  --max_number_of_steps=20000 \
+  --learning_rate=0.01 \
+  --save_interval_secs=60 \
+  --save_summaries_secs=60 \
+  --optimizer=sgd \
+  --learning_rate_decay_factor=1.0
+  --weight_decay=0
+# Run evaluation.
+./blaze-bin/slim/eval \
+  --checkpoint_path=${TRAIN_DIR} \
+  --eval_dir=${TRAIN_DIR} \
+  --dataset_name=mnist \
+  --dataset_split_name=test \
+  --dataset_dir=${DATASET_DIR} \
+  --model_name=lenet
--- a/slim/train.py
+++ b/slim/train.py