Preprocessing tuning for resnet (#3558)

86b1f07b · Karmel Allison · GitHub · 376dc8dd · 86b1f07b · 86b1f07b
Unverified Commit 86b1f07b authored Mar 13, 2018 by Karmel Allison Committed by GitHub Mar 13, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 314 additions and 22 deletions

official/resnet/imagenet_main.py official/resnet/imagenet_main.py +57 -22

official/resnet/imagenet_preprocessing.py official/resnet/imagenet_preprocessing.py +257 -0

No files found.
--- a/official/resnet/imagenet_main.py
+++ b/official/resnet/imagenet_main.py
@@ -24,7 +24,7 @@ import sys
 import tensorflow as tf

 from official.resnet import resnet
-from official.resnet import vgg_preprocessing
+from official.resnet import imagenet_preprocessing

 _DEFAULT_IMAGE_SIZE = 224
 _NUM_CHANNELS = 3
@@ -57,9 +57,25 @@ def get_filenames(is_training, data_dir):
 def _parse_example_proto(example_serialized):
  """Parses an Example proto containing a training example of an image.

-  The dataset contains serialized Example protocol buffers.
-  The Example proto is expected to contain features named
-  image/encoded (a JPEG-encoded string) and image/class/label (int)
+  The output of the build_image_data.py image preprocessing script is a dataset
+  containing serialized Example protocol buffers. Each Example proto contains
+  the following fields (values are included as examples):
+
+    image/height: 462
+    image/width: 581
+    image/colorspace: 'RGB'
+    image/channels: 3
+    image/class/label: 615
+    image/class/synset: 'n03623198'
+    image/class/text: 'knee pad'
+    image/object/bbox/xmin: 0.1
+    image/object/bbox/xmax: 0.9
+    image/object/bbox/ymin: 0.2
+    image/object/bbox/ymax: 0.6
+    image/object/bbox/label: 615
+    image/format: 'JPEG'
+    image/filename: 'ILSVRC2012_val_00041207.JPEG'
+    image/encoded: <JPEG encoded string>

  Args:
    example_serialized: scalar Tensor tf.string containing a serialized
@@ -67,19 +83,45 @@ def _parse_example_proto(example_serialized):

  Returns:
    image_buffer: Tensor tf.string containing the contents of a JPEG file.
-    label: Tensor tf.int64 containing the label.
+    label: Tensor tf.int32 containing the label.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
  """
  # Dense features in Example proto.
  feature_map = {
      'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
                                          default_value=''),
      'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
-                                              default_value=-1)
+                                              default_value=-1),
+      'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
+                                             default_value=''),
  }
+  sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
+  # Sparse features in Example proto.
+  feature_map.update(
+      {k: sparse_float32 for k in ['image/object/bbox/xmin',
+                                   'image/object/bbox/ymin',
+                                   'image/object/bbox/xmax',
+                                   'image/object/bbox/ymax']})

  features = tf.parse_single_example(example_serialized, feature_map)
+  label = tf.cast(features['image/class/label'], dtype=tf.int32)

-  return features['image/encoded'], features['image/class/label']
+  xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
+  ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
+  xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
+  ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
+
+  # Note the (y, x) ordering, as expected by the tf.image bounding box ops.
+  bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
+
+  # Force the variable number of bounding boxes into the shape
+  # [1, num_boxes, coords].
+  bbox = tf.expand_dims(bbox, 0)
+  bbox = tf.transpose(bbox, [0, 2, 1])
+
+  return features['image/encoded'], label, bbox


 def parse_record(raw_record, is_training):
@@ -95,25 +137,18 @@ def parse_record(raw_record, is_training):

  Returns:
    Tuple with processed image tensor and one-hot-encoded label tensor.
-"""
-  image, label = _parse_example_proto(raw_record)
-
-  # Decode the string as an RGB JPEG.
-  # Note that the resulting image contains an unknown height and width
-  # that is set dynamically by decode_jpeg. In other words, the height
-  # and width of image is unknown at compile-time.
-  # Results in a 3-D int8 Tensor. This will be converted to a float later,
-  # during resizing.
-  image = tf.image.decode_jpeg(image, channels=_NUM_CHANNELS)
-
-  image = vgg_preprocessing.preprocess_image(
-      image=image,
+  """
+  image_buffer, label, bbox = _parse_example_proto(raw_record)
+
+  image = imagenet_preprocessing.preprocess_image(
+      image_buffer=image_buffer,
+      bbox=bbox,
      output_height=_DEFAULT_IMAGE_SIZE,
      output_width=_DEFAULT_IMAGE_SIZE,
+      num_channels=_NUM_CHANNELS,
      is_training=is_training)

-  label = tf.cast(tf.reshape(label, shape=[]), dtype=tf.int32)
-  label = tf.one_hot(label, _NUM_CLASSES)
+  label = tf.one_hot(tf.reshape(label, shape=[]), _NUM_CLASSES)

  return image, label


--- a/official/resnet/vgg_preprocessing.py
+++ b/official/resnet/vgg_preprocessing.py
@@ -14,18 +14,21 @@
 # ==============================================================================
 """Provides utilities to preprocess images.

-The preprocessing steps for VGG were introduced in the following technical
-report:
-
-  Very Deep Convolutional Networks For Large-Scale Image Recognition
-  Karen Simonyan and Andrew Zisserman
-  arXiv technical report, 2015
-  PDF: http://arxiv.org/pdf/1409.1556.pdf
-  ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
-  CC-BY-4.0
-
-More information can be obtained from the VGG website:
-www.robots.ox.ac.uk/~vgg/research/very_deep/
+Training images are sampled using the provided bounding boxes, and subsequently
+cropped to the sampled bounding box. Images are additionally flipped randomly,
+then resized to the target output size (without aspect-ratio preservation).
+
+Images used during evaluation are resized (with aspect-ratio preservation) and
+centrally cropped.
+
+All images undergo mean color subtraction.
+
+Note that these steps are colloquially referred to as "ResNet preprocessing,"
+and they differ from "VGG preprocessing," which does not use bounding boxes
+and instead does an aspect-preserving resize followed by random crop during
+training. (These both differ from "Inception preprocessing," which introduces
+color distortion steps.)
+
 """

 from __future__ import absolute_import
@@ -37,44 +40,59 @@ import tensorflow as tf
 _R_MEAN = 123.68
 _G_MEAN = 116.78
 _B_MEAN = 103.94
+_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]

-_RESIZE_SIDE_MIN = 256
-_RESIZE_SIDE_MAX = 512
-
-
-def _get_h_w(image):
-  """Convenience for grabbing the height and width of an image.
-  """
-  shape = tf.shape(image)
-  return shape[0], shape[1]
+# The lower bound for the smallest side of the image for aspect-preserving
+# resizing. For example, if an image is 500 x 1000, it will be resized to
+# _RESIZE_MIN x (_RESIZE_MIN * 2).
+_RESIZE_MIN = 256


-def _random_crop_and_flip(image, crop_height, crop_width):
+def _decode_crop_and_flip(image_buffer, bbox, num_channels):
  """Crops the given image to a random part of the image, and randomly flips.

+  We use the fused decode_and_crop op, which performs better than the two ops
+  used separately in series, but note that this requires that the image be
+  passed in as an un-decoded string Tensor.
+
  Args:
-    image: a 3-D image tensor
-    crop_height: the new height.
-    crop_width: the new width.
+    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
+    num_channels: Integer depth of the image buffer for decoding.

  Returns:
    3-D tensor with cropped image.

  """
-  height, width = _get_h_w(image)
-
-  # Create a random bounding box.
-  # Use tf.random_uniform and not numpy.random.rand as doing the former would
-  # generate random numbers at graph eval time, unlike the latter which
-  # generates random numbers at graph definition time.
-  total_crop_height = (height - crop_height)
-  crop_top = tf.random_uniform([], maxval=total_crop_height + 1, dtype=tf.int32)
-  total_crop_width = (width - crop_width)
-  crop_left = tf.random_uniform([], maxval=total_crop_width + 1, dtype=tf.int32)
-
-  cropped = tf.slice(
-      image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
-
+  # A large fraction of image datasets contain a human-annotated bounding box
+  # delineating the region of the image containing the object of interest.  We
+  # choose to create a new bounding box for the object which is a randomly
+  # distorted version of the human-annotated bounding box that obeys an
+  # allowed range of aspect ratios, sizes and overlap with the human-annotated
+  # bounding box. If no box is supplied, then we assume the bounding box is
+  # the entire image.
+  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+      tf.image.extract_jpeg_shape(image_buffer),
+      bounding_boxes=bbox,
+      min_object_covered=0.1,
+      aspect_ratio_range=[0.75, 1.33],
+      area_range=[0.05, 1.0],
+      max_attempts=100,
+      use_image_if_no_bounding_boxes=True)
+  bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+
+  # Reassemble the bounding box in the format the crop op requires.
+  offset_y, offset_x, _ = tf.unstack(bbox_begin)
+  target_height, target_width, _ = tf.unstack(bbox_size)
+  crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+
+  # Use the fused decode and crop op here, which is faster than each in series.
+  cropped = tf.image.decode_and_crop_jpeg(
+      image_buffer, crop_window, channels=num_channels)
+
+  # Flip to add a little more random distortion in.
  cropped = tf.image.random_flip_left_right(cropped)
  return cropped

@@ -90,17 +108,18 @@ def _central_crop(image, crop_height, crop_width):
  Returns:
    3-D tensor with cropped image.
  """
-  height, width = _get_h_w(image)
+  shape = tf.shape(image)
+  height, width = shape[0], shape[1]

-  total_crop_height = (height - crop_height)
-  crop_top = total_crop_height // 2
-  total_crop_width = (width - crop_width)
-  crop_left = total_crop_width // 2
+  amount_to_be_cropped_h = (height - crop_height)
+  crop_top = amount_to_be_cropped_h // 2
+  amount_to_be_cropped_w = (width - crop_width)
+  crop_left = amount_to_be_cropped_w // 2
  return tf.slice(
      image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])


-def _mean_image_subtraction(image, means):
+def _mean_image_subtraction(image, means, num_channels):
  """Subtracts the given means from each image channel.

  For example:
@@ -112,6 +131,7 @@ def _mean_image_subtraction(image, means):
  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.
+    num_channels: number of color channels in the image that will be distorted.

  Returns:
    the centered image.
@@ -123,7 +143,7 @@ def _mean_image_subtraction(image, means):
  """
  if image.get_shape().ndims != 3:
    raise ValueError('Input must be of size [height, width, C>0]')
-  num_channels = image.get_shape().as_list()[-1]
+
  if len(means) != num_channels:
    raise ValueError('len(means) must match the number of channels')

@@ -133,7 +153,7 @@ def _mean_image_subtraction(image, means):
  return image - means


-def _smallest_size_at_least(height, width, smallest_side):
+def _smallest_size_at_least(height, width, resize_min):
  """Computes new shape with the smallest side equal to `resize_min`.

  Computes new shape with the smallest side equal to `resize_min` while
@@ -142,84 +162,96 @@ def _smallest_size_at_least(height, width, smallest_side):
  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
-    smallest_side: A python integer or scalar `Tensor` indicating the size of
+    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
-  smallest_side = tf.cast(smallest_side, tf.float32)
+  resize_min = tf.cast(resize_min, tf.float32)

-  height = tf.cast(height, tf.float32)
-  width = tf.cast(width, tf.float32)
+  # Convert to floats to make subsequent calculations go smoothly.
+  height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)

  smaller_dim = tf.minimum(height, width)
-  scale_ratio = smallest_side / smaller_dim
+  scale_ratio = resize_min / smaller_dim
+
+  # Convert back to ints to make heights and widths that TF ops will accept.
  new_height = tf.cast(height * scale_ratio, tf.int32)
  new_width = tf.cast(width * scale_ratio, tf.int32)

  return new_height, new_width


-def _aspect_preserving_resize(image, smallest_side):
+def _aspect_preserving_resize(image, resize_min):
  """Resize images preserving the original aspect ratio.

  Args:
    image: A 3-D image `Tensor`.
-    smallest_side: A python integer or scalar `Tensor` indicating the size of
+    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
-  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
+  shape = tf.shape(image)
+  height, width = shape[0], shape[1]

-  height, width = _get_h_w(image)
-  new_height, new_width = _smallest_size_at_least(height, width, smallest_side)
+  new_height, new_width = _smallest_size_at_least(height, width, resize_min)
+
+  return _resize_image(image, new_height, new_width)

-  resized_image = tf.image.resize_images(
-      image, [new_height, new_width], method=tf.image.ResizeMethod.BILINEAR,
-      align_corners=False)
-  return resized_image

+def _resize_image(image, height, width):
+  """Simple wrapper around tf.image.resize_images to make sure we use the
+  same `method` and other details each time.
+
+  Args:
+    image: A 3-D image `Tensor`.
+    height: The target height for the resized image.
+    width: The target width for the resized image.

-def preprocess_image(image, output_height, output_width, is_training=False,
-                     resize_side_min=_RESIZE_SIDE_MIN,
-                     resize_side_max=_RESIZE_SIDE_MAX):
+  Returns:
+    resized_image: A 3-D tensor containing the resized image. The first two
+      dimensions have the shape [height, width].
+  """
+  return tf.image.resize_images(
+      image, [height, width], method=tf.image.ResizeMethod.BILINEAR,
+      align_corners=False)
+
+def preprocess_image(image_buffer, bbox, output_height, output_width,
+                     num_channels, is_training=False):
  """Preprocesses the given image.

+  Preprocessing includes decoding, cropping, and resizing for both training
+  and eval images. Training preprocessing, however, introduces some random
+  distortion of the image to improve accuracy.
+
  Args:
-    image: A `Tensor` representing an image of arbitrary size.
+    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
+    num_channels: Integer depth of the image buffer for decoding.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.
-    resize_side_min: The lower bound for the smallest side of the image for
-      aspect-preserving resizing. If `is_training` is `False`, then this value
-      is used for rescaling.
-    resize_side_max: The upper bound for the smallest side of the image for
-      aspect-preserving resizing. If `is_training` is `False`, this value is
-      ignored. Otherwise, the resize side is sampled from
-        [resize_size_min, resize_size_max].

  Returns:
    A preprocessed image.
  """
  if is_training:
    # For training, we want to randomize some of the distortions.
-    resize_side = tf.random_uniform(
-        [], minval=resize_side_min, maxval=resize_side_max + 1, dtype=tf.int32)
-    crop_fn = _random_crop_and_flip
+    image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
+    image = _resize_image(image, output_height, output_width)
  else:
-    resize_side = resize_side_min
-    crop_fn = _central_crop
-
-  num_channels = image.get_shape().as_list()[-1]
-  image = _aspect_preserving_resize(image, resize_side)
-  image = crop_fn(image, output_height, output_width)
+    # For validation, we want to decode, resize, then just crop the middle.
+    image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
+    image = _aspect_preserving_resize(image, _RESIZE_MIN)
+    image = _central_crop(image, output_height, output_width)

  image.set_shape([output_height, output_width, num_channels])

-  image = tf.cast(image, tf.float32)
-  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
+  return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)