Performance tuning image preprocessing (#3383)

* Refining preprocessing, part 1
* Refinements to preprocessing resulting from multi-GPU tests
* Reviving one-hot labels
* Reviving one-hot labels
* Fixing label shapes
* Adding random flip back in
* Reverting unnecessary linting of test file
* Respond to CR
* Respond to CR
* Respond to CR

Performance tuning image preprocessing (#3383)
* Refining preprocessing, part 1
* Refinements to preprocessing resulting from multi-GPU tests
* Reviving one-hot labels
* Reviving one-hot labels
* Fixing label shapes
* Adding random flip back in
* Reverting unnecessary linting of test file
* Respond to CR
* Respond to CR
* Respond to CR
9a30bb66 · Karmel Allison · GitHub · 4c054148 · 9a30bb66 · 9a30bb66
Unverified Commit 9a30bb66 authored Feb 16, 2018 by Karmel Allison Committed by GitHub Feb 16, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 110 additions and 228 deletions

official/resnet/imagenet_main.py official/resnet/imagenet_main.py +51 -31

official/resnet/vgg_preprocessing.py official/resnet/vgg_preprocessing.py +59 -197

No files found.
--- a/official/resnet/imagenet_main.py
+++ b/official/resnet/imagenet_main.py
@@ -54,37 +54,58 @@ def get_filenames(is_training, data_dir):
        for i in range(128)]
-def parse_record(raw_record, is_training):
+def _parse_example_proto(example_serialized):
-  """Parse an ImageNet record from `value`."""
+  """Parses an Example proto containing a training example of an image.
-  keys_to_features = {
-      'image/encoded':
+  The dataset contains serialized Example protocol buffers.
-          tf.FixedLenFeature((), tf.string, default_value=''),
+  The Example proto is expected to contain features named
-      'image/format':
+  image/encoded (a JPEG-encoded string) and image/class/label (int).
-          tf.FixedLenFeature((), tf.string, default_value='jpeg'),
-      'image/class/label':
+  Args:
-          tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
+    example_serialized: scalar Tensor tf.string containing a serialized
-      'image/class/text':
+      Example protocol buffer.
-          tf.FixedLenFeature([], dtype=tf.string, default_value=''),
-      'image/object/bbox/xmin':
+  Returns:
-          tf.VarLenFeature(dtype=tf.float32),
+    image_buffer: Tensor tf.string containing the contents of a JPEG file.
-      'image/object/bbox/ymin':
+    label: Tensor tf.int64 containing the label.
-          tf.VarLenFeature(dtype=tf.float32),
+  """
-      'image/object/bbox/xmax':
+  # Dense features in Example proto.
-          tf.VarLenFeature(dtype=tf.float32),
+  feature_map = {
-      'image/object/bbox/ymax':
+      'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
-          tf.VarLenFeature(dtype=tf.float32),
+                                          default_value=''),
-      'image/object/class/label':
+      'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
-          tf.VarLenFeature(dtype=tf.int64),
+                                              default_value=-1)
  }
-  parsed = tf.parse_single_example(raw_record, keys_to_features)
+  features = tf.parse_single_example(example_serialized, feature_map)
+  return features['image/encoded'], features['image/class/label']
-  image = tf.image.decode_image(
-      tf.reshape(parsed['image/encoded'], shape=[]),
-      _NUM_CHANNELS)
-  # Note that tf.image.convert_image_dtype scales the image data to [0, 1).
+def parse_record(raw_record, is_training):
-  image = tf.image.convert_image_dtype(image, dtype=tf.float32)
+  """Parses a record containing a training example of an image.
+  The input record is parsed into a label and image, and the image is passed
+  through preprocessing steps (cropping, flipping, and so on).
+  Args:
+    raw_record: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+    is_training: A boolean denoting whether the input is for training.
+  Returns:
+    Tuple with processed image tensor and one-hot-encoded label tensor.
+"""
+  image, label = _parse_example_proto(raw_record)
+  # Decode the string as an RGB JPEG.
+  # Note that the resulting image contains an unknown height and width
+  # that is set dynamically by decode_jpeg. In other words, the height
+  # and width of the image are unknown at compile-time.
+  # Results in a 3-D uint8 Tensor which we then convert to a float
+  # with values ranging from [0, 1).
+  image = tf.image.decode_jpeg(image, channels=_NUM_CHANNELS)
+  image = tf.image.convert_image_dtype(image, tf.float32)
  image = vgg_preprocessing.preprocess_image(
      image=image,
@@ -92,11 +113,10 @@ def parse_record(raw_record, is_training):
      output_width=_DEFAULT_IMAGE_SIZE,
      is_training=is_training)
-  label = tf.cast(
+  label = tf.cast(tf.reshape(label, shape=[]), dtype=tf.int32)
-      tf.reshape(parsed['image/class/label'], shape=[]),
+  label = tf.one_hot(label, _NUM_CLASSES)
-      dtype=tf.int32)
-  return image, tf.one_hot(label, _NUM_CLASSES)
+  return image, label
 def input_fn(is_training, data_dir, batch_size, num_epochs=1,

--- a/official/resnet/vgg_preprocessing.py
+++ b/official/resnet/vgg_preprocessing.py
@@ -42,155 +42,62 @@ _RESIZE_SIDE_MIN = 256
 _RESIZE_SIDE_MAX = 512
-def _crop(image, offset_height, offset_width, crop_height, crop_width):
+def _get_h_w(image):
-  """Crops the given image using the provided offsets and sizes.
+  """Convenience for grabbing the height and width of an image.
-  Note that the method doesn't assume we know the input image size but it does
-  assume we know the input image rank.
-  Args:
-    image: an image of shape [height, width, channels].
-    offset_height: a scalar tensor indicating the height offset.
-    offset_width: a scalar tensor indicating the width offset.
-    crop_height: the height of the cropped image.
-    crop_width: the width of the cropped image.
-  Returns:
-    the cropped (and resized) image.
-  Raises:
-    InvalidArgumentError: if the rank is not 3 or if the image dimensions are
-      less than the crop size.
  """
-  original_shape = tf.shape(image)
+  shape = tf.shape(image)
+  return shape[0], shape[1]
-  rank_assertion = tf.Assert(
-      tf.equal(tf.rank(image), 3),
-      ['Rank of image must be equal to 3.'])
-  with tf.control_dependencies([rank_assertion]):
-    cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])
-  size_assertion = tf.Assert(
-      tf.logical_and(
-          tf.greater_equal(original_shape[0], crop_height),
-          tf.greater_equal(original_shape[1], crop_width)),
-      ['Crop size greater than the image size.'])
-  offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0]))
-  # Use tf.slice instead of crop_to_bounding box as it accepts tensors to
-  # define the crop size.
-  with tf.control_dependencies([size_assertion]):
-    image = tf.slice(image, offsets, cropped_shape)
-  return tf.reshape(image, cropped_shape)
-def _random_crop(image_list, crop_height, crop_width):
-  """Crops the given list of images.
-  The function applies the same crop to each image in the list. This can be
-  effectively applied when there are multiple image inputs of the same
-  dimension such as:
-    image, depths, normals = _random_crop([image, depths, normals], 120, 150)
+def _random_crop_and_flip(image, crop_height, crop_width):
+  """Crops the given image to a random part of the image, and randomly flips.
  Args:
-    image_list: a list of image tensors of the same dimension but possibly
+    image: a 3-D image tensor
-      varying channel.
    crop_height: the new height.
    crop_width: the new width.
  Returns:
-    the image_list with cropped images.
+    3-D tensor with cropped image.
-  Raises:
-    ValueError: if there are multiple image inputs provided with different size
-      or the images are smaller than the crop dimensions.
  """
-  if not image_list:
+  height, width = _get_h_w(image)
-    raise ValueError('Empty image_list.')
-  # Compute the rank assertions.
-  rank_assertions = []
-  for i in range(len(image_list)):
-    image_rank = tf.rank(image_list[i])
-    rank_assert = tf.Assert(
-        tf.equal(image_rank, 3),
-        ['Wrong rank for tensor  %s [expected] [actual]',
-         image_list[i].name, 3, image_rank])
-    rank_assertions.append(rank_assert)
-  with tf.control_dependencies([rank_assertions[0]]):
-    image_shape = tf.shape(image_list[0])
-  image_height = image_shape[0]
-  image_width = image_shape[1]
-  crop_size_assert = tf.Assert(
-      tf.logical_and(
-          tf.greater_equal(image_height, crop_height),
-          tf.greater_equal(image_width, crop_width)),
-      ['Crop size greater than the image size.'])
-  asserts = [rank_assertions[0], crop_size_assert]
-  for i in range(1, len(image_list)):
-    image = image_list[i]
-    asserts.append(rank_assertions[i])
-    with tf.control_dependencies([rank_assertions[i]]):
-      shape = tf.shape(image)
-    height = shape[0]
-    width = shape[1]
-    height_assert = tf.Assert(
-        tf.equal(height, image_height),
-        ['Wrong height for tensor %s [expected][actual]',
-         image.name, height, image_height])
-    width_assert = tf.Assert(
-        tf.equal(width, image_width),
-        ['Wrong width for tensor %s [expected][actual]',
-         image.name, width, image_width])
-    asserts.extend([height_assert, width_assert])
  # Create a random bounding box.
  #
  # Use tf.random_uniform and not numpy.random.rand as doing the former would
  # generate random numbers at graph eval time, unlike the latter which
  # generates random numbers at graph definition time.
-  with tf.control_dependencies(asserts):
+  total_crop_height = (height - crop_height)
-    max_offset_height = tf.reshape(image_height - crop_height + 1, [])
+  crop_top = tf.random_uniform([], maxval=total_crop_height + 1, dtype=tf.int32)
-  with tf.control_dependencies(asserts):
+  total_crop_width = (width - crop_width)
-    max_offset_width = tf.reshape(image_width - crop_width + 1, [])
+  crop_left = tf.random_uniform([], maxval=total_crop_width + 1, dtype=tf.int32)
-  offset_height = tf.random_uniform(
-      [], maxval=max_offset_height, dtype=tf.int32)
-  offset_width = tf.random_uniform(
-      [], maxval=max_offset_width, dtype=tf.int32)
-  return [_crop(image, offset_height, offset_width,
+  cropped = tf.slice(
-                crop_height, crop_width) for image in image_list]
+      image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
+  cropped = tf.image.random_flip_left_right(cropped)
+  return cropped
-def _central_crop(image_list, crop_height, crop_width):
+def _central_crop(image, crop_height, crop_width):
  """Performs central crops of the given image list.
  Args:
-    image_list: a list of image tensors of the same dimension but possibly
+    image: a 3-D image tensor
-      varying channel.
    crop_height: the height of the image following the crop.
    crop_width: the width of the image following the crop.
  Returns:
-    the list of cropped images.
+    3-D tensor with cropped image.
  """
-  outputs = []
+  height, width = _get_h_w(image)
-  for image in image_list:
-    image_height = tf.shape(image)[0]
-    image_width = tf.shape(image)[1]
-    offset_height = (image_height - crop_height) / 2
+  total_crop_height = (height - crop_height)
-    offset_width = (image_width - crop_width) / 2
+  crop_top = total_crop_height // 2
+  total_crop_width = (width - crop_width)
-    outputs.append(_crop(image, offset_height, offset_width,
+  crop_left = total_crop_width // 2
-                         crop_height, crop_width))
+  return tf.slice(
-  return outputs
+      image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
 def _mean_image_subtraction(image, means):
@@ -220,10 +127,10 @@ def _mean_image_subtraction(image, means):
  if len(means) != num_channels:
    raise ValueError('len(means) must match the number of channels')
-  channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
+  # We have a 1-D tensor of means; convert to 3-D.
-  for i in range(num_channels):
+  means = tf.expand_dims(tf.expand_dims(means, 0), 0)
-    channels[i] -= means[i]
-  return tf.concat(axis=2, values=channels)
+  return image - means
 def _smallest_size_at_least(height, width, smallest_side):
@@ -242,17 +149,16 @@ def _smallest_size_at_least(height, width, smallest_side):
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
-  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
+  smallest_side = tf.cast(smallest_side, tf.float32)
+  height = tf.cast(height, tf.float32)
+  width = tf.cast(width, tf.float32)
-  height = tf.to_float(height)
+  smaller_dim = tf.minimum(height, width)
-  width = tf.to_float(width)
+  scale_ratio = smallest_side / smaller_dim
-  smallest_side = tf.to_float(smallest_side)
+  new_height = tf.cast(height * scale_ratio, tf.int32)
+  new_width = tf.cast(width * scale_ratio, tf.int32)
-  scale = tf.cond(tf.greater(height, width),
-                  lambda: smallest_side / width,
-                  lambda: smallest_side / height)
-  new_height = tf.to_int32(height * scale)
-  new_width = tf.to_int32(width * scale)
  return new_height, new_width
@@ -269,68 +175,13 @@ def _aspect_preserving_resize(image, smallest_side):
  """
  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
-  shape = tf.shape(image)
+  height, width = _get_h_w(image)
-  height = shape[0]
-  width = shape[1]
  new_height, new_width = _smallest_size_at_least(height, width, smallest_side)
-  image = tf.expand_dims(image, 0)
-  resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
-                                           align_corners=False)
-  resized_image = tf.squeeze(resized_image)
-  resized_image.set_shape([None, None, 3])
-  return resized_image
-def preprocess_for_train(image,
+  resized_image = tf.image.resize_images(
-                         output_height,
+      image, [new_height, new_width], method=tf.image.ResizeMethod.BILINEAR,
-                         output_width,
+      align_corners=False)
-                         resize_side_min=_RESIZE_SIDE_MIN,
+  return resized_image
-                         resize_side_max=_RESIZE_SIDE_MAX):
-  """Preprocesses the given image for training.
-  Note that the actual resizing scale is sampled from
-    [`resize_size_min`, `resize_size_max`].
-  Args:
-    image: A `Tensor` representing an image of arbitrary size.
-    output_height: The height of the image after preprocessing.
-    output_width: The width of the image after preprocessing.
-    resize_side_min: The lower bound for the smallest side of the image for
-      aspect-preserving resizing.
-    resize_side_max: The upper bound for the smallest side of the image for
-      aspect-preserving resizing.
-  Returns:
-    A preprocessed image.
-  """
-  resize_side = tf.random_uniform(
-      [], minval=resize_side_min, maxval=resize_side_max+1, dtype=tf.int32)
-  image = _aspect_preserving_resize(image, resize_side)
-  image = _random_crop([image], output_height, output_width)[0]
-  image.set_shape([output_height, output_width, 3])
-  image = tf.to_float(image)
-  image = tf.image.random_flip_left_right(image)
-  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
-def preprocess_for_eval(image, output_height, output_width, resize_side):
-  """Preprocesses the given image for evaluation.
-  Args:
-    image: A `Tensor` representing an image of arbitrary size.
-    output_height: The height of the image after preprocessing.
-    output_width: The width of the image after preprocessing.
-    resize_side: The smallest side of the image for aspect-preserving resizing.
-  Returns:
-    A preprocessed image.
-  """
-  image = _aspect_preserving_resize(image, resize_side)
-  image = _central_crop([image], output_height, output_width)[0]
-  image.set_shape([output_height, output_width, 3])
-  image = tf.to_float(image)
-  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
 def preprocess_image(image, output_height, output_width, is_training=False,
@@ -356,8 +207,19 @@ def preprocess_image(image, output_height, output_width, is_training=False,
    A preprocessed image.
  """
  if is_training:
-    return preprocess_for_train(image, output_height, output_width,
+    # For training, we want to randomize some of the distortions.
-                                resize_side_min, resize_side_max)
+    resize_side = tf.random_uniform(
+        [], minval=resize_side_min, maxval=resize_side_max + 1, dtype=tf.int32)
+    crop_fn = _random_crop_and_flip
  else:
-    return preprocess_for_eval(image, output_height, output_width,
+    resize_side = resize_side_min
-                               resize_side_min)
+    crop_fn = _central_crop
+  num_channels = image.get_shape().as_list()[-1]
+  image = _aspect_preserving_resize(image, resize_side)
+  image = crop_fn(image, output_height, output_width)
+  image.set_shape([output_height, output_width, num_channels])
+  image = tf.cast(image, tf.float32)
+  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])