vgg_preprocessing.py 7.34 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images.

The preprocessing steps for VGG were introduced in the following technical
report:

  Very Deep Convolutional Networks For Large-Scale Image Recognition
  Karen Simonyan and Andrew Zisserman
  arXiv technical report, 2015
  PDF: http://arxiv.org/pdf/1409.1556.pdf
  ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
  CC-BY-4.0

More information can be obtained from the VGG website:
www.robots.ox.ac.uk/~vgg/research/very_deep/
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

Karmel Allison's avatar
Karmel Allison committed
37
38
39
# Per-channel means, in R, G, B order, that `preprocess_image` subtracts from
# every image as its final step.
# NOTE(review): presumably the training-set channel means used by VGG --
# confirm against the reference linked in the module docstring.
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
40
41
42
43
44

# Default values for the `resize_side_min` / `resize_side_max` arguments of
# `preprocess_image`: the bounds on the length that the image's smaller side
# is resized to before cropping.
_RESIZE_SIDE_MIN = 256
_RESIZE_SIDE_MAX = 512


45
46
def _get_h_w(image):
  """Returns the dynamic height and width of an image tensor.

  Args:
    image: a tensor whose first two dimensions are height and width.

  Returns:
    A `(height, width)` pair of scalar tensors read from `tf.shape(image)`.
  """
  dims = tf.shape(image)
  height, width = dims[0], dims[1]
  return height, width
50
51


52
53
def _random_crop_and_flip(image, crop_height, crop_width):
  """Takes a random crop of the image, then randomly mirrors it left-to-right.

  Args:
    image: a 3-D image tensor.
    crop_height: the height of the crop.
    crop_width: the width of the crop.

  Returns:
    3-D tensor holding the cropped, and possibly flipped, image.
  """
  height, width = _get_h_w(image)

  # The crop offsets are drawn with tf.random_uniform rather than
  # numpy.random so that a fresh offset is sampled every time the graph is
  # evaluated instead of once when the graph is defined.
  #
  # `maxval` is exclusive for integer dtypes, hence the `+ 1`: the largest
  # offset that still keeps the crop inside the image must remain reachable.
  max_top = height - crop_height
  offset_top = tf.random_uniform([], maxval=max_top + 1, dtype=tf.int32)
  max_left = width - crop_width
  offset_left = tf.random_uniform([], maxval=max_left + 1, dtype=tf.int32)

  # A size of -1 on the last axis keeps every channel.
  begin = [offset_top, offset_left, 0]
  size = [crop_height, crop_width, -1]
  window = tf.slice(image, begin, size)

  return tf.image.random_flip_left_right(window)
81

82
def _central_crop(image, crop_height, crop_width):
  """Extracts the centered `crop_height` x `crop_width` window of an image.

  Args:
    image: a 3-D image tensor.
    crop_height: the height of the image following the crop.
    crop_width: the width of the image following the crop.

  Returns:
    3-D tensor with cropped image.
  """
  height, width = _get_h_w(image)

  # Split the surplus rows and columns evenly around the window. Floor
  # division means an odd surplus leaves the extra pixel on the bottom/right.
  offset_top = (height - crop_height) // 2
  offset_left = (width - crop_width) // 2

  # A size of -1 on the last axis keeps every channel.
  begin = [offset_top, offset_left, 0]
  size = [crop_height, crop_width, -1]
  return tf.slice(image, begin, size)
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129


def _mean_image_subtraction(image, means):
  """Centers an image by subtracting a per-channel mean.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means)

  The static rank of `image` must be known when this is called.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.

  Returns:
    the centered image.

  Raises:
    ValueError: If the rank of `image` is unknown, if `image` has a rank other
      than three or if the number of channels in `image` doesn't match the
      number of values in `means`.
  """
  static_shape = image.get_shape()
  # An unknown rank reports `ndims` as None, which also fails this check.
  if static_shape.ndims != 3:
    raise ValueError('Input must be of size [height, width, C>0]')
  if static_shape.as_list()[-1] != len(means):
    raise ValueError('len(means) must match the number of channels')

  # Lift the 1-D means to shape [1, 1, C] so that the subtraction broadcasts
  # over the height and width axes.
  means_3d = tf.expand_dims(means, 0)
  means_3d = tf.expand_dims(means_3d, 0)

  return image - means_3d
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


def _smallest_size_at_least(height, width, smallest_side):
  """Computes new shape with the smallest side equal to `smallest_side`.

  Computes new shape with the smallest side equal to `smallest_side` while
  preserving the original aspect ratio.

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
  smallest_side = tf.cast(smallest_side, tf.float32)

  height = tf.cast(height, tf.float32)
  width = tf.cast(width, tf.float32)

  smaller_dim = tf.minimum(height, width)
  scale_ratio = smallest_side / smaller_dim

  # Round to the nearest integer before casting. A bare cast truncates, and
  # float32 rounding error in `dim * (smallest_side / dim)` can land just below
  # the target (e.g. 245 * (256 / 245) evaluates to 255.99998), which would
  # make the smaller side one pixel shorter than `smallest_side` and can leave
  # the image too small for a subsequent crop of that size.
  new_height = tf.cast(tf.round(height * scale_ratio), tf.int32)
  new_width = tf.cast(tf.round(width * scale_ratio), tf.int32)

  return new_height, new_width


def _aspect_preserving_resize(image, smallest_side):
  """Rescales an image so its smaller side matches `smallest_side`.

  The aspect ratio of the input is kept; the target size comes from
  `_smallest_size_at_least` and the resampling is bilinear.

  Args:
    image: A 3-D image `Tensor`.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  current_height, current_width = _get_h_w(image)
  target_height, target_width = _smallest_size_at_least(
      current_height, current_width, smallest_side)
  target_size = [target_height, target_width]

  return tf.image.resize_images(
      image,
      target_size,
      method=tf.image.ResizeMethod.BILINEAR,
      align_corners=False)
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209


def preprocess_image(image, output_height, output_width, is_training=False,
                     resize_side_min=_RESIZE_SIDE_MIN,
                     resize_side_max=_RESIZE_SIDE_MAX):
  """Resizes, crops and mean-centers an image.

  The image is rescaled with its aspect ratio preserved, cropped to
  `output_height` x `output_width`, cast to float32 and finally has the
  per-channel R, G, B means subtracted. Training uses a random resize side and
  a random crop with a random horizontal flip; evaluation uses a fixed resize
  side and a central crop.

  Args:
    image: A `Tensor` representing an image of arbitrary size. It must be 3-D
      with three channels, because three channel means are subtracted.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.
    resize_side_min: The lower bound for the smallest side of the image for
      aspect-preserving resizing. If `is_training` is `False`, then this value
      is used for rescaling.
    resize_side_max: The upper bound for the smallest side of the image for
      aspect-preserving resizing. If `is_training` is `False`, this value is
      ignored. Otherwise, the resize side is sampled from
        [resize_side_min, resize_side_max].

  Returns:
    A preprocessed image.
  """
  if not is_training:
    # Evaluation is deterministic: fixed scale, centered window.
    resize_side = resize_side_min
    crop_fn = _central_crop
  else:
    # For training, we want to randomize some of the distortions. `maxval` is
    # exclusive, so `+ 1` makes `resize_side_max` itself a possible draw.
    resize_side = tf.random_uniform(
        [],
        minval=resize_side_min,
        maxval=resize_side_max + 1,
        dtype=tf.int32)
    crop_fn = _random_crop_and_flip

  # Remember the static channel count now so that it can be restored on the
  # result together with the output height and width.
  static_channels = image.get_shape().as_list()[-1]

  resized = _aspect_preserving_resize(image, resize_side)
  cropped = crop_fn(resized, output_height, output_width)
  cropped.set_shape([output_height, output_width, static_channels])

  as_float = tf.cast(cropped, tf.float32)
  channel_means = [_R_MEAN, _G_MEAN, _B_MEAN]
  return _mean_image_subtraction(as_float, channel_means)