resnet_model.py 17.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains definitions for Residual Networks.

Residual networks ('v1' ResNets) were originally proposed in:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385

The full preactivation 'v2' ResNet variant was introduced by:
[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Identity Mappings in Deep Residual Networks. arXiv:1603.05027

The key difference of the full preactivation 'v2' variant compared to the
'v1' variant in [1] is the use of batch normalization before every weight layer
rather than after.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

Karmel Allison's avatar
Karmel Allison committed
34

35
36
import tensorflow as tf

Taylor Robie's avatar
Taylor Robie committed
37

38
39
# Passed as `momentum` to tf.layers.batch_normalization by batch_norm().
_BATCH_NORM_DECAY = 0.997
# Passed as `epsilon` to tf.layers.batch_normalization by batch_norm().
_BATCH_NORM_EPSILON = 1e-5
# Default for the `version` argument of Model.__init__; valid values are 1, 2.
DEFAULT_VERSION = 2

42

Karmel Allison's avatar
Karmel Allison committed
43
################################################################################
# Convenience functions for building the ResNet model.
################################################################################
46
47
def batch_norm(inputs, training, data_format):
  """Performs a batch normalization using a standard set of parameters.

  Args:
    inputs: The tensor to normalize.
    training: Forwarded as the `training` argument of
      `tf.layers.batch_normalization`.
    data_format: The input format ('channels_last' or 'channels_first').
      'channels_first' normalizes over axis 1; any other value uses axis 3.

  Returns:
    The result of `tf.layers.batch_normalization` applied to `inputs`.
  """
  # We set fused=True for a significant performance boost. See
  # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
  return tf.layers.batch_normalization(
      inputs=inputs, axis=1 if data_format == 'channels_first' else 3,
      momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON, center=True,
      scale=True, training=training, fused=True)
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83


def fixed_padding(inputs, kernel_size, data_format):
  """Pads the spatial dimensions of `inputs` based only on the kernel size.

  The amount of padding depends on `kernel_size` alone, never on the spatial
  size of `inputs`. The batch and channel dimensions are left unpadded.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
      Should be a positive integer.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    A tensor with the same format as the input with the data either intact
    (if kernel_size == 1) or padded (if kernel_size > 1).
  """
  total = kernel_size - 1
  # When `total` is odd, the extra element goes at the end.
  before = total // 2
  after = total - before

  if data_format == 'channels_first':
    # [batch, channels, height, width]
    paddings = [[0, 0], [0, 0], [before, after], [before, after]]
  else:
    # [batch, height, width, channels]
    paddings = [[0, 0], [before, after], [before, after], [0, 0]]

  return tf.pad(inputs, paddings)


def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
  """Strided 2-D convolution with explicit padding.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolution.
    kernel_size: The kernel size of the convolution.
    strides: The stride of the convolution. When greater than 1, the input is
      explicitly padded with `fixed_padding` and the convolution uses 'VALID'
      padding; otherwise 'SAME' padding is used.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the bias-free `tf.layers.conv2d`.
  """
  # The padding is consistent and is based only on `kernel_size`, not on the
  # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
  if strides > 1:
    inputs = fixed_padding(inputs, kernel_size, data_format)

  return tf.layers.conv2d(
      inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
      padding=('SAME' if strides == 1 else 'VALID'), use_bias=False,
      kernel_initializer=tf.variance_scaling_initializer(),
      data_format=data_format)


97
98
99
100
################################################################################
# ResNet block definitions.
################################################################################
def _building_block_v1(inputs, filters, training, projection_shortcut, strides,
                       data_format):
  """A single residual block for ResNet v1, without a bottleneck.

  Convolution then batch normalization then ReLU as described by:
    Deep Residual Learning for Image Recognition
    https://arxiv.org/pdf/1512.03385.pdf
    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input). If None,
      the unmodified `inputs` are used as the shortcut.
    strides: The block's stride. If greater than 1, this block will ultimately
      downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block.
  """
  shortcut = inputs

  # In v1 the projected shortcut is batch-normalized before being added back.
  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)
    shortcut = batch_norm(inputs=shortcut, training=training,
                          data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=1,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  # The residual addition happens before the final ReLU.
  inputs += shortcut
  inputs = tf.nn.relu(inputs)

  return inputs
144
145


146
def _building_block_v2(inputs, filters, training, projection_shortcut, strides,
                       data_format):
  """A single residual block for ResNet v2, without a bottleneck.

  Batch normalization then ReLu then convolution as described by:
    Identity Mappings in Deep Residual Networks
    https://arxiv.org/pdf/1603.05027.pdf
    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input). If None,
      the unmodified `inputs` are used as the shortcut.
    strides: The block's stride. If greater than 1, this block will ultimately
      downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block.
  """
  shortcut = inputs
  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)

  # The projection shortcut should come after the first batch norm and ReLU
  # since it performs a 1x1 convolution.
  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)

  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)
  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=1,
      data_format=data_format)

  # No batch norm or ReLU after the residual addition in v2.
  return inputs + shortcut


def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
                         strides, data_format):
  """A single residual block for ResNet v1, with a bottleneck.

  Similar to _building_block_v1(), except using the "bottleneck" blocks
  described in:
    Convolution then batch normalization then ReLU as described by:
      Deep Residual Learning for Image Recognition
      https://arxiv.org/pdf/1512.03385.pdf
      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the first two convolutions. The third
      and final convolution uses 4 times as many filters.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input). If None,
      the unmodified `inputs` are used as the shortcut.
    strides: The stride of the middle 3x3 convolution. If greater than 1, this
      block will ultimately downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block.
  """
  shortcut = inputs

  # In v1 the projected shortcut is batch-normalized before being added back.
  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)
    shortcut = batch_norm(inputs=shortcut, training=training,
                          data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=1, strides=1,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  # The residual addition happens before the final ReLU.
  inputs += shortcut
  inputs = tf.nn.relu(inputs)

  return inputs


def _bottleneck_block_v2(inputs, filters, training, projection_shortcut,
                         strides, data_format):
  """A single residual block for ResNet v2, with a bottleneck.

  Similar to _building_block_v2(), except using the "bottleneck" blocks
  described in:
    Convolution then batch normalization then ReLU as described by:
      Deep Residual Learning for Image Recognition
      https://arxiv.org/pdf/1512.03385.pdf
      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  adapted to the ordering conventions of:
    Batch normalization then ReLu then convolution as described by:
      Identity Mappings in Deep Residual Networks
      https://arxiv.org/pdf/1603.05027.pdf
      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the first two convolutions. The third
      and final convolution uses 4 times as many filters.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input). If None,
      the unmodified `inputs` are used as the shortcut.
    strides: The stride of the middle 3x3 convolution. If greater than 1, this
      block will ultimately downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block.
  """
  shortcut = inputs
  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)

  # The projection shortcut should come after the first batch norm and ReLU
  # since it performs a 1x1 convolution.
  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=1, strides=1,
      data_format=data_format)

  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)
  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)

  inputs = batch_norm(inputs, training, data_format)
  inputs = tf.nn.relu(inputs)
  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
      data_format=data_format)

  # No batch norm or ReLU after the residual addition in v2.
  return inputs + shortcut


274
275
def block_layer(inputs, filters, bottleneck, block_fn, blocks, strides,
                training, name, data_format):
  """Creates one layer of blocks for the ResNet model.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the first convolution of the layer.
    bottleneck: Is the block created a bottleneck block.
    block_fn: The block to use within the model, one of the
      `_building_block_v*` or `_bottleneck_block_v*` functions. It is called
      as `block_fn(inputs, filters, training, projection_shortcut, strides,
      data_format)`.
    blocks: The number of blocks contained in the layer.
    strides: The stride to use for the first convolution of the layer. If
      greater than 1, this layer will ultimately downsample the input.
    training: Either True or False, whether we are currently training the
      model. Needed for batch norm.
    name: A string name for the tensor output of the block layer.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block layer.
  """

  # Bottleneck blocks end with 4x the number of filters as they start with
  filters_out = filters * 4 if bottleneck else filters

  def projection_shortcut(inputs):
    # 1x1 convolution that matches the shortcut to the block's output filters
    # and stride.
    return conv2d_fixed_padding(
        inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
        data_format=data_format)

  # Only the first block per block_layer uses projection_shortcut and strides
  inputs = block_fn(inputs, filters, training, projection_shortcut, strides,
                    data_format)

  # The remaining blocks use an identity shortcut and a stride of 1.
  for _ in range(1, blocks):
    inputs = block_fn(inputs, filters, training, None, 1, data_format)

  return tf.identity(inputs, name)


315
class Model(object):
  """Base class for building the Resnet Model.

  An instance stores the network configuration; calling the instance on a
  batch of images adds the corresponding operations and returns the logits.
  """

  def __init__(self, resnet_size, bottleneck, num_classes, num_filters,
               kernel_size,
               conv_stride, first_pool_size, first_pool_stride,
               second_pool_size, second_pool_stride, block_sizes, block_strides,
               final_size, version=DEFAULT_VERSION, data_format=None):
    """Creates a model for classifying an image.

    Args:
      resnet_size: A single integer for the size of the ResNet model.
      bottleneck: Use regular blocks or bottleneck blocks.
      num_classes: The number of classes used as labels.
      num_filters: The number of filters to use for the first block layer
        of the model. This number is then doubled for each subsequent block
        layer.
      kernel_size: The kernel size to use for convolution.
      conv_stride: stride size for the initial convolutional layer
      first_pool_size: Pool size to be used for the first pooling layer.
        If none, the first pooling layer is skipped.
      first_pool_stride: stride size for the first pooling layer. Not used
        if first_pool_size is None.
      second_pool_size: Pool size to be used for the second pooling layer.
      second_pool_stride: stride size for the final pooling layer
      block_sizes: A list containing n values, where n is the number of sets of
        block layers desired. Each value should be the number of blocks in the
        i-th set.
      block_strides: List of integers representing the desired stride size for
        each of the sets of block layers. Should be same length as block_sizes.
      final_size: The expected size of the model after the second pooling.
      version: Integer representing which version of the ResNet network to use.
        See README for details. Valid values: [1, 2]
      data_format: Input format ('channels_last', 'channels_first', or None).
        If set to None, the format is dependent on whether a GPU is available.

    Raises:
      ValueError: if `version` is not 1 or 2.
    """
    self.resnet_size = resnet_size

    # Without an explicit format, use channels_first when TensorFlow was
    # built with CUDA and channels_last otherwise.
    if not data_format:
      data_format = (
          'channels_first' if tf.test.is_built_with_cuda() else 'channels_last')

    self.resnet_version = version
    if version not in (1, 2):
      raise ValueError(
          "Resnet version should be 1 or 2. See README for citations.")

    # Select the block implementation from (bottleneck, version).
    self.bottleneck = bottleneck
    if bottleneck:
      if version == 1:
        self.block_fn = _bottleneck_block_v1
      else:
        self.block_fn = _bottleneck_block_v2
    else:
      if version == 1:
        self.block_fn = _building_block_v1
      else:
        self.block_fn = _building_block_v2

    self.data_format = data_format
    self.num_classes = num_classes
    self.num_filters = num_filters
    self.kernel_size = kernel_size
    self.conv_stride = conv_stride
    self.first_pool_size = first_pool_size
    self.first_pool_stride = first_pool_stride
    self.second_pool_size = second_pool_size
    self.second_pool_stride = second_pool_stride
    self.block_sizes = block_sizes
    self.block_strides = block_strides
    self.final_size = final_size

  def __call__(self, inputs, training):
    """Add operations to classify a batch of input images.

    Args:
      inputs: A Tensor representing a batch of input images.
      training: A boolean. Set to True to add operations required only when
        training the classifier.

    Returns:
      A logits Tensor with shape [<batch_size>, self.num_classes].
    """

    if self.data_format == 'channels_first':
      # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
      # This provides a large performance boost on GPU. See
      # https://www.tensorflow.org/performance/performance_guide#data_formats
      inputs = tf.transpose(inputs, [0, 3, 1, 2])

    inputs = conv2d_fixed_padding(
        inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
        strides=self.conv_stride, data_format=self.data_format)
    inputs = tf.identity(inputs, 'initial_conv')

    # The first pooling layer is optional; a falsy pool size skips it.
    if self.first_pool_size:
      inputs = tf.layers.max_pooling2d(
          inputs=inputs, pool_size=self.first_pool_size,
          strides=self.first_pool_stride, padding='SAME',
          data_format=self.data_format)
      inputs = tf.identity(inputs, 'initial_max_pool')

    # The filter count doubles with each successive block layer.
    for i, num_blocks in enumerate(self.block_sizes):
      num_filters = self.num_filters * (2**i)
      inputs = block_layer(
          inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
          block_fn=self.block_fn, blocks=num_blocks,
          strides=self.block_strides[i], training=training,
          name='block_layer{}'.format(i + 1), data_format=self.data_format)

    # NOTE(review): this batch norm + ReLU runs for both versions. The v1
    # blocks already end in batch norm + ReLU, so it looks redundant there;
    # confirm before changing, since removing it alters the variable set.
    inputs = batch_norm(inputs, training, self.data_format)
    inputs = tf.nn.relu(inputs)
    inputs = tf.layers.average_pooling2d(
        inputs=inputs, pool_size=self.second_pool_size,
        strides=self.second_pool_stride, padding='VALID',
        data_format=self.data_format)
    inputs = tf.identity(inputs, 'final_avg_pool')

    # Flatten to [batch, final_size] before the dense classification layer.
    inputs = tf.reshape(inputs, [-1, self.final_size])
    inputs = tf.layers.dense(inputs=inputs, units=self.num_classes)
    inputs = tf.identity(inputs, 'final_dense')
    return inputs
Karmel Allison's avatar
Karmel Allison committed
438