resnet.py

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains definitions for the post-activation form of Residual Networks.

Residual networks (ResNets) were proposed in:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from official.vision.detection.modeling.architecture import nn_ops


# TODO(b/140112644): Refactor the code with Keras style, i.e. build and call.
class Resnet(object):
  """Class to build ResNet family model."""

  def __init__(
      self,
      resnet_depth,
      activation='relu',
      norm_activation=nn_ops.norm_activation_builder(activation='relu'),
      data_format='channels_last'):
    """ResNet initialization function.

    Args:
      resnet_depth: `int` depth of ResNet backbone model.
      norm_activation: an operation that includes a normalization layer followed
        by an optional activation layer.
      data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.
    """
    self._resnet_depth = resnet_depth
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._norm_activation = norm_activation
    self._data_format = data_format

    model_params = {
        10: {
            'block': self.residual_block,
            'layers': [1, 1, 1, 1]
        },
        18: {
            'block': self.residual_block,
            'layers': [2, 2, 2, 2]
        },
        34: {
            'block': self.residual_block,
            'layers': [3, 4, 6, 3]
        },
        50: {
            'block': self.bottleneck_block,
            'layers': [3, 4, 6, 3]
        },
        101: {
            'block': self.bottleneck_block,
            'layers': [3, 4, 23, 3]
        },
        152: {
            'block': self.bottleneck_block,
            'layers': [3, 8, 36, 3]
        },
        200: {
            'block': self.bottleneck_block,
            'layers': [3, 24, 36, 3]
        }
    }

    if resnet_depth not in model_params:
      valid_resnet_depths = ', '.join(
          [str(depth) for depth in sorted(model_params.keys())])
      raise ValueError(
          'The resnet_depth should be in [%s]. Not a valid resnet_depth:' %
          (valid_resnet_depths), self._resnet_depth)
    params = model_params[resnet_depth]
    self._resnet_fn = self.resnet_v1_generator(params['block'],
                                               params['layers'])

  def __call__(self, inputs, is_training=None):
    """Returns the ResNet model for a given size and number of output classes.

    Args:
      inputs: a `Tesnor` with shape [batch_size, height, width, 3] representing
        a batch of images.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      a `dict` containing `int` keys for continuous feature levels [2, 3, 4, 5].
      The values are corresponding feature hierarchy in ResNet with shape
      [batch_size, height_l, width_l, num_filters].
    """
    with tf.name_scope('resnet%s' % self._resnet_depth):
      return self._resnet_fn(inputs, is_training)

  def fixed_padding(self, inputs, kernel_size):
    """Pads the input along the spatial dimensions independently of input size.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]` or `[batch,
        height, width, channels]` depending on `data_format`.
      kernel_size: `int` kernel size to be used for `conv2d` or max_pool2d`
        operations. Should be a positive integer.

    Returns:
      A padded `Tensor` of the same `data_format` with size either intact
      (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
    """
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    if self._data_format == 'channels_first':
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
    else:
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])

    return padded_inputs

  def conv2d_fixed_padding(self, inputs, filters, kernel_size, strides):
    """Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    Args:
      inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.

    Returns:
      A `Tensor` of shape `[batch, filters, height_out, width_out]`.
    """
    if strides > 1:
      inputs = self.fixed_padding(inputs, kernel_size)

    return tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=('SAME' if strides == 1 else 'VALID'),
        use_bias=False,
        kernel_initializer=tf.initializers.VarianceScaling(),
        data_format=self._data_format)(
            inputs=inputs)

  def residual_block(self,
                     inputs,
                     filters,
                     strides,
                     use_projection=False,
                     is_training=None):
    """Standard building block for residual networks with BN after convolutions.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first two convolutions. Note that
        the third and final convolution will use 4 times as many filters.
      strides: `int` block stride. If greater than 1, this block will ultimately
        downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually `True`
        for the first block of a block group, which may change the number of
        filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut in first layer to match filters and strides
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=1)
    inputs = self._norm_activation(
        use_activation=False, init_zero=True)(
            inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def bottleneck_block(self,
                       inputs,
                       filters,
                       strides,
                       use_projection=False,
                       is_training=None):
    """Bottleneck block variant for residual networks with BN after convolutions.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first two convolutions. Note that
        the third and final convolution will use 4 times as many filters.
      strides: `int` block stride. If greater than 1, this block will ultimately
        downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually `True`
        for the first block of a block group, which may change the number of
        filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut only in first block within a group. Bottleneck
      # blocks end with 4 times the number of filters.
      filters_out = 4 * filters
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=1, strides=1)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
    inputs = self._norm_activation(
        use_activation=False, init_zero=True)(
            inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def block_group(self, inputs, filters, block_fn, blocks, strides, name,
                  is_training):
    """Creates one group of blocks for the ResNet model.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first convolution of the layer.
      block_fn: `function` for the block to use within the model
      blocks: `int` number of blocks contained in the layer.
      strides: `int` stride to use for the first convolution of the layer. If
        greater than 1, this layer will downsample the input.
      name: `str`name for the Tensor output of the block layer.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block layer.
    """
    # Only the first block per block_group uses projection shortcut and strides.
    inputs = block_fn(
        inputs, filters, strides, use_projection=True, is_training=is_training)

    for _ in range(1, blocks):
      inputs = block_fn(inputs, filters, 1, is_training=is_training)

    return tf.identity(inputs, name)

  def resnet_v1_generator(self, block_fn, layers):
    """Generator for ResNet v1 models.

    Args:
      block_fn: `function` for the block to use within the model. Either
        `residual_block` or `bottleneck_block`.
      layers: list of 4 `int`s denoting the number of blocks to include in each
        of the 4 block groups. Each group consists of blocks that take inputs of
        the same resolution.

    Returns:
      Model `function` that takes in `inputs` and `is_training` and returns the
      output `Tensor` of the ResNet model.
    """

    def model(inputs, is_training=None):
      """Creation of the model graph."""
      inputs = self.conv2d_fixed_padding(
          inputs=inputs, filters=64, kernel_size=7, strides=2)
      inputs = tf.identity(inputs, 'initial_conv')
      inputs = self._norm_activation()(inputs, is_training=is_training)

      inputs = tf.keras.layers.MaxPool2D(
          pool_size=3, strides=2, padding='SAME',
          data_format=self._data_format)(
              inputs)
      inputs = tf.identity(inputs, 'initial_max_pool')

      c2 = self.block_group(
          inputs=inputs,
          filters=64,
          block_fn=block_fn,
          blocks=layers[0],
          strides=1,
          name='block_group1',
          is_training=is_training)
      c3 = self.block_group(
          inputs=c2,
          filters=128,
          block_fn=block_fn,
          blocks=layers[1],
          strides=2,
          name='block_group2',
          is_training=is_training)
      c4 = self.block_group(
          inputs=c3,
          filters=256,
          block_fn=block_fn,
          blocks=layers[2],
          strides=2,
          name='block_group3',
          is_training=is_training)
      c5 = self.block_group(
          inputs=c4,
          filters=512,
          block_fn=block_fn,
          blocks=layers[3],
          strides=2,
          name='block_group4',
          is_training=is_training)
      return {2: c2, 3: c3, 4: c4, 5: c5}

    return model