Merge branch 'master' of https://github.com/tensorflow/models into RTESuperGLUE

bb124157 · stephenwu · 2e9bb539 · 0edeb7f6 · bb124157 · bb124157
Commit bb124157 authored Mar 10, 2021 by stephenwu
20 changed files
--- a/official/utils/misc/distribution_utils.py
+++ b/official/utils/misc/distribution_utils.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Helper functions for running models in a distributed setting."""
 # pylint: disable=wildcard-import
 from official.common.distribute_utils import *
--- a/official/utils/misc/keras_utils.py
+++ b/official/utils/misc/keras_utils.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Helper functions for the Keras implementations of models."""

 import multiprocessing

--- a/official/utils/misc/model_helpers.py
+++ b/official/utils/misc/model_helpers.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Miscellaneous functions that can be called by models."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Miscellaneous functions that can be called by models."""

 import numbers


--- a/official/utils/misc/model_helpers_test.py
+++ b/official/utils/misc/model_helpers_test.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Tests for Model Helper functions."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Tests for Model Helper functions."""

 import tensorflow as tf  # pylint: disable=g-bad-import-order


--- a/official/utils/testing/__init__.py
+++ b/official/utils/testing/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/official/utils/testing/integration.py
+++ b/official/utils/testing/integration.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Helper code to run complete models from within python."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Helper code to run complete models from within python."""

 import os
 import shutil

--- a/official/utils/testing/mock_task.py
+++ b/official/utils/testing/mock_task.py
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Mock task for testing."""

 import dataclasses
@@ -89,7 +88,7 @@ class MockTask(base_task.Task):
          np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value]))
    return state

-  def reduce_aggregated_logs(self, aggregated_logs):
+  def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
    for k, v in aggregated_logs.items():
      aggregated_logs[k] = np.sum(np.stack(v, axis=0))
    return aggregated_logs

--- a/official/vision/beta/MODEL_GARDEN.md
+++ b/official/vision/beta/MODEL_GARDEN.md
@@ -65,9 +65,9 @@ ResNet-RS-350 | 320x320    | 164.3   | 84.2  | 96.9  | [config](https://github.c

 | backbone        | resolution    | epochs  | FLOPs (B)     | params (M) |  box AP |   download |
 | ------------ |:-------------:| ---------:|-----------:|--------:|---------:|-----------:|
-| SpineNet-49  | 640x640       |    500    | 85.4| 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml)|
-| SpineNet-96  | 1024x1024     |    500    | 265.4 | 43.0 | 48.5 |  [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) |
-| SpineNet-143 | 1280x1280     |    500    | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml)|
+| SpineNet-49  | 640x640       |    500    | 85.4| 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml) [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
+| SpineNet-96  | 1024x1024     |    500    | 265.4 | 43.0 | 48.5 |  [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
+| SpineNet-143 | 1280x1280     |    500    | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml) [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|


 ### Instance Segmentation Baselines

--- a/official/vision/beta/dataloaders/video_input.py
+++ b/official/vision/beta/dataloaders/video_input.py
@@ -29,7 +29,7 @@ IMAGE_KEY = 'image/encoded'
 LABEL_KEY = 'clip/label/index'


-def _process_image(image: tf.Tensor,
+def process_image(image: tf.Tensor,
                  is_training: bool = True,
                  num_frames: int = 32,
                  stride: int = 1,
@@ -112,7 +112,7 @@ def _process_image(image: tf.Tensor,
  return preprocess_ops_3d.normalize_image(image, zero_centering_image)


-def _postprocess_image(image: tf.Tensor,
+def postprocess_image(image: tf.Tensor,
                      is_training: bool = True,
                      num_frames: int = 32,
                      num_test_clips: int = 1,
@@ -147,7 +147,7 @@ def _postprocess_image(image: tf.Tensor,
  return image


-def _process_label(label: tf.Tensor,
+def process_label(label: tf.Tensor,
                  one_hot_label: bool = True,
                  num_classes: Optional[int] = None) -> tf.Tensor:
  """Processes label Tensor."""
@@ -175,15 +175,13 @@ class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
-    self._image_key = image_key
-    self._label_key = label_key
    self._context_description = {
        # One integer stored in context.
-        self._label_key: tf.io.VarLenFeature(tf.int64),
+        label_key: tf.io.VarLenFeature(tf.int64),
    }
    self._sequence_description = {
        # Each image is a string encoding JPEG.
-        self._image_key: tf.io.FixedLenSequenceFeature((), tf.string),
+        image_key: tf.io.FixedLenSequenceFeature((), tf.string),
    }

  def add_feature(self, feature_name: str,
@@ -245,7 +243,7 @@ class Parser(parser.Parser):
    """Parses data for training."""
    # Process image and label.
    image = decoded_tensors[self._image_key]
-    image = _process_image(
+    image = process_image(
        image=image,
        is_training=True,
        num_frames=self._num_frames,
@@ -261,7 +259,7 @@ class Parser(parser.Parser):
    features = {'image': image}

    label = decoded_tensors[self._label_key]
-    label = _process_label(label, self._one_hot_label, self._num_classes)
+    label = process_label(label, self._one_hot_label, self._num_classes)

    if self._output_audio:
      audio = decoded_tensors[self._audio_feature]
@@ -279,7 +277,7 @@ class Parser(parser.Parser):
  ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Parses data for evaluation."""
    image = decoded_tensors[self._image_key]
-    image = _process_image(
+    image = process_image(
        image=image,
        is_training=False,
        num_frames=self._num_frames,
@@ -292,14 +290,14 @@ class Parser(parser.Parser):
    features = {'image': image}

    label = decoded_tensors[self._label_key]
-    label = _process_label(label, self._one_hot_label, self._num_classes)
+    label = process_label(label, self._one_hot_label, self._num_classes)

    if self._output_audio:
      audio = decoded_tensors[self._audio_feature]
      audio = tf.cast(audio, dtype=self._dtype)
      audio = preprocess_ops_3d.sample_sequence(
          audio, 20, random=False, stride=1)
-      audio = tf.ensure_shape(audio, [20, 2048])
+      audio = tf.ensure_shape(audio, self._audio_shape)
      features['audio'] = audio

    return features, label
@@ -318,9 +316,9 @@ class PostBatchProcessor(object):
  def __call__(self, features: Dict[str, tf.Tensor],
               label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Parses a single tf.Example into image and label tensors."""
-    for key in ['image', 'audio']:
+    for key in ['image']:
      if key in features:
-        features[key] = _postprocess_image(
+        features[key] = postprocess_image(
            image=features[key],
            is_training=self._is_training,
            num_frames=self._num_frames,

--- a/official/vision/beta/modeling/__init__.py
+++ b/official/vision/beta/modeling/__init__.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Modeling package definition."""
+
+from official.vision.beta.modeling import backbones
+from official.vision.beta.modeling import decoders
--- a/official/vision/beta/modeling/backbones/efficientnet.py
+++ b/official/vision/beta/modeling/backbones/efficientnet.py
@@ -51,14 +51,14 @@ SCALING_MAP = {


 def round_repeats(repeats, multiplier, skip=False):
-  """Round number of filters based on depth multiplier."""
+  """Returns rounded number of repeats based on depth multiplier."""
  if skip or not multiplier:
    return repeats
  return int(math.ceil(multiplier * repeats))


 def block_spec_decoder(specs, width_scale, depth_scale):
-  """Decode specs for a block."""
+  """Decodes and returns specs for a block."""
  decoded_specs = []
  for s in specs:
    s = s + (
@@ -87,7 +87,13 @@ class BlockSpec(object):

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class EfficientNet(tf.keras.Model):
-  """Class to build EfficientNet family model."""
+  """Creates an EfficientNet family model.
+
+  This implements the EfficientNet model from:
+    Mingxing Tan, Quoc V. Le.
+    EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks.
+    (https://arxiv.org/pdf/1905.11946)
+  """

  def __init__(self,
               model_id,
@@ -102,25 +108,25 @@ class EfficientNet(tf.keras.Model):
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
-    """EfficientNet initialization function.
+    """Initializes an EfficientNet model.

    Args:
-      model_id: `str` model id of EfficientNet.
-      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
-      se_ratio: `float` squeeze and excitation ratio for inverted bottleneck
-        blocks.
-      stochastic_depth_drop_rate: `float` drop rate for drop connect layer.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+      model_id: A `str` of model ID of EfficientNet.
+      input_specs: A `tf.keras.layers.InputSpec` of the input tensor.
+      se_ratio: A `float` of squeeze and excitation ratio for inverted
+        bottleneck blocks.
+      stochastic_depth_drop_rate: A `float` of drop rate for drop connect layer.
+      kernel_initializer: A `str` for kernel initializer of convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
        Default to None.
-      activation: `str` name of the activation function.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      **kwargs: keyword arguments to be passed.
+      activation: A `str` of name of the activation function.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
    """
    self._model_id = model_id
    self._input_specs = input_specs
@@ -203,12 +209,12 @@ class EfficientNet(tf.keras.Model):
    """Creates one group of blocks for the EfficientNet model.

    Args:
-      inputs: `Tensor` of size `[batch, channels, height, width]`.
-      specs: specifications for one inverted bottleneck block group.
-      name: `str`name for the block.
+      inputs: A `tf.Tensor` of size `[batch, channels, height, width]`.
+      specs: The specifications for one inverted bottleneck block group.
+      name: A `str` name for the block.

    Returns:
-      The output `Tensor` of the block layer.
+      The output `tf.Tensor` of the block layer.
    """
    if specs.block_fn == 'mbconv':
      block_fn = nn_blocks.InvertedBottleneckBlock
@@ -282,7 +288,7 @@ def build_efficientnet(
    input_specs: tf.keras.layers.InputSpec,
    model_config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
-  """Builds ResNet 3d backbone from a config."""
+  """Builds EfficientNet backbone from a config."""
  backbone_type = model_config.backbone.type
  backbone_cfg = model_config.backbone.get()
  norm_activation_config = model_config.norm_activation

--- a/official/vision/beta/modeling/backbones/factory.py
+++ b/official/vision/beta/modeling/backbones/factory.py
@@ -70,10 +70,10 @@ def register_backbone_builder(key: str):
  ```

  Args:
-    key: the key to look up the builder.
+    key: A `str` of key to look up the builder.

  Returns:
-    A callable for use as class decorator that registers the decorated class
+    A callable for use as a class decorator that registers the decorated class
    for creation from an instance of task_config_cls.
  """
  return registry.register(_REGISTERED_BACKBONE_CLS, key)
@@ -85,12 +85,13 @@ def build_backbone(input_specs: tf.keras.layers.InputSpec,
  """Builds backbone from a config.

  Args:
-    input_specs: tf.keras.layers.InputSpec.
-    model_config: a OneOfConfig. Model config.
-    l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
+    input_specs: A `tf.keras.layers.InputSpec` of input.
+    model_config: A `OneOfConfig` of model config.
+    l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to
+      None.

  Returns:
-    tf.keras.Model instance of the backbone.
+    A `tf.keras.Model` instance of the backbone.
  """
  backbone_builder = registry.lookup(_REGISTERED_BACKBONE_CLS,
                                     model_config.backbone.type)

--- a/official/vision/beta/modeling/backbones/mobilenet.py
+++ b/official/vision/beta/modeling/backbones/mobilenet.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Contains definitions of Mobilenet Networks."""
+"""Contains definitions of MobileNet Networks."""

-from typing import Text, Optional, Dict, Any, Tuple
+from typing import Optional, Dict, Any, Tuple

 # Import libraries
 import dataclasses
@@ -41,8 +41,8 @@ class Conv2DBNBlock(tf.keras.layers.Layer):
      kernel_size: int = 3,
      strides: int = 1,
      use_bias: bool = False,
-      activation: Text = 'relu6',
-      kernel_initializer: Text = 'VarianceScaling',
+      activation: str = 'relu6',
+      kernel_initializer: str = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      use_normalization: bool = True,
@@ -53,25 +53,25 @@ class Conv2DBNBlock(tf.keras.layers.Layer):
    """A convolution block with batch normalization.

    Args:
-      filters: `int` number of filters for the first two convolutions. Note that
-        the third and final convolution will use 4 times as many filters.
-      kernel_size: `int` an integer specifying the height and width of the
-        2D convolution window.
-      strides: `int` block stride. If greater than 1, this block will ultimately
-        downsample the input.
-      use_bias: if True, use biase in the convolution layer.
-      activation: `str` name of the activation function.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+      filters: An `int` number of filters for the first two convolutions. Note
+        that the third and final convolution will use 4 times as many filters.
+      kernel_size: An `int` specifying the height and width of the 2D
+        convolution window.
+      strides: An `int` of block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      use_bias: If True, use bias in the convolution layer.
+      activation: A `str` name of the activation function.
+      kernel_initializer: A `str` for kernel initializer of convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
-                        Default to None.
-      use_normalization: if True, use batch normalization.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization momentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      **kwargs: keyword arguments to be passed.
+      use_normalization: If True, use batch normalization.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
    """
    super(Conv2DBNBlock, self).__init__(**kwargs)
    self._filters = filters
@@ -375,13 +375,13 @@ SUPPORTED_SPECS_MAP = {
 class BlockSpec(hyperparams.Config):
  """A container class that specifies the block configuration for MobileNet."""

-  block_fn: Text = 'convbn'
+  block_fn: str = 'convbn'
  kernel_size: int = 3
  strides: int = 1
  filters: int = 32
  use_bias: bool = False
  use_normalization: bool = True
-  activation: Text = 'relu6'
+  activation: str = 'relu6'
  # used for block type InvertedResConv
  expand_ratio: Optional[float] = 6.
  # used for block type InvertedResConv with SE
@@ -395,22 +395,22 @@ def block_spec_decoder(specs: Dict[Any, Any],
                       # set to 1 for mobilenetv1
                       divisible_by: int = 8,
                       finegrain_classification_mode: bool = True):
-  """Decode specs for a block.
+  """Decodes specs for a block.

  Args:
-    specs: `dict` specification of block specs of a mobilenet version.
-    filter_size_scale: `float` multiplier for the filter size
-      for all convolution ops. The value must be greater than zero. Typical
-      usage will be to set this value in (0, 1) to reduce the number of
-      parameters or computation cost of the model.
-    divisible_by: `int` ensures all inner dimensions are divisible by
+    specs: A `dict` specification of block specs of a mobilenet version.
+    filter_size_scale: A `float` multiplier for the filter size for all
+      convolution ops. The value must be greater than zero. Typical usage will
+      be to set this value in (0, 1) to reduce the number of parameters or
+      computation cost of the model.
+    divisible_by: An `int` that ensures all inner dimensions are divisible by
      this number.
-    finegrain_classification_mode: if True, the model
-      will keep the last layer large even for small multipliers. Following
-      https://arxiv.org/abs/1801.04381
+    finegrain_classification_mode: If True, the model will keep the last layer
+      large even for small multipliers, following
+      https://arxiv.org/abs/1801.04381.

  Returns:
-    List[BlockSpec]` defines structure of the base network.
+    A list of `BlockSpec` that defines structure of the base network.
  """

  spec_name = specs['spec_name']
@@ -449,17 +449,18 @@ def block_spec_decoder(specs: Dict[Any, Any],

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class MobileNet(tf.keras.Model):
-  """Class to build MobileNet family model."""
+  """Creates a MobileNet family model."""

-  def __init__(self,
-               model_id: Text = 'MobileNetV2',
+  def __init__(
+      self,
+      model_id: str = 'MobileNetV2',
      filter_size_scale: float = 1.0,
      input_specs: layers.InputSpec = layers.InputSpec(
          shape=[None, None, None, 3]),
      # The followings are for hyper-parameter tuning
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
-               kernel_initializer: Text = 'VarianceScaling',
+      kernel_initializer: str = 'VarianceScaling',
      kernel_regularizer: Optional[regularizers.Regularizer] = None,
      bias_regularizer: Optional[regularizers.Regularizer] = None,
      # The followings should be kept the same most of the times
@@ -473,42 +474,43 @@ class MobileNet(tf.keras.Model):
      # finegrain is not used in MobileNetV1
      finegrain_classification_mode: bool = True,
      **kwargs):
-    """MobileNet initializer.
+    """Initializes a MobileNet model.

    Args:
-      model_id: `str` version of MobileNet. The supported values are
-       'MobileNetV1', 'MobileNetV2', 'MobileNetV3Large', 'MobileNetV3Small',
-        and 'MobileNetV3EdgeTPU'.
-      filter_size_scale: `float` multiplier for the filters (number of channels)
-        for all convolution ops. The value must be greater than zero. Typical
-        usage will be to set this value in (0, 1) to reduce the number of
-        parameters or computation cost of the model.
-      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      kernel_initializer: `str` kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+      model_id: A `str` of MobileNet version. The supported values are
+        `MobileNetV1`, `MobileNetV2`, `MobileNetV3Large`, `MobileNetV3Small`,
+        and `MobileNetV3EdgeTPU`.
+      filter_size_scale: A `float` of multiplier for the filters (number of
+        channels) for all convolution ops. The value must be greater than zero.
+        Typical usage will be to set this value in (0, 1) to reduce the number
+        of parameters or computation cost of the model.
+      input_specs: A `tf.keras.layers.InputSpec` of specs of the input tensor.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_initializer: A `str` for kernel initializer of convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
        Default to None.
-      output_stride: `int` specifies the requested ratio of input to output
-        spatial resolution. If not None, then we invoke atrous convolution
-        if necessary to prevent the network from reducing the spatial resolution
-        of activation maps. Allowed values are 8 (accurate fully convolutional
-        mode), 16 (fast fully convolutional mode), 32 (classification mode).
-      min_depth: `int` minimum depth (number of channels) for all conv ops.
-        Enforced when filter_size_scale < 1, and not an active constraint when
-        filter_size_scale >= 1.
-      divisible_by: `int` ensures all inner dimensions are divisible by
+      output_stride: An `int` that specifies the requested ratio of input to
+        output spatial resolution. If not None, then we invoke atrous
+        convolution if necessary to prevent the network from reducing the
+        spatial resolution of activation maps. Allowed values are 8 (accurate
+        fully convolutional mode), 16 (fast fully convolutional mode), 32
+        (classification mode).
+      min_depth: An `int` of minimum depth (number of channels) for all
+        convolution ops. Enforced when filter_size_scale < 1, and not an active
+        constraint when filter_size_scale >= 1.
+      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
-      stochastic_depth_drop_rate: `float` drop rate for drop connect layer.
-      regularize_depthwise: if Ture, apply regularization on depthwise.
-      use_sync_bn: if True, use synchronized batch normalization.
-      finegrain_classification_mode: if True, the model
-        will keep the last layer large even for small multipliers. Following
-        https://arxiv.org/abs/1801.04381
-      **kwargs: keyword arguments to be passed.
+      stochastic_depth_drop_rate: A `float` of drop rate for drop connect layer.
+      regularize_depthwise: If True, apply regularization on depthwise.
+      use_sync_bn: If True, use synchronized batch normalization.
+      finegrain_classification_mode: If True, the model will keep the last layer
+        large even for small multipliers, following
+        https://arxiv.org/abs/1801.04381.
+      **kwargs: Additional keyword arguments to be passed.
    """
    if model_id not in SUPPORTED_SPECS_MAP:
      raise ValueError('The MobileNet version {} '
@@ -567,10 +569,10 @@ class MobileNet(tf.keras.Model):
  def _mobilenet_base(self,
                      inputs: tf.Tensor
                      ) -> Tuple[tf.Tensor, Dict[int, tf.Tensor]]:
-    """Build the base MobileNet architecture.
+    """Builds the base MobileNet architecture.

    Args:
-      inputs: Input tensor of shape [batch_size, height, width, channels].
+      inputs: A `tf.Tensor` of shape `[batch_size, height, width, channels]`.

    Returns:
      A tuple of output Tensor and dictionary that collects endpoints.
@@ -725,7 +727,7 @@ def build_mobilenet(
    input_specs: tf.keras.layers.InputSpec,
    model_config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
-  """Builds MobileNet 3d backbone from a config."""
+  """Builds MobileNet backbone from a config."""
  backbone_type = model_config.backbone.type
  backbone_cfg = model_config.backbone.get()
  norm_activation_config = model_config.norm_activation

--- a/official/vision/beta/modeling/backbones/resnet.py
+++ b/official/vision/beta/modeling/backbones/resnet.py
@@ -12,12 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Contains definitions of Residual Networks.
-
-Residual networks (ResNets) were proposed in:
-[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-    Deep Residual Learning for Image Recognition. arXiv:1512.03385
-"""
+"""Contains definitions of Residual Networks."""

 # Import libraries
 import tensorflow as tf
@@ -92,7 +87,13 @@ RESNET_SPECS = {

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class ResNet(tf.keras.Model):
-  """Class to build ResNet family model."""
+  """Creates a ResNet family model.
+
+  This implements the Deep Residual Network from:
+    Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
+    Deep Residual Learning for Image Recognition.
+    (https://arxiv.org/pdf/1512.03385)
+  """

  def __init__(self,
               model_id,
@@ -111,32 +112,31 @@ class ResNet(tf.keras.Model):
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
-    """ResNet initialization function.
+    """Initializes a ResNet model.

    Args:
-      model_id: `int` depth of ResNet backbone model.
-      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
-      depth_multiplier: `float` a depth multiplier to uniformaly scale up all
-        layers in channel size in ResNet.
-      stem_type: `str` stem type of ResNet. Default to `v0`. If set to `v1`,
-        use ResNet-D type stem (https://arxiv.org/abs/1812.01187).
-      resnetd_shortcut: `bool` whether to use ResNet-D shortcut in downsampling
-        blocks.
-      replace_stem_max_pool: `bool` if True, replace the max pool in stem with
-        a stride-2 conv,
-      se_ratio: `float` or None. Ratio of the Squeeze-and-Excitation layer.
-      init_stochastic_depth_rate: `float` initial stochastic depth rate.
-      activation: `str` name of the activation function.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-                          Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+      model_id: An `int` of the depth of ResNet backbone model.
+      input_specs: A `tf.keras.layers.InputSpec` of the input tensor.
+      depth_multiplier: A `float` of the depth multiplier to uniformly scale up
+        all layers in channel size in ResNet.
+      stem_type: A `str` of stem type of ResNet. Default to `v0`. If set to
+        `v1`, use ResNet-D type stem (https://arxiv.org/abs/1812.01187).
+      resnetd_shortcut: A `bool` of whether to use ResNet-D shortcut in
+        downsampling blocks.
+      replace_stem_max_pool: A `bool` of whether to replace the max pool in stem
+        with a stride-2 conv.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+      init_stochastic_depth_rate: A `float` of initial stochastic depth rate.
+      activation: A `str` name of the activation function.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A small `float` added to variance to avoid dividing by zero.
+      kernel_initializer: A str for kernel initializer of convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
        Default to None.
-      **kwargs: keyword arguments to be passed.
+      **kwargs: Additional keyword arguments to be passed.
    """
    self._model_id = model_id
    self._input_specs = input_specs
@@ -279,17 +279,20 @@ class ResNet(tf.keras.Model):
    """Creates one group of blocks for the ResNet model.

    Args:
-      inputs: `Tensor` of size `[batch, channels, height, width]`.
-      filters: `int` number of filters for the first convolution of the layer.
-      strides: `int` stride to use for the first convolution of the layer. If
-        greater than 1, this layer will downsample the input.
-      block_fn: Either `nn_blocks.ResidualBlock` or `nn_blocks.BottleneckBlock`.
-      block_repeats: `int` number of blocks contained in the layer.
-      stochastic_depth_drop_rate: `float` drop rate of the current block group.
-      name: `str`name for the block.
+      inputs: A `tf.Tensor` of size `[batch, channels, height, width]`.
+      filters: An `int` number of filters for the first convolution of the
+        layer.
+      strides: An `int` stride to use for the first convolution of the layer.
+        If greater than 1, this layer will downsample the input.
+      block_fn: The type of block group. Either `nn_blocks.ResidualBlock` or
+        `nn_blocks.BottleneckBlock`.
+      block_repeats: An `int` number of blocks contained in the layer.
+      stochastic_depth_drop_rate: A `float` of drop rate of the current block
+        group.
+      name: A `str` name for the block.

    Returns:
-      The output `Tensor` of the block layer.
+      The output `tf.Tensor` of the block layer.
    """
    x = block_fn(
        filters=filters,

--- a/official/vision/beta/modeling/backbones/resnet_3d.py
+++ b/official/vision/beta/modeling/backbones/resnet_3d.py
@@ -41,7 +41,7 @@ RESNET_SPECS = {

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class ResNet3D(tf.keras.Model):
-  """Class to build 3D ResNet family model."""
+  """Creates a 3D ResNet family model."""

  def __init__(self,
               model_id: int,
@@ -60,32 +60,33 @@ class ResNet3D(tf.keras.Model):
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
-    """ResNet3D initialization function.
+    """Initializes a 3D ResNet model.

    Args:
-      model_id: `int` depth of ResNet backbone model.
-      temporal_strides: a list of integers that specifies the temporal strides
+      model_id: An `int` of depth of ResNet backbone model.
+      temporal_strides: A list of integers that specifies the temporal strides
        for all 3d blocks.
-      temporal_kernel_sizes: a list of tuples that specifies the temporal kernel
+      temporal_kernel_sizes: A list of tuples that specifies the temporal kernel
        sizes for all 3d blocks in different block groups.
-      use_self_gating: a list of booleans to specify applying self-gating module
+      use_self_gating: A list of booleans to specify applying self-gating module
        or not in each block group. If None, self-gating is not applied.
-      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
-      stem_conv_temporal_kernel_size: `int` temporal kernel size for the first
-        conv layer.
-      stem_conv_temporal_stride: `int` temporal stride for the first conv layer.
-      stem_pool_temporal_stride: `int` temporal stride for the first pool layer.
-      activation: `str` name of the activation function.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+      input_specs: A `tf.keras.layers.InputSpec` of the input tensor.
+      stem_conv_temporal_kernel_size: An `int` of temporal kernel size for the
+        first conv layer.
+      stem_conv_temporal_stride: An `int` of temporal stride for the first conv
+        layer.
+      stem_pool_temporal_stride: An `int` of temporal stride for the first pool
+        layer.
+      activation: A `str` of name of the activation function.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_initializer: A str for kernel initializer of convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
-        Default to None.
-      **kwargs: keyword arguments to be passed.
+      **kwargs: Additional keyword arguments to be passed.
    """
    self._model_id = model_id
    self._temporal_strides = temporal_strides
@@ -181,21 +182,23 @@ class ResNet3D(tf.keras.Model):
    """Creates one group of blocks for the ResNet3D model.

    Args:
-      inputs: `Tensor` of size `[batch, channels, height, width]`.
-      filters: `int` number of filters for the first convolution of the layer.
-      temporal_kernel_sizes: a tuple that specifies the temporal kernel sizes
+      inputs: A `tf.Tensor` of size `[batch, channels, height, width]`.
+      filters: An `int` of number of filters for the first convolution of the
+        layer.
+      temporal_kernel_sizes: A tuple that specifies the temporal kernel sizes
        for each block in the current group.
-      temporal_strides: `int` temporal strides for the first convolution in this
-        group.
-      spatial_strides: `int` stride to use for the first convolution of the
+      temporal_strides: An `int` of temporal strides for the first convolution
+        in this group.
+      spatial_strides: An `int` stride to use for the first convolution of the
        layer. If greater than 1, this layer will downsample the input.
      block_fn: Either `nn_blocks.ResidualBlock` or `nn_blocks.BottleneckBlock`.
-      block_repeats: `int` number of blocks contained in the layer.
-      use_self_gating: `bool` apply self-gating module or not.
-      name: `str`name for the block.
+      block_repeats: An `int` of number of blocks contained in the layer.
+      use_self_gating: A `bool` that specifies whether to apply self-gating
+        module or not.
+      name: A `str` name for the block.

    Returns:
-      The output `Tensor` of the block layer.
+      The output `tf.Tensor` of the block layer.
    """
    if len(temporal_kernel_sizes) != block_repeats:
      raise ValueError(

--- a/official/vision/beta/modeling/backbones/resnet_deeplab.py
+++ b/official/vision/beta/modeling/backbones/resnet_deeplab.py
@@ -45,12 +45,12 @@ RESNET_SPECS = {

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class DilatedResNet(tf.keras.Model):
-  """Class to build ResNet model with Deeplabv3 modifications.
+  """Creates a ResNet model with Deeplabv3 modifications.

-  This backbone is suitable for semantic segmentation. It was proposed in:
-  [1] Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam
+  This backbone is suitable for semantic segmentation. This implements
+    Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam.
    Rethinking Atrous Convolution for Semantic Image Segmentation.
-    arXiv:1706.05587
+    (https://arxiv.org/pdf/1706.05587)
  """

  def __init__(self,
@@ -70,30 +70,31 @@ class DilatedResNet(tf.keras.Model):
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
-    """ResNet with DeepLab modification initialization function.
+    """Initializes a ResNet model with DeepLab modification.

    Args:
-      model_id: `int` depth of ResNet backbone model.
-      output_stride: `int` output stride, ratio of input to output resolution.
-      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
-      stem_type: `standard` or `deeplab`, deeplab replaces 7x7 conv by 3 3x3
-        convs.
-      se_ratio: `float` or None. Ratio of the Squeeze-and-Excitation layer.
-      init_stochastic_depth_rate: `float` initial stochastic depth rate.
-      multigrid: `Tuple` of the same length as the number of blocks in the last
+      model_id: An `int` that specifies depth of ResNet backbone model.
+      output_stride: An `int` of output stride, ratio of input to output
+        resolution.
+      input_specs: A `tf.keras.layers.InputSpec` of the input tensor.
+      stem_type: A `str` of stem type. Can be `standard` or `deeplab`. `deeplab`
+        replaces 7x7 conv by 3 3x3 convs.
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+      init_stochastic_depth_rate: A `float` of initial stochastic depth rate.
+      multigrid: A tuple of the same length as the number of blocks in the last
        resnet stage.
-      last_stage_repeats: `int`, how many times last stage is repeated.
-      activation: `str` name of the activation function.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+      last_stage_repeats: An `int` that specifies how many times last stage is
+        repeated.
+      activation: A `str` name of the activation function.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_initializer: A str for kernel initializer of convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
-                        Default to None.
-      **kwargs: keyword arguments to be passed.
+      **kwargs: Additional keyword arguments to be passed.
    """
    self._model_id = model_id
    self._output_stride = output_stride
@@ -247,20 +248,22 @@ class DilatedResNet(tf.keras.Model):
    Deeplab applies strides at the last block.

    Args:
-      inputs: `Tensor` of size `[batch, channels, height, width]`.
-      filters: `int` number of filters for the first convolution of the layer.
-      strides: `int` stride to use for the first convolution of the layer. If
-        greater than 1, this layer will downsample the input.
-      dilation_rate: `int`, diluted convolution rates.
+      inputs: A `tf.Tensor` of size `[batch, channels, height, width]`.
+      filters: An `int` of number of filters for the first convolution of the
+        layer.
+      strides: An `int` of stride to use for the first convolution of the layer.
+        If greater than 1, this layer will downsample the input.
+      dilation_rate: An `int` of dilated convolution rates.
      block_fn: Either `nn_blocks.ResidualBlock` or `nn_blocks.BottleneckBlock`.
-      block_repeats: `int` number of blocks contained in the layer.
-      stochastic_depth_drop_rate: `float` drop rate of the current block group.
-      multigrid: List of ints or None, if specified, dilation rates for each
+      block_repeats: An `int` of number of blocks contained in the layer.
+      stochastic_depth_drop_rate: A `float` of drop rate of the current block
+        group.
+      multigrid: A list of `int` or None. If specified, dilation rates for each
        block is scaled up by its corresponding factor in the multigrid.
-      name: `str`name for the block.
+      name: A `str` name for the block.

    Returns:
-      The output `Tensor` of the block layer.
+      The output `tf.Tensor` of the block layer.
    """
    if multigrid is not None and len(multigrid) != block_repeats:
      raise ValueError('multigrid has to match number of block_repeats')

--- a/official/vision/beta/modeling/backbones/revnet.py
+++ b/official/vision/beta/modeling/backbones/revnet.py
@@ -13,12 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================="""
-"""RevNet Implementation.
-
-[1] Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse
-    The Reversible Residual Network: Backpropagation Without Storing Activations
-    https://arxiv.org/pdf/1707.04585.pdf
-"""
+"""Contains definitions of RevNet."""

 from typing import Any, Callable, Dict, Optional
 # Import libraries
@@ -55,7 +50,14 @@ REVNET_SPECS = {

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class RevNet(tf.keras.Model):
-  """Reversible ResNet, RevNet implementation."""
+  """Creates a Reversible ResNet (RevNet) family model.
+
+  This implements:
+    Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+    The Reversible Residual Network: Backpropagation Without Storing
+    Activations.
+    (https://arxiv.org/pdf/1707.04585.pdf)
+  """

  def __init__(self,
               model_id: int,
@@ -68,19 +70,19 @@ class RevNet(tf.keras.Model):
               kernel_initializer: str = 'VarianceScaling',
               kernel_regularizer: tf.keras.regularizers.Regularizer = None,
               **kwargs):
-    """RevNet initialization function.
+    """Initializes a RevNet model.

    Args:
-      model_id: `int` depth/id of ResNet backbone model.
-      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
-      activation: `str` name of the activation function.
-      use_sync_bn: `bool` if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      kernel_initializer: `str` kernel_initializer for convolutional layers.
-      kernel_regularizer: `tf.keras.regularizers.Regularizer` for Conv2D.
-      **kwargs: additional keyword arguments to be passed.
+      model_id: An `int` of depth/id of ResNet backbone model.
+      input_specs: A `tf.keras.layers.InputSpec` of the input tensor.
+      activation: A `str` name of the activation function.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      kernel_initializer: A str for kernel initializer of convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      **kwargs: Additional keyword arguments to be passed.
    """
    self._model_id = model_id
    self._input_specs = input_specs
@@ -148,19 +150,21 @@ class RevNet(tf.keras.Model):
    """Creates one reversible block for RevNet model.

    Args:
-      inputs: `Tensor` of size `[batch, channels, height, width]`.
-      filters: `int` number of filters for the first convolution of the layer.
-      strides: `int` stride to use for the first convolution of the layer. If
+      inputs: A `tf.Tensor` of size `[batch, channels, height, width]`.
+      filters: An `int` number of filters for the first convolution of the
+        layer.
+      strides: An `int` stride to use for the first convolution of the layer. If
        greater than 1, this block group will downsample the input.
      inner_block_fn: Either `nn_blocks.ResidualInner` or
        `nn_blocks.BottleneckResidualInner`.
-      block_repeats: `int` number of blocks contained in this block group.
-      batch_norm_first: `bool` whether to apply BatchNormalization and
-        activation layer before feeding into convolution layers.
-      name: `str`name for the block.
+      block_repeats: An `int` number of blocks contained in this block group.
+      batch_norm_first: A `bool` that specifies whether to apply
+        BatchNormalization and activation layer before feeding into convolution
+        layers.
+      name: A `str` name for the block.

    Returns:
-      The output `Tensor` of the block layer.
+      The output `tf.Tensor` of the block layer.
    """
    x = inputs
    for i in range(block_repeats):
@@ -210,7 +214,7 @@ def build_revnet(
    input_specs: tf.keras.layers.InputSpec,
    model_config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
-  """Builds ResNet 3d backbone from a config."""
+  """Builds RevNet backbone from a config."""
  backbone_type = model_config.backbone.type
  backbone_cfg = model_config.backbone.get()
  norm_activation_config = model_config.norm_activation

--- a/official/vision/beta/modeling/backbones/spinenet.py
+++ b/official/vision/beta/modeling/backbones/spinenet.py
@@ -13,12 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of SpineNet model.
-
-X. Du, T-Y. Lin, P. Jin, G. Ghiasi, M. Tan, Y. Cui, Q. V. Le, X. Song
-SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization
-https://arxiv.org/abs/1912.05027
-"""
+"""Contains definitions of SpineNet Networks."""
 import math

 # Import libraries
@@ -117,7 +112,14 @@ def build_block_specs(block_specs=None):

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class SpineNet(tf.keras.Model):
-  """Class to build SpineNet models."""
+  """Creates a SpineNet family model.
+
+  This implements:
+    Xianzhi Du, Tsung-Yi Lin, Pengchong Jin, Golnaz Ghiasi, Mingxing Tan,
+    Yin Cui, Quoc V. Le, Xiaodan Song.
+    SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization.
+    (https://arxiv.org/abs/1912.05027)
+  """

  def __init__(self,
               input_specs=tf.keras.layers.InputSpec(shape=[None, 640, 640, 3]),
@@ -137,7 +139,34 @@ class SpineNet(tf.keras.Model):
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
-    """SpineNet model."""
+    """Initializes a SpineNet model.
+
+    Args:
+      input_specs: A `tf.keras.layers.InputSpec` of the input tensor.
+      min_level: An `int` of min level for output multiscale features.
+      max_level: An `int` of max level for output multiscale features.
+      block_specs: The block specifications for the SpineNet model discovered by
+        NAS.
+      endpoints_num_filters: An `int` of feature dimension for the output
+        endpoints.
+      resample_alpha: A `float` of resampling factor in cross-scale connections.
+      block_repeats: An `int` of number of blocks contained in the layer.
+      filter_size_scale: A `float` of multiplier for the filters (number of
+        channels) for all convolution ops. The value must be greater than zero.
+        Typical usage will be to set this value in (0, 1) to reduce the number
+        of parameters or computation cost of the model.
+      init_stochastic_depth_rate: A `float` of initial stochastic depth rate.
+      kernel_initializer: A str for kernel initializer of convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A small `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
+    """
    self._input_specs = input_specs
    self._min_level = min_level
    self._max_level = max_level
@@ -235,7 +264,7 @@ class SpineNet(tf.keras.Model):
    return tf.identity(x, name=name)

  def _build_stem(self, inputs):
-    """Build SpineNet stem."""
+    """Builds SpineNet stem."""
    x = layers.Conv2D(
        filters=64,
        kernel_size=7,
@@ -271,7 +300,7 @@ class SpineNet(tf.keras.Model):
                                    net,
                                    input_width,
                                    weighted_fusion=False):
-    """Build scale-permuted network."""
+    """Builds scale-permuted network."""
    net_sizes = [int(math.ceil(input_width / 2**2))] * len(net)
    net_block_fns = [self._init_block_fn] * len(net)
    num_outgoing_connections = [0] * len(net)
@@ -363,7 +392,7 @@ class SpineNet(tf.keras.Model):
    return endpoints

  def _build_endpoints(self, net):
-    """Match filter size for endpoints before sharing conv layers."""
+    """Matches filter size for endpoints before sharing conv layers."""
    endpoints = {}
    for level in range(self._min_level, self._max_level + 1):
      x = layers.Conv2D(
@@ -392,7 +421,7 @@ class SpineNet(tf.keras.Model):
                           target_num_filters,
                           target_block_fn,
                           alpha=0.5):
-    """Match resolution and feature dimension."""
+    """Matches resolution and feature dimension."""
    _, _, _, input_num_filters = inputs.get_shape().as_list()
    if input_block_fn == 'bottleneck':
      input_num_filters /= 4
@@ -493,7 +522,7 @@ def build_spinenet(
    input_specs: tf.keras.layers.InputSpec,
    model_config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
-  """Builds ResNet 3d backbone from a config."""
+  """Builds SpineNet backbone from a config."""
  backbone_type = model_config.backbone.type
  backbone_cfg = model_config.backbone.get()
  norm_activation_config = model_config.norm_activation

--- a/official/vision/beta/modeling/layers/box_sampler.py
+++ b/official/vision/beta/modeling/layers/box_sampler.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Box sampler."""
+"""Contains definitions of box sampler."""

 # Import libraries
 import tensorflow as tf
@@ -22,19 +22,19 @@ from official.vision.beta.ops import sampling_ops

 @tf.keras.utils.register_keras_serializable(package='Vision')
 class BoxSampler(tf.keras.layers.Layer):
-  """Sample positive and negative boxes."""
+  """Creates a BoxSampler to sample positive and negative boxes."""

  def __init__(self,
               num_samples=512,
               foreground_fraction=0.25,
               **kwargs):
-    """Initializes a ROI sampler.
+    """Initializes a box sampler.

    Args:
-      num_samples: int, the number of sampled boxes per image.
-      foreground_fraction: float in [0, 1], what percentage of boxes should be
-        sampled from the positive examples.
-      **kwargs: other key word arguments passed to Layer.
+      num_samples: An `int` of the number of sampled boxes per image.
+      foreground_fraction: A `float` in [0, 1], what percentage of boxes should
+        be sampled from the positive examples.
+      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'num_samples': num_samples,
@@ -43,22 +43,22 @@ class BoxSampler(tf.keras.layers.Layer):
    super(BoxSampler, self).__init__(**kwargs)

  def call(self, positive_matches, negative_matches, ignored_matches):
-    """Sample and select positive and negative instances.
+    """Samples and selects positive and negative instances.

    Args:
-      positive_matches: a `bool` tensor of shape of [batch, N] where N is the
+      positive_matches: A `bool` tensor of shape of [batch, N] where N is the
        number of instances. For each element, `True` means the instance
        corresponds to a positive example.
-      negative_matches: a `bool` tensor of shape of [batch, N] where N is the
+      negative_matches: A `bool` tensor of shape of [batch, N] where N is the
        number of instances. For each element, `True` means the instance
        corresponds to a negative example.
-      ignored_matches: a `bool` tensor of shape of [batch, N] where N is the
-        number of instances. For each element, `True` means the instance
-        should be ignored.
+      ignored_matches: A `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance should
+        be ignored.

    Returns:
-      selected_indices: a tensor of shape of [batch_size, K], storing the
-        indices of the sampled examples, where K is `num_samples`.
+      A `tf.Tensor` of shape of [batch_size, K], storing the indices of the
+        sampled examples, where K is `num_samples`.
    """
    sample_candidates = tf.logical_and(
        tf.logical_or(positive_matches, negative_matches),

--- a/official/vision/beta/modeling/layers/detection_generator.py
+++ b/official/vision/beta/modeling/layers/detection_generator.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Generators to generate the final detections."""
+"""Contains definitions of generators to generate the final detections."""

 # Import libraries

@@ -28,39 +28,41 @@ def _generate_detections_v1(boxes,
                            pre_nms_score_threshold=0.05,
                            nms_iou_threshold=0.5,
                            max_num_detections=100):
-  """Generate the final detections given the model outputs.
+  """Generates the final detections given the model outputs.

  The implementation unrolls the batch dimension and process images one by one.
  It required the batch dimension to be statically known and it is TPU
  compatible.

  Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or
-      [batch_size, N, 1, 4], which box predictions on all feature levels. The N
-      is the number of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which stacks box predictions on all feature
+      levels. The N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
      stacks class probability on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw score.
-    pre_nms_top_k: an int number of top candidate detections per class
-      before NMS.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
      when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A scalar representing maximum number of boxes retained
      over all classes.

  Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
-      representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
-      representing sorted confidence scores for detected boxes. The values are
-      between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
-      representing classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
-      `valid_detections` boxes are valid detections.
+    nms_boxes: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections, 4]` representing top detected boxes in
+      `[y1, x1, y2, x2]`.
+    nms_scores: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing sorted confidence scores
+      for detected boxes. The values are between `[0, 1]`.
+    nms_classes: An `int` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing classes for detected
+      boxes.
+    valid_detections: An `int` type `tf.Tensor` of shape `[batch_size]`. Only
+      the top `valid_detections` boxes are valid detections.
  """
  with tf.name_scope('generate_detections'):
    batch_size = scores.get_shape().as_list()[0]
@@ -94,34 +96,35 @@ def _generate_detections_per_image(boxes,
                                   pre_nms_score_threshold=0.05,
                                   nms_iou_threshold=0.5,
                                   max_num_detections=100):
-  """Generate the final detections per image given the model outputs.
+  """Generates the final detections per image given the model outputs.

  Args:
-    boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which box
-      predictions on all feature levels. The N is the number of total anchors on
-      all levels.
-    scores: a tensor with shape [N, num_classes], which stacks class probability
-      on all feature levels. The N is the number of total anchors on all levels.
-      The num_classes is the number of classes predicted by the model. Note that
-      the class_outputs here is the raw score.
-    pre_nms_top_k: an int number of top candidate detections per class
-      before NMS.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`, which
+      stacks box predictions on all feature levels. The N is the number of
+      total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class
+      probability on all feature levels. The N is the number of total anchors on
+      all levels. The num_classes is the number of classes predicted by the
+      model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
      when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A `scalar` representing maximum number of boxes retained
      over all classes.

  Returns:
-    nms_boxes: `float` Tensor of shape [max_num_detections, 4] representing top
-      detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [max_num_detections] representing sorted
-      confidence scores for detected boxes. The values are between [0, 1].
-    nms_classes: `int` Tensor of shape [max_num_detections] representing classes
-      for detected boxes.
-    valid_detections: `int` Tensor of shape [1] only the top `valid_detections`
-      boxes are valid detections.
+    nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]`
+      representing top detected boxes in `[y1, x1, y2, x2]`.
+    nms_scores: A `float` tf.Tensor of shape `[max_num_detections]` representing
+      sorted confidence scores for detected boxes. The values are between
+      `[0, 1]`.
+    nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` representing
+      classes for detected boxes.
+    valid_detections: An `int` tf.Tensor of shape [1] only the top
+      `valid_detections` boxes are valid detections.
  """
  nmsed_boxes = []
  nmsed_scores = []
@@ -171,18 +174,18 @@ def _generate_detections_per_image(boxes,


 def _select_top_k_scores(scores_in, pre_nms_num_detections):
-  """Select top_k scores and indices for each class.
+  """Selects top_k scores and indices for each class.

  Args:
-    scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
-      class logit outputs on all feature levels. The N is the number of total
-      anchors on all levels. The num_classes is the number of classes predicted
-      by the model.
+    scores_in: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class logit outputs on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model.
    pre_nms_num_detections: Number of candidates before NMS.

  Returns:
-    scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
-      num_classes].
+    scores and indices: Two `tf.Tensor`s, each with shape
+      `[batch_size, pre_nms_num_detections, num_classes]`.
  """
  batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
  scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
@@ -206,7 +209,7 @@ def _generate_detections_v2(boxes,
                            pre_nms_score_threshold=0.05,
                            nms_iou_threshold=0.5,
                            max_num_detections=100):
-  """Generate the final detections given the model outputs.
+  """Generates the final detections given the model outputs.

  This implementation unrolls classes dimension while using the tf.while_loop
  to implement the batched NMS, so that it can be parallelized at the batch
@@ -214,31 +217,31 @@ def _generate_detections_v2(boxes,
  It is TPU compatible.

  Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
-      N, 1, 4], which box predictions on all feature levels. The N is the number
-      of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which stacks class
-      probability on all feature levels. The N is the number of total anchors on
-      all levels. The num_classes is the number of classes predicted by the
-      model. Note that the class_outputs here is the raw score.
-    pre_nms_top_k: an int number of top candidate detections per class
-      before NMS.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which stacks box predictions on all feature
+      levels. The N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
      when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A `scalar` representing maximum number of boxes retained
      over all classes.

  Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
      representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
      representing sorted confidence scores for detected boxes. The values are
      between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
      representing classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
      `valid_detections` boxes are valid detections.
  """
  with tf.name_scope('generate_detections'):
@@ -294,29 +297,29 @@ def _generate_detections_batched(boxes,
  supported on TPU currently.

  Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or
-      [batch_size, N, 1, 4], which box predictions on all feature levels. The N
-      is the number of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which stacks box predictions on all feature
+      levels. The N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
      stacks class probability on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw score.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
      when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A `scalar` representing maximum number of boxes retained
      over all classes.

  Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
      representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
      representing sorted confidence scores for detected boxes. The values are
      between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
      representing classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
      `valid_detections` boxes are valid detections.
  """
  with tf.name_scope('generate_detections'):
@@ -348,18 +351,19 @@ class DetectionGenerator(tf.keras.layers.Layer):
    """Initializes a detection generator.

    Args:
-      apply_nms: bool, whether or not apply non maximum suppression. If False,
-        the decoded boxes and their scores are returned.
-      pre_nms_top_k: int, the number of top scores proposals to be kept before
-        applying NMS.
-      pre_nms_score_threshold: float, the score threshold to apply before
+      apply_nms: A `bool` of whether or not apply non maximum suppression.
+        If False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
        applying  NMS. Proposals whose scores are below this threshold are
        thrown away.
-      nms_iou_threshold: float in [0, 1], the NMS IoU threshold.
-      max_num_detections: int, the final number of total detections to generate.
-      use_batched_nms: bool, whether or not use
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      use_batched_nms: A `bool` of whether or not use
        `tf.image.combined_non_max_suppression`.
-      **kwargs: other key word arguments passed to Layer.
+      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'apply_nms': apply_nms,
@@ -376,35 +380,36 @@ class DetectionGenerator(tf.keras.layers.Layer):
               raw_scores,
               anchor_boxes,
               image_shape):
-    """Generate final detections.
+    """Generates final detections.

    Args:
-      raw_boxes: a tensor of shape of [batch_size, K, num_classes * 4]
+      raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
        representing the class-specific box coordinates relative to anchors.
-      raw_scores: a tensor of shape of [batch_size, K, num_classes]
+      raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
        representing the class logits before applying score activiation.
-      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
-        corresponding anchor boxes w.r.t `box_outputs`.
-      image_shape: a tensor of shape of [batch_size, 2] storing the image height
-        and width w.r.t. the scaled image, i.e. the same image space as
+      anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
-        `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
-          representing top detected boxes in [y1, x1, y2, x2].
-        `detection_scores`: float Tensor of shape [batch, max_num_detections]
-          representing sorted confidence scores for detected boxes. The values
-          are between [0, 1].
-        `detection_classes`: int Tensor of shape [batch, max_num_detections]
-          representing classes for detected boxes.
-        `num_detections`: int Tensor of shape [batch] only the first
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` `tf.Tensor` of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
          `num_detections` boxes are valid detections
      If `apply_nms` = False, the return is a dictionary with keys:
-        `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
+        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
-        `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
-          representing socres of all the decoded boxes.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, num_raw_boxes] representing scores of all the decoded boxes.
    """
    box_scores = tf.nn.softmax(raw_scores, axis=-1)

@@ -496,21 +501,22 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
               max_num_detections=100,
               use_batched_nms=False,
               **kwargs):
-    """Initializes a detection generator.
+    """Initializes a multi-level detection generator.

    Args:
-      apply_nms: bool, whether or not apply non maximum suppression. If False,
-        the decoded boxes and their scores are returned.
-      pre_nms_top_k: int, the number of top scores proposals to be kept before
-        applying NMS.
-      pre_nms_score_threshold: float, the score threshold to apply before
-        applying  NMS. Proposals whose scores are below this threshold are
-        thrown away.
-      nms_iou_threshold: float in [0, 1], the NMS IoU threshold.
-      max_num_detections: int, the final number of total detections to generate.
-      use_batched_nms: bool, whether or not use
+      apply_nms: A `bool` of whether or not apply non maximum suppression. If
+        False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
+        applying NMS. Proposals whose scores are below this threshold are thrown
+        away.
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      use_batched_nms: A `bool` of whether or not use
        `tf.image.combined_non_max_suppression`.
-      **kwargs: other key word arguments passed to Layer.
+      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'apply_nms': apply_nms,
@@ -527,37 +533,38 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
               raw_scores,
               anchor_boxes,
               image_shape):
-    """Generate final detections.
+    """Generates final detections.

    Args:
-      raw_boxes: a dict with keys representing FPN levels and values
-        representing box tenors of shape
-        [batch, feature_h, feature_w, num_anchors * 4].
-      raw_scores: a dict with keys representing FPN levels and values
-        representing logit tensors of shape
-        [batch, feature_h, feature_w, num_anchors].
-      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
-        corresponding anchor boxes w.r.t `box_outputs`.
-      image_shape: a tensor of shape of [batch_size, 2] storing the image height
-        and width w.r.t. the scaled image, i.e. the same image space as
+      raw_boxes: A `dict` with keys representing FPN levels and values
+        representing box tensors of shape `[batch, feature_h, feature_w,
+        num_anchors * 4]`.
+      raw_scores: A `dict` with keys representing FPN levels and values
+        representing logit tensors of shape `[batch, feature_h, feature_w,
+        num_anchors]`.
+      anchor_boxes: A `tf.Tensor` of shape of [batch_size, K, 4] representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
-        `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
-          representing top detected boxes in [y1, x1, y2, x2].
-        `detection_scores`: float Tensor of shape [batch, max_num_detections]
-          representing sorted confidence scores for detected boxes. The values
-          are between [0, 1].
-        `detection_classes`: int Tensor of shape [batch, max_num_detections]
-          representing classes for detected boxes.
-        `num_detections`: int Tensor of shape [batch] only the first
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` tf.Tensor of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
          `num_detections` boxes are valid detections
      If `apply_nms` = False, the return is a dictionary with keys:
-        `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
+        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
-        `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
-          representing socres of all the decoded boxes.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, num_raw_boxes] representing scores of all the decoded boxes.
    """
    # Collects outputs from all levels into a list.
    boxes = []