ModelZoo / ResNet50_tensorflow

Commit 5b952c08, authored Mar 10, 2021 by Fan Yang, committed by A. Unique TensorFlower on Mar 10, 2021

Internal change to docstring.

PiperOrigin-RevId: 362111110
parent 5df0cd30

Showing 9 changed files with 591 additions and 549 deletions (+591 / -549)
official/vision/beta/modeling/layers/box_sampler.py           +15   -15
official/vision/beta/modeling/layers/detection_generator.py   +144  -137
official/vision/beta/modeling/layers/mask_sampler.py          +40   -38
official/vision/beta/modeling/layers/nn_blocks.py             +156  -149
official/vision/beta/modeling/layers/nn_blocks_3d.py          +31   -23
official/vision/beta/modeling/layers/nn_layers.py             +114  -101
official/vision/beta/modeling/layers/roi_aligner.py           +7    -7
official/vision/beta/modeling/layers/roi_generator.py         +58   -54
official/vision/beta/modeling/layers/roi_sampler.py           +26   -25
official/vision/beta/modeling/layers/box_sampler.py (view file @ 5b952c08)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Box sampler."""
+"""Contains definitions of box sampler."""

 # Import libraries
 import tensorflow as tf

@@ -22,19 +22,19 @@ from official.vision.beta.ops import sampling_ops
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class BoxSampler(tf.keras.layers.Layer):
-  """Sample positive and negative boxes."""
+  """Creates a BoxSampler to sample positive and negative boxes."""

   def __init__(self, num_samples=512, foreground_fraction=0.25, **kwargs):
-    """Initializes a ROI sampler.
+    """Initializes a box sampler.

     Args:
-      num_samples: int, the number of sampled boxes per image.
-      foreground_fraction: float in [0, 1], what percentage of boxes should be
-        sampled from the positive examples.
-      **kwargs: other key word arguments passed to Layer.
+      num_samples: An `int` of the number of sampled boxes per image.
+      foreground_fraction: A `float` in [0, 1], what percentage of boxes should
+        be sampled from the positive examples.
+      **kwargs: Additional keyword arguments passed to Layer.
     """
     self._config_dict = {
         'num_samples': num_samples,

@@ -43,22 +43,22 @@ class BoxSampler(tf.keras.layers.Layer):
     super(BoxSampler, self).__init__(**kwargs)

   def call(self, positive_matches, negative_matches, ignored_matches):
-    """Sample and select positive and negative instances.
+    """Samples and selects positive and negative instances.

     Args:
-      positive_matches: a `bool` tensor of shape of [batch, N] where N is the
+      positive_matches: A `bool` tensor of shape of [batch, N] where N is the
         number of instances. For each element, `True` means the instance
         corresponds to a positive example.
-      negative_matches: a `bool` tensor of shape of [batch, N] where N is the
+      negative_matches: A `bool` tensor of shape of [batch, N] where N is the
         number of instances. For each element, `True` means the instance
         corresponds to a negative example.
-      ignored_matches: a `bool` tensor of shape of [batch, N] where N is the
+      ignored_matches: A `bool` tensor of shape of [batch, N] where N is the
         number of instances. For each element, `True` means the instance
         should be ignored.

     Returns:
-      selected_indices: a tensor of shape of [batch_size, K], storing the
-        indices of the sampled examples, where K is `num_samples`.
+      A `tf.tensor` of shape of [batch_size, K], storing the indices of the
+        sampled examples, where K is `num_samples`.
     """
     sample_candidates = tf.logical_and(
         tf.logical_or(positive_matches, negative_matches),
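The updated docstrings above spell out the `BoxSampler` call contract. As a rough usage sketch (shapes and values are illustrative assumptions and presume the TF Model Garden `official.vision.beta` package is importable; nothing below is part of this commit):

import tensorflow as tf
from official.vision.beta.modeling.layers import box_sampler

# 512 samples per image with a 25% foreground fraction, matching the
# constructor defaults documented above.
sampler = box_sampler.BoxSampler(num_samples=512, foreground_fraction=0.25)

batch_size, num_instances = 2, 2000
# Boolean match tensors of shape [batch, N], as described in the Args section.
positive_matches = tf.random.uniform([batch_size, num_instances]) > 0.95
negative_matches = tf.logical_and(
    tf.logical_not(positive_matches),
    tf.random.uniform([batch_size, num_instances]) > 0.3)
ignored_matches = tf.logical_not(
    tf.logical_or(positive_matches, negative_matches))

selected_indices = sampler(positive_matches, negative_matches, ignored_matches)
print(selected_indices.shape)  # Expected [batch_size, num_samples], i.e. (2, 512).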
official/vision/beta/modeling/layers/detection_generator.py (view file @ 5b952c08)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Generators to generate the final detections."""
+"""Contains definitions of generators to generate the final detections."""

 # Import libraries

@@ -28,39 +28,41 @@ def _generate_detections_v1(boxes,
                             pre_nms_score_threshold=0.05,
                             nms_iou_threshold=0.5,
                             max_num_detections=100):
-  """Generate the final detections given the model outputs.
+  """Generates the final detections given the model outputs.

   The implementation unrolls the batch dimension and process images one by one.
   It required the batch dimension to be statically known and it is TPU
   compatible.

   Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or
-      [batch_size, N, 1, 4], which box predictions on all feature levels. The N
-      is the number of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which box predictions on all feature levels. The
+      N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
       stacks class probability on all feature levels. The N is the number of
       total anchors on all levels. The num_classes is the number of classes
       predicted by the model. Note that the class_outputs here is the raw score.
-    pre_nms_top_k: an int number of top candidate detections per class before
-      NMS.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
       when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
       boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A scalar representing maximum number of boxes retained
      over all classes.

   Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
-      representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
-      representing sorted confidence scores for detected boxes. The values are
-      between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
-      representing classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
-      `valid_detections` boxes are valid detections.
+    nms_boxes: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections, 4]` representing top detected boxes in
+      `[y1, x1, y2, x2]`.
+    nms_scores: A `float` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing sorted confidence scores
+      for detected boxes. The values are between `[0, 1]`.
+    nms_classes: An `int` type `tf.Tensor` of shape
+      `[batch_size, max_num_detections]` representing classes for detected
+      boxes.
+    valid_detections: An `int` type `tf.Tensor` of shape `[batch_size]` only the
+      top `valid_detections` boxes are valid detections.
   """
   with tf.name_scope('generate_detections'):
     batch_size = scores.get_shape().as_list()[0]

@@ -94,34 +96,35 @@ def _generate_detections_per_image(boxes,
                                    pre_nms_score_threshold=0.05,
                                    nms_iou_threshold=0.5,
                                    max_num_detections=100):
-  """Generate the final detections per image given the model outputs.
+  """Generates the final detections per image given the model outputs.

   Args:
-    boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which box
-      predictions on all feature levels. The N is the number of total anchors on
-      all levels.
-    scores: a tensor with shape [N, num_classes], which stacks class probability
-      on all feature levels. The N is the number of total anchors on all levels.
-      The num_classes is the number of classes predicted by the model. Note that
-      the class_outputs here is the raw score.
-    pre_nms_top_k: an int number of top candidate detections per class before
-      NMS.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`, which
+      box predictions on all feature levels. The N is the number of total
+      anchors on all levels.
+    scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class
+      probability on all feature levels. The N is the number of total anchors on
+      all levels. The num_classes is the number of classes predicted by the
+      model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
       when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
       boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A `scalar` representing maximum number of boxes retained
       over all classes.

   Returns:
-    nms_boxes: `float` Tensor of shape [max_num_detections, 4] representing top
-      detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [max_num_detections] representing sorted
-      confidence scores for detected boxes. The values are between [0, 1].
-    nms_classes: `int` Tensor of shape [max_num_detections] representing classes
-      for detected boxes.
-    valid_detections: `int` Tensor of shape [1] only the top `valid_detections`
-      boxes are valid detections.
+    nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]`
+      representing top detected boxes in `[y1, x1, y2, x2]`.
+    nms_scores: A `float` tf.Tensor of shape `[max_num_detections]` representing
+      sorted confidence scores for detected boxes. The values are between [0,
+      1].
+    nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` representing
+      classes for detected boxes.
+    valid_detections: An `int` tf.Tensor of shape [1] only the top
+      `valid_detections` boxes are valid detections.
   """
   nmsed_boxes = []
   nmsed_scores = []

@@ -171,18 +174,18 @@ def _generate_detections_per_image(boxes,
 def _select_top_k_scores(scores_in, pre_nms_num_detections):
-  """Select top_k scores and indices for each class.
+  """Selects top_k scores and indices for each class.

   Args:
-    scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
-      class logit outputs on all feature levels. The N is the number of total
-      anchors on all levels. The num_classes is the number of classes predicted
-      by the model.
+    scores_in: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class logit outputs on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model.
     pre_nms_num_detections: Number of candidates before NMS.

   Returns:
-    scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
-      num_classes].
+    scores and indices: A `tf.Tensor` with shape
+      `[batch_size, pre_nms_num_detections, num_classes]`.
   """
   batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
   scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])

@@ -206,7 +209,7 @@ def _generate_detections_v2(boxes,
                             pre_nms_score_threshold=0.05,
                             nms_iou_threshold=0.5,
                             max_num_detections=100):
-  """Generate the final detections given the model outputs.
+  """Generates the final detections given the model outputs.

   This implementation unrolls classes dimension while using the tf.while_loop
   to implement the batched NMS, so that it can be parallelized at the batch

@@ -214,31 +217,31 @@ def _generate_detections_v2(boxes,
   It is TPU compatible.

   Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
-      N, 1, 4], which box predictions on all feature levels. The N is the number
-      of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which stacks class
-      probability on all feature levels. The N is the number of total anchors on
-      all levels. The num_classes is the number of classes predicted by the
-      model. Note that the class_outputs here is the raw score.
-    pre_nms_top_k: an int number of top candidate detections per class before
-      NMS.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which box predictions on all feature levels. The
+      N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: An `int` number of top candidate detections per class before
+      NMS.
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
       when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
       boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A `scalar` representing maximum number of boxes retained
       over all classes.

   Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
       representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
       representing sorted confidence scores for detected boxes. The values are
       between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
       representing classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
       `valid_detections` boxes are valid detections.
   """
   with tf.name_scope('generate_detections'):

@@ -294,29 +297,29 @@ def _generate_detections_batched(boxes,
   supported on TPU currently.

   Args:
-    boxes: a tensor with shape [batch_size, N, num_classes, 4] or
-      [batch_size, N, 1, 4], which box predictions on all feature levels. The N
-      is the number of total anchors on all levels.
-    scores: a tensor with shape [batch_size, N, num_classes], which
+    boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
+      `[batch_size, N, 1, 4]`, which box predictions on all feature levels. The
+      N is the number of total anchors on all levels.
+    scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
       stacks class probability on all feature levels. The N is the number of
       total anchors on all levels. The num_classes is the number of classes
       predicted by the model. Note that the class_outputs here is the raw score.
-    pre_nms_score_threshold: a float representing the threshold for deciding
+    pre_nms_score_threshold: A `float` representing the threshold for deciding
       when to remove boxes based on score.
-    nms_iou_threshold: a float representing the threshold for deciding whether
+    nms_iou_threshold: A `float` representing the threshold for deciding whether
       boxes overlap too much with respect to IOU.
-    max_num_detections: a scalar representing maximum number of boxes retained
+    max_num_detections: A `scalar` representing maximum number of boxes retained
       over all classes.

   Returns:
-    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
+    nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
       representing top detected boxes in [y1, x1, y2, x2].
-    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
+    nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
       representing sorted confidence scores for detected boxes. The values are
       between [0, 1].
-    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
+    nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
       representing classes for detected boxes.
-    valid_detections: `int` Tensor of shape [batch_size] only the top
+    valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
       `valid_detections` boxes are valid detections.
   """
   with tf.name_scope('generate_detections'):

@@ -348,18 +351,19 @@ class DetectionGenerator(tf.keras.layers.Layer):
     """Initializes a detection generator.

     Args:
-      apply_nms: bool, whether or not apply non maximum suppression. If False,
-        the decoded boxes and their scores are returned.
-      pre_nms_top_k: int, the number of top scores proposals to be kept before
-        applying NMS.
-      pre_nms_score_threshold: float, the score threshold to apply before
+      apply_nms: A `bool` of whether or not apply non maximum suppression. If
+        False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
         applying NMS. Proposals whose scores are below this threshold are
         thrown away.
-      nms_iou_threshold: float in [0, 1], the NMS IoU threshold.
-      max_num_detections: int, the final number of total detections to generate.
-      use_batched_nms: bool, whether or not use
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      use_batched_nms: A `bool` of whether or not use
         `tf.image.combined_non_max_suppression`.
-      **kwargs: other key word arguments passed to Layer.
+      **kwargs: Additional keyword arguments passed to Layer.
     """
     self._config_dict = {
         'apply_nms': apply_nms,

@@ -376,35 +380,36 @@ class DetectionGenerator(tf.keras.layers.Layer):
                raw_scores,
                anchor_boxes,
                image_shape):
-    """Generate final detections.
+    """Generates final detections.

     Args:
-      raw_boxes: a tensor of shape of [batch_size, K, num_classes * 4]
+      raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
         representing the class-specific box coordinates relative to anchors.
-      raw_scores: a tensor of shape of [batch_size, K, num_classes]
+      raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
         representing the class logits before applying score activiation.
-      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
-        corresponding anchor boxes w.r.t `box_outputs`.
-      image_shape: a tensor of shape of [batch_size, 2] storing the image height
-        and width w.r.t. the scaled image, i.e. the same image space as
+      anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
         `box_outputs` and `anchor_boxes`.

     Returns:
       If `apply_nms` = True, the return is a dictionary with keys:
-        `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
-          representing top detected boxes in [y1, x1, y2, x2].
-        `detection_scores`: float Tensor of shape [batch, max_num_detections]
-          representing sorted confidence scores for detected boxes. The values
-          are between [0, 1].
-        `detection_classes`: int Tensor of shape [batch, max_num_detections]
-          representing classes for detected boxes.
-        `num_detections`: int Tensor of shape [batch] only the first
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` `tf.Tensor` of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
          `num_detections` boxes are valid detections
       If `apply_nms` = False, the return is a dictionary with keys:
-        `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
+        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
           representing all the decoded boxes.
-        `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
-          representing socres of all the decoded boxes.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, num_raw_boxes] representing socres of all the decoded boxes.
     """
     box_scores = tf.nn.softmax(raw_scores, axis=-1)

@@ -496,21 +501,22 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
                max_num_detections=100,
                use_batched_nms=False,
                **kwargs):
-    """Initializes a detection generator.
+    """Initializes a multi-level detection generator.

     Args:
-      apply_nms: bool, whether or not apply non maximum suppression. If False,
-        the decoded boxes and their scores are returned.
-      pre_nms_top_k: int, the number of top scores proposals to be kept before
-        applying NMS.
-      pre_nms_score_threshold: float, the score threshold to apply before
-        applying NMS. Proposals whose scores are below this threshold are
-        thrown away.
-      nms_iou_threshold: float in [0, 1], the NMS IoU threshold.
-      max_num_detections: int, the final number of total detections to generate.
-      use_batched_nms: bool, whether or not use
+      apply_nms: A `bool` of whether or not apply non maximum suppression. If
+        False, the decoded boxes and their scores are returned.
+      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
+        before applying NMS.
+      pre_nms_score_threshold: A `float` of the score threshold to apply before
+        applying NMS. Proposals whose scores are below this threshold are thrown
+        away.
+      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
+      max_num_detections: An `int` of the final number of total detections to
+        generate.
+      use_batched_nms: A `bool` of whether or not use
         `tf.image.combined_non_max_suppression`.
-      **kwargs: other key word arguments passed to Layer.
+      **kwargs: Additional keyword arguments passed to Layer.
     """
     self._config_dict = {
         'apply_nms': apply_nms,

@@ -527,37 +533,38 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
                raw_scores,
                anchor_boxes,
                image_shape):
-    """Generate final detections.
+    """Generates final detections.

     Args:
-      raw_boxes: a dict with keys representing FPN levels and values
-        representing box tenors of shape [batch, feature_h, feature_w,
-        num_anchors * 4].
-      raw_scores: a dict with keys representing FPN levels and values
-        representing logit tensors of shape [batch, feature_h, feature_w,
-        num_anchors].
-      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
-        corresponding anchor boxes w.r.t `box_outputs`.
-      image_shape: a tensor of shape of [batch_size, 2] storing the image height
-        and width w.r.t. the scaled image, i.e. the same image space as
+      raw_boxes: A `dict` with keys representing FPN levels and values
+        representing box tenors of shape `[batch, feature_h, feature_w,
+        num_anchors * 4]`.
+      raw_scores: A `dict` with keys representing FPN levels and values
+        representing logit tensors of shape `[batch, feature_h, feature_w,
+        num_anchors]`.
+      anchor_boxes: A `tf.Tensor` of shape of [batch_size, K, 4] representing
+        the corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
+        height and width w.r.t. the scaled image, i.e. the same image space as
         `box_outputs` and `anchor_boxes`.

     Returns:
       If `apply_nms` = True, the return is a dictionary with keys:
-        `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
-          representing top detected boxes in [y1, x1, y2, x2].
-        `detection_scores`: float Tensor of shape [batch, max_num_detections]
-          representing sorted confidence scores for detected boxes. The values
-          are between [0, 1].
-        `detection_classes`: int Tensor of shape [batch, max_num_detections]
-          representing classes for detected boxes.
-        `num_detections`: int Tensor of shape [batch] only the first
+        `detection_boxes`: A `float` tf.Tensor of shape
+          [batch, max_num_detections, 4] representing top detected boxes in
+          [y1, x1, y2, x2].
+        `detection_scores`: A `float` tf.Tensor of shape
+          [batch, max_num_detections] representing sorted confidence scores for
+          detected boxes. The values are between [0, 1].
+        `detection_classes`: An `int` tf.Tensor of shape
+          [batch, max_num_detections] representing classes for detected boxes.
+        `num_detections`: An `int` tf.Tensor of shape [batch] only the first
          `num_detections` boxes are valid detections
       If `apply_nms` = False, the return is a dictionary with keys:
-        `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
+        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
           representing all the decoded boxes.
-        `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
-          representing socres of all the decoded boxes.
+        `decoded_box_scores`: A `float` tf.Tensor of shape
+          [batch, num_raw_boxes] representing socres of all the decoded boxes.
     """
     # Collects outputs from all levels into a list.
     boxes = []
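For orientation, a minimal sketch of driving the `DetectionGenerator` documented above (argument values, shapes and the class count are illustrative assumptions, not taken from this commit):

import tensorflow as tf
from official.vision.beta.modeling.layers import detection_generator

generator = detection_generator.DetectionGenerator(
    apply_nms=True,
    pre_nms_top_k=500,
    pre_nms_score_threshold=0.05,
    nms_iou_threshold=0.5,
    max_num_detections=100)

batch_size, num_rois, num_classes = 2, 1000, 91
# Class-specific box regressions, class logits, anchors and the scaled image
# size, matching the shapes listed in the __call__ docstring.
raw_boxes = tf.random.normal([batch_size, num_rois, num_classes * 4])
raw_scores = tf.random.normal([batch_size, num_rois, num_classes])
anchor_boxes = tf.random.uniform([batch_size, num_rois, 4], maxval=640.0)
image_shape = tf.constant([[640.0, 640.0]] * batch_size)

detections = generator(raw_boxes, raw_scores, anchor_boxes, image_shape)
print(detections['detection_boxes'].shape)  # Expected (2, 100, 4).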
official/vision/beta/modeling/layers/mask_sampler.py (view file @ 5b952c08)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Mask sampler."""
+"""Contains definitions of mask sampler."""

 # Import libraries
 import tensorflow as tf

@@ -30,34 +30,34 @@ def _sample_and_crop_foreground_masks(candidate_rois,
   """Samples and creates cropped foreground masks for training.

   Args:
-    candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
+    candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is the
       number of candidate RoIs to be considered for mask sampling. It includes
       both positive and negative RoIs. The `num_mask_samples_per_image` positive
       RoIs will be sampled to create mask training targets.
-    candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
-      corresponding groundtruth boxes to the `candidate_rois`.
-    candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
+    candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
+      the corresponding groundtruth boxes to the `candidate_rois`.
+    candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing the
      corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
      corresponds to the background class, i.e. negative RoIs.
-    candidate_gt_indices: a tensor of shape [batch_size, N], storing the
+    candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
      corresponding groundtruth instance indices to the `candidate_gt_boxes`,
      i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i] and
-      gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is the
-      superset of candidate_gt_boxes.
-    gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
-      containing all the groundtruth masks which sample masks are drawn from.
-    num_sampled_masks: an integer which specifies the number of masks to sample.
-    mask_target_size: an integer which specifies the final cropped mask size
-      after sampling. The output masks are resized w.r.t the sampled RoIs.
+      gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is the
+      superset of candidate_gt_boxes.
+    gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
+      mask_width] containing all the groundtruth masks which sample masks are
+      drawn from.
+    num_sampled_masks: An `int` that specifies the number of masks to sample.
+    mask_target_size: An `int` that specifies the final cropped mask size after
+      sampling. The output masks are resized w.r.t the sampled RoIs.

   Returns:
-    foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
-      that corresponds to the sampled foreground masks, where
+    foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
+      RoI that corresponds to the sampled foreground masks, where
       K = num_mask_samples_per_image.
-    foreground_classes: a tensor of shape of [batch_size, K] storing the classes
-      corresponding to the sampled foreground masks.
-    cropoped_foreground_masks: a tensor of shape of
+    foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
+      classes corresponding to the sampled foreground masks.
+    cropoped_foreground_masks: A `tf.Tensor` of shape of
       [batch_size, K, mask_target_size, mask_target_size] storing the cropped
       foreground masks used for training.
   """

@@ -120,34 +120,36 @@ class MaskSampler(tf.keras.layers.Layer):
            candidate_gt_classes,
            candidate_gt_indices,
            gt_masks):
-    """Sample and create mask targets for training.
+    """Samples and creates mask targets for training.

     Args:
-      candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
-        number of candidate RoIs to be considered for mask sampling. It includes
-        both positive and negative RoIs. The `num_mask_samples_per_image`
-        positive RoIs will be sampled to create mask training targets.
-      candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
-        corresponding groundtruth boxes to the `candidate_rois`.
-      candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
-        corresponding groundtruth classes to the `candidate_rois`. 0 in the
+      candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is
+        the number of candidate RoIs to be considered for mask sampling. It
+        includes both positive and negative RoIs. The
+        `num_mask_samples_per_image` positive RoIs will be sampled to create
+        mask training targets.
+      candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
+        the corresponding groundtruth boxes to the `candidate_rois`.
+      candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing
+        the corresponding groundtruth classes to the `candidate_rois`. 0 in the
         tensor corresponds to the background class, i.e. negative RoIs.
-      candidate_gt_indices: a tensor of shape [batch_size, N], storing the
+      candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
         corresponding groundtruth instance indices to the `candidate_gt_boxes`,
         i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
-        where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N,
-        is the superset of candidate_gt_boxes.
-      gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
-        containing all the groundtruth masks which sample masks are drawn from.
-        after sampling. The output masks are resized w.r.t the sampled RoIs.
+        where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >=
+        N, is the superset of candidate_gt_boxes.
+      gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
+        mask_width] containing all the groundtruth masks which sample masks are
+        drawn from. after sampling. The output masks are resized w.r.t the
+        sampled RoIs.

     Returns:
-      foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
-        that corresponds to the sampled foreground masks, where
+      foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
+        RoI that corresponds to the sampled foreground masks, where
         K = num_mask_samples_per_image.
-      foreground_classes: a tensor of shape of [batch_size, K] storing the
+      foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
         classes corresponding to the sampled foreground masks.
-      cropoped_foreground_masks: a tensor of shape of
+      cropoped_foreground_masks: A `tf.Tensor` of shape of
         [batch_size, K, mask_target_size, mask_target_size] storing the
         cropped foreground masks used for training.
     """
official/vision/beta/modeling/layers/nn_blocks.py (view file @ 5b952c08)

@@ -73,33 +73,33 @@ class ResidualBlock(tf.keras.layers.Layer):
                norm_momentum=0.99,
                norm_epsilon=0.001,
                **kwargs):
-    """A residual block with BN after convolutions.
+    """Initializes a residual block with BN after convolutions.

     Args:
-      filters: `int` number of filters for the first two convolutions. Note that
-        the third and final convolution will use 4 times as many filters.
-      strides: `int` block stride. If greater than 1, this block will ultimately
-        downsample the input.
-      use_projection: `bool` for whether this block should use a projection
+      filters: An `int` number of filters for the first two convolutions. Note
+        that the third and final convolution will use 4 times as many filters.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      use_projection: A `bool` for whether this block should use a projection
         shortcut (versus the default identity shortcut). This is usually `True`
         for the first block of a block group, which may change the number of
         filters and the resolution.
-      se_ratio: `float` or None. Ratio of the Squeeze-and-Excitation layer.
-      resnetd_shortcut: `bool` if True, apply the resnetd style modification to
-        the shortcut connection. Not implemented in residual blocks.
-      stochastic_depth_drop_rate: `float` or None. if not None, drop rate for
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+      resnetd_shortcut: A `bool` if True, apply the resnetd style modification
+        to the shortcut connection. Not implemented in residual blocks.
+      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
        the stochastic depth layer.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
-        Default to None.
-      activation: `str` name of the activation function.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      **kwargs: keyword arguments to be passed.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(ResidualBlock, self).__init__(**kwargs)
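A small sketch of instantiating the `ResidualBlock` whose docstring is reworded above (shapes and argument values are illustrative; with matching input channels and stride 1, no projection shortcut is needed):

import tensorflow as tf
from official.vision.beta.modeling.layers import nn_blocks

block = nn_blocks.ResidualBlock(filters=64, strides=1, use_projection=False)

features = tf.random.normal([1, 56, 56, 64])
outputs = block(features)
print(outputs.shape)  # Expected to preserve the input shape: (1, 56, 56, 64).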
@@ -250,34 +250,34 @@ class BottleneckBlock(tf.keras.layers.Layer):
                norm_momentum=0.99,
                norm_epsilon=0.001,
                **kwargs):
-    """A standard bottleneck block with BN after convolutions.
+    """Initializes a standard bottleneck block with BN after convolutions.

     Args:
-      filters: `int` number of filters for the first two convolutions. Note that
-        the third and final convolution will use 4 times as many filters.
-      strides: `int` block stride. If greater than 1, this block will ultimately
-        downsample the input.
-      dilation_rate: `int` dilation_rate of convolutions. Default to 1.
-      use_projection: `bool` for whether this block should use a projection
+      filters: An `int` number of filters for the first two convolutions. Note
+        that the third and final convolution will use 4 times as many filters.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      dilation_rate: An `int` dilation_rate of convolutions. Default to 1.
+      use_projection: A `bool` for whether this block should use a projection
         shortcut (versus the default identity shortcut). This is usually `True`
         for the first block of a block group, which may change the number of
         filters and the resolution.
-      se_ratio: `float` or None. Ratio of the Squeeze-and-Excitation layer.
-      resnetd_shortcut: `bool` if True, apply the resnetd style modification to
-        the shortcut connection.
-      stochastic_depth_drop_rate: `float` or None. if not None, drop rate for
+      se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
+      resnetd_shortcut: A `bool`. If True, apply the resnetd style modification
+        to the shortcut connection.
+      stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
        the stochastic depth layer.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
-        Default to None.
-      activation: `str` name of the activation function.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      **kwargs: keyword arguments to be passed.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
+        Default to None.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(BottleneckBlock, self).__init__(**kwargs)

@@ -472,47 +472,48 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
                norm_momentum=0.99,
                norm_epsilon=0.001,
                **kwargs):
-    """An inverted bottleneck block with BN after convolutions.
+    """Initializes an inverted bottleneck block with BN after convolutions.

     Args:
-      in_filters: `int` number of filters of the input tensor.
-      out_filters: `int` number of filters of the output tensor.
-      expand_ratio: `int` expand_ratio for an inverted bottleneck block.
-      strides: `int` block stride. If greater than 1, this block will ultimately
-        downsample the input.
-      kernel_size: `int` kernel_size of the depthwise conv layer.
-      se_ratio: `float` or None. If not None, se ratio for the squeeze and
+      in_filters: An `int` number of filters of the input tensor.
+      out_filters: An `int` number of filters of the output tensor.
+      expand_ratio: An `int` of expand_ratio for an inverted bottleneck block.
+      strides: An `int` block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      kernel_size: An `int` kernel_size of the depthwise conv layer.
+      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
         excitation layer.
-      stochastic_depth_drop_rate: `float` or None. if not None, drop rate for
+      stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
        the stochastic depth layer.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
         Default to None.
-      activation: `str` name of the activation function.
-      se_inner_activation: Squeeze excitation inner activation.
-      se_gating_activation: Squeeze excitation gating activation.
-      expand_se_in_filters: Whether or not to expand in_filter in squeeze and
-        excitation layer.
-      depthwise_activation: `str` name of the activation function for depthwise
-        only.
-      use_sync_bn: if True, use synchronized batch normalization.
-      dilation_rate: `int` an integer specifying the dilation rate to use for.
-      divisible_by: `int` ensures all inner dimensions are divisible by this
-        number.
-        dilated convolution. Can be a single integer to specify the same value for
-        all spatial dimensions.
-      regularize_depthwise: `bool` whether or not apply regularization on
+      activation: A `str` name of the activation function.
+      se_inner_activation: A `str` name of squeeze-excitation inner activation.
+      se_gating_activation: A `str` name of squeeze-excitation gating
+        activation.
+      expand_se_in_filters: A `bool` of whether or not to expand in_filter in
+        squeeze and excitation layer.
+      depthwise_activation: A `str` name of the activation function for
+        depthwise only.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      dilation_rate: An `int` that specifies the dilation rate to use for.
+      divisible_by: An `int` that ensures all inner dimensions are divisible by
+        this number.
+        dilated convolution: An `int` to specify the same value for all spatial
+        dimensions.
+      regularize_depthwise: A `bool` of whether or not apply regularization on
         depthwise.
-      use_depthwise: `bool` whether to uses fused convolutions instead of
+      use_depthwise: A `bool` of whether to uses fused convolutions instead of
         depthwise.
-      use_residual: `bool`whether to include residual connection between input
-        and output.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      **kwargs: keyword arguments to be passed.
+      use_residual: A `bool` of whether to include residual connection between
+        input and output.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(InvertedBottleneckBlock, self).__init__(**kwargs)

@@ -702,10 +703,12 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class ResidualInner(tf.keras.layers.Layer):
-  """Single inner block of a residual.
+  """Creates a single inner block of a residual.

   This corresponds to `F`/`G` functions in the RevNet paper:
-  https://arxiv.org/pdf/1707.04585.pdf
+  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+  The Reversible Residual Network: Backpropagation Without Storing Activations.
+  (https://arxiv.org/pdf/1707.04585.pdf)
   """

   def __init__(

@@ -721,22 +724,21 @@ class ResidualInner(tf.keras.layers.Layer):
       norm_epsilon: float = 0.001,
       batch_norm_first: bool = True,
       **kwargs):
-    """ResidualInner Initialization.
+    """Initializes a ResidualInner.

     Args:
-      filters: `int` output filter size.
-      strides: `int` stride size for convolution for the residual block.
-      kernel_initializer: `str` or `tf.keras.initializers.Initializer` instance
-        for convolutional layers.
-      kernel_regularizer: `tf.keras.regularizers.Regularizer` for Conv2D.
-      activation: `str` or `callable` instance of the activation function.
-      use_sync_bn: `bool` if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      batch_norm_first: `bool` whether to apply activation and batch norm
+      filters: An `int` of output filter size.
+      strides: An `int` of stride size for convolution for the residual block.
+      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
+        instance for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: A `str` or `callable` instance of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      batch_norm_first: A `bool` of whether to apply activation and batch norm
         before conv.
-      **kwargs: additional keyword arguments to be passed.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(ResidualInner, self).__init__(**kwargs)

@@ -824,10 +826,12 @@ class ResidualInner(tf.keras.layers.Layer):
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class BottleneckResidualInner(tf.keras.layers.Layer):
-  """Single inner block of a bottleneck residual.
+  """Creates a single inner block of a bottleneck.

   This corresponds to `F`/`G` functions in the RevNet paper:
-  https://arxiv.org/pdf/1707.04585.pdf
+  Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+  The Reversible Residual Network: Backpropagation Without Storing Activations.
+  (https://arxiv.org/pdf/1707.04585.pdf)
   """

   def __init__(

@@ -843,24 +847,23 @@ class BottleneckResidualInner(tf.keras.layers.Layer):
       norm_epsilon: float = 0.001,
       batch_norm_first: bool = True,
       **kwargs):
-    """BottleneckResidualInner Initialization.
+    """Initializes a BottleneckResidualInner.

     Args:
-      filters: `int` number of filters for first 2 convolutions. Last
-        Last, and thus the number of output channels from the bottlneck
-        block is `4*filters`
-      strides: `int` stride size for convolution for the residual block.
-      kernel_initializer: `str` or `tf.keras.initializers.Initializer` instance
-        for convolutional layers.
-      kernel_regularizer: `tf.keras.regularizers.Regularizer` for Conv2D.
-      activation: `str` or `callable` instance of the activation function.
-      use_sync_bn: `bool` if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      batch_norm_first: `bool` whether to apply activation and batch norm
+      filters: An `int` number of filters for first 2 convolutions. Last Last,
+        and thus the number of output channels from the bottlneck block is
+        `4*filters`
+      strides: An `int` of stride size for convolution for the residual block.
+      kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
+        instance for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: A `str` or `callable` instance of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      batch_norm_first: A `bool` of whether to apply activation and batch norm
        before conv.
-      **kwargs: additional keyword arguments to be passed.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(BottleneckResidualInner, self).__init__(**kwargs)

@@ -962,7 +965,7 @@ class BottleneckResidualInner(tf.keras.layers.Layer):
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class ReversibleLayer(tf.keras.layers.Layer):
-  """A reversible layer.
+  """Creates a reversible layer.

   Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary
   layers that are stateless, which in this case are `ResidualInner` layers.

@@ -973,20 +976,21 @@ class ReversibleLayer(tf.keras.layers.Layer):
       g: tf.keras.layers.Layer,
       manual_grads: bool = True,
       **kwargs):
-    """ReversibleLayer Initialization.
+    """Initializes a ReversibleLayer.

     Args:
-      f: `tf.keras.layers.Layer` f inner block referred to in paper. Each
-        reversible layer consists of two inner functions. For example, in RevNet
-        the reversible residual consists of two f/g inner (bottleneck) residual
-        functions. Where the input to the reversible layer is x, the input gets
-        partitioned in the channel dimension and the forward pass follows (eq8):
-        x = [x1; x2], z1 = x1 + f(x2), y2 = x2 + g(z1), y1 = stop_gradient(z1).
-      g: `tf.keras.layers.Layer` g inner block referred to in paper. Detailed
-        explanation same as above as `f` arg.
-      manual_grads: `bool` [Testing Only] whether to manually take gradients
-        as in Algorithm 1 or defer to autograd.
-      **kwargs: additional keyword arguments to be passed.
+      f: A `tf.keras.layers.Layer` instance of `f` inner block referred to in
+        paper. Each reversible layer consists of two inner functions. For
+        example, in RevNet the reversible residual consists of two f/g inner
+        (bottleneck) residual functions. Where the input to the reversible layer
+        is x, the input gets partitioned in the channel dimension and the
+        forward pass follows (eq8): x = [x1; x2], z1 = x1 + f(x2), y2 = x2 +
+        g(z1), y1 = stop_gradient(z1).
+      g: A `tf.keras.layers.Layer` instance of `g` inner block referred to in
+        paper. Detailed explanation same as above as `f` arg.
+      manual_grads: A `bool` [Testing Only] of whether to manually take
+        gradients as in Algorithm 1 or defer to autograd.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(ReversibleLayer, self).__init__(**kwargs)

@@ -1030,16 +1034,19 @@ class ReversibleLayer(tf.keras.layers.Layer):
       x: tf.Tensor
   ) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor], List[tf.Tensor]]]]:
-    """Implements Algorithm 1 in RevNet paper.
+    """Implements Algorithm 1 in the RevNet paper.

-    Paper: https://arxiv.org/pdf/1707.04585.pdf
+    Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
+    The Reversible Residual Network: Backpropagation Without Storing
+    Activations.
+    (https://arxiv.org/pdf/1707.04585.pdf)

     Args:
-      x: input tensor.
+      x: An input `tf.Tensor.

     Returns:
-      y: the output [y1; y2] in algorithm 1.
-      grad_fn: callable function that computes the gradients.
+      y: The output [y1; y2] in Algorithm 1.
+      grad_fn: A callable function that computes the gradients.
     """
     with tf.GradientTape() as fwdtape:
       fwdtape.watch(x)
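A rough sketch of wiring two `ResidualInner` blocks into a `ReversibleLayer` as the `f`/`g` functions described above (channel sizes are illustrative assumptions: the reversible layer partitions the input channels in half, so each inner block maps half of the channels to half of the channels; exact call behavior should be checked against the layer definitions):

import tensorflow as tf
from official.vision.beta.modeling.layers import nn_blocks

f = nn_blocks.ResidualInner(filters=32, strides=1)
g = nn_blocks.ResidualInner(filters=32, strides=1)
reversible = nn_blocks.ReversibleLayer(f, g)

x = tf.random.normal([2, 16, 16, 64])
y = reversible(x)
print(y.shape)  # Expected to match the input shape: (2, 16, 16, 64).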
@@ -1135,7 +1142,7 @@ class ReversibleLayer(tf.keras.layers.Layer):
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
-  """An depthwise separable convolution block with batch normalization."""
+  """Creates an depthwise separable convolution block with batch normalization."""

   def __init__(
       self,

@@ -1151,29 +1158,29 @@ class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
       norm_momentum: float = 0.99,
       norm_epsilon: float = 0.001,
       **kwargs):
-    """An convolution block with batch normalization.
+    """Initializes a convolution block with batch normalization.

     Args:
-      filters: `int` number of filters for the first two convolutions. Note that
-        the third and final convolution will use 4 times as many filters.
-      kernel_size: `int` an integer specifying the height and width of the 2D
-        convolution window.
-      strides: `int` block stride. If greater than 1, this block will ultimately
-        downsample the input.
-      regularize_depthwise: if Ture, apply regularization on depthwise.
-      activation: `str` name of the activation function.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-        Default to None.
-      dilation_rate: an integer or tuple/list of 2 integers, specifying the
-        dilation rate to use for dilated convolution. Can be a single integer to
-        specify the same value for all spatial dimensions.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      **kwargs: keyword arguments to be passed.
+      filters: An `int` number of filters for the first two convolutions. Note
+        that the third and final convolution will use 4 times as many filters.
+      kernel_size: An `int` that specifies the height and width of the 2D
+        convolution window.
+      strides: An `int` of block stride. If greater than 1, this block will
+        ultimately downsample the input.
+      regularize_depthwise: A `bool`. If Ture, apply regularization on
+        depthwise.
+      activation: A `str` name of the activation function.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      dilation_rate: An `int` or tuple/list of 2 `int`, specifying the dilation
+        rate to use for dilated convolution. Can be a single integer to specify
+        the same value for all spatial dimensions.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
     self._filters = filters
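An illustrative sketch of the `DepthwiseSeparableConvBlock` documented above (argument values are assumptions for demonstration only):

import tensorflow as tf
from official.vision.beta.modeling.layers import nn_blocks

block = nn_blocks.DepthwiseSeparableConvBlock(filters=128, kernel_size=3, strides=2)

features = tf.random.normal([1, 32, 32, 64])
outputs = block(features)
print(outputs.shape)  # Expected (1, 16, 16, 128) with stride 2 and 'same' padding.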
official/vision/beta/modeling/layers/nn_blocks_3d.py (view file @ 5b952c08)

@@ -21,14 +21,21 @@ from official.modeling import tf_utils
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class SelfGating(tf.keras.layers.Layer):
-  """Feature gating as used in S3D-G (https://arxiv.org/pdf/1712.04851.pdf)."""
+  """Feature gating as used in S3D-G.
+
+  This implements the S3D-G network from:
+  Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, Kevin Murphy.
+  Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video
+  Classification.
+  (https://arxiv.org/pdf/1712.04851.pdf)
+  """

   def __init__(self, filters, **kwargs):
-    """Constructor.
+    """Initializes a self-gating layer.

     Args:
-      filters: `int` number of filters for the convolutional layer.
-      **kwargs: keyword arguments to be passed.
+      filters: An `int` number of filters for the convolutional layer.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(SelfGating, self).__init__(**kwargs)
     self._filters = filters
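A minimal sketch of applying the `SelfGating` layer documented above to a 5D video feature map (shapes are illustrative; `filters` is set to the input channel count):

import tensorflow as tf
from official.vision.beta.modeling.layers import nn_blocks_3d

gating = nn_blocks_3d.SelfGating(filters=64)

# Video features of shape [batch, time, height, width, channels].
features = tf.random.normal([2, 8, 14, 14, 64])
gated = gating(features)
print(gated.shape)  # Expected to match the input shape: (2, 8, 14, 14, 64).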
@@ -61,7 +68,7 @@ class SelfGating(tf.keras.layers.Layer):
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class BottleneckBlock3D(tf.keras.layers.Layer):
-  """A 3D bottleneck block."""
+  """Creates a 3D bottleneck block."""

   def __init__(self,
                filters,

@@ -77,28 +84,29 @@ class BottleneckBlock3D(tf.keras.layers.Layer):
                norm_momentum=0.99,
                norm_epsilon=0.001,
                **kwargs):
-    """A 3D bottleneck block with BN after convolutions.
+    """Initializes a 3D bottleneck block with BN after convolutions.

     Args:
-      filters: `int` number of filters for the first two convolutions. Note that
-        the third and final convolution will use 4 times as many filters.
-      temporal_kernel_size: `int` kernel size for the temporal convolutional
-        layer.
-      temporal_strides: `int` temporal stride for the temporal convolutional
-        layer.
-      spatial_strides: `int` spatial stride for the spatial convolutional layer.
-      use_self_gating: `bool` apply self-gating module or not.
-      kernel_initializer: kernel_initializer for convolutional layers.
-      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-        Default to None.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+      filters: An `int` number of filters for the first two convolutions. Note
+        that the third and final convolution will use 4 times as many filters.
+      temporal_kernel_size: An `int` of kernel size for the temporal
+        convolutional layer.
+      temporal_strides: An `int` of ftemporal stride for the temporal
+        convolutional layer.
+      spatial_strides: An `int` of spatial stride for the spatial convolutional
+        layer.
+      use_self_gating: A `bool` of whether to apply self-gating module or not.
+      kernel_initializer: A `str` of kernel_initializer for convolutional
+        layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
         Default to None.
-      activation: `str` name of the activation function.
-      use_sync_bn: if True, use synchronized batch normalization.
-      norm_momentum: `float` normalization omentum for the moving average.
-      norm_epsilon: `float` small float added to variance to avoid dividing by
-        zero.
-      **kwargs: keyword arguments to be passed.
+      activation: A `str` name of the activation function.
+      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      **kwargs: Additional keyword arguments to be passed.
     """
     super(BottleneckBlock3D, self).__init__(**kwargs)
official/vision/beta/modeling/layers/nn_layers.py
View file @
5b952c08
...
...
@@ -22,7 +22,7 @@ import tensorflow as tf
from official.modeling import tf_utils

# Type annotations.
States = Dict[str, tf.Tensor]
Activation = Union[str, Callable]
...
...
@@ -34,12 +34,12 @@ def make_divisible(value: float,
"""This is to ensure that all layers have channels that are divisible by 8.
Args:
value: `float` original value.
divisor: `int` the divisor that need to be checked upon.
min_value: `float` minimum value threshold.
value:
A
`float`
of
original value.
divisor:
An
`int`
off
the divisor that need to be checked upon.
min_value:
A
`float`
of
minimum value threshold.
Returns:
The adjusted value in `int` that divisible against divisor.
The adjusted value in `int` that
is
divisible against divisor.
"""
if
min_value
is
None
:
min_value
=
divisor
...
...
@@ -55,7 +55,7 @@ def round_filters(filters: int,
                  divisor: int = 8,
                  min_depth: Optional[int] = None,
                  skip: bool = False):
  """Rounds number of filters based on width multiplier."""
  orig_f = filters
  if skip or not multiplier:
    return filters
...
...
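As a concrete illustration of the rounding rule the two helpers above describe, here is a hedged, self-contained sketch. The function bodies are elided in this diff, so the 10%-guard detail and the helper names are assumptions about the usual width-multiplier recipe, not the library's verbatim code.

def make_divisible_sketch(value: float, divisor: int = 8, min_value=None) -> int:
  """Rounds `value` to the nearest multiple of `divisor`, never dropping >10%."""
  if min_value is None:
    min_value = divisor
  new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
  # Guard against rounding down by more than 10%.
  if new_value < 0.9 * value:
    new_value += divisor
  return new_value

def round_filters_sketch(filters: int, multiplier: float, divisor: int = 8) -> int:
  """Scales `filters` by `multiplier` and rounds to a divisible channel count."""
  if not multiplier:
    return filters
  return make_divisible_sketch(filters * multiplier, divisor)

print(round_filters_sketch(32, multiplier=1.25))  # 40
print(make_divisible_sketch(30))                  # 32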
@@ -70,7 +70,7 @@ def round_filters(filters: int,
@tf.keras.utils.register_keras_serializable(package='Vision')
class SqueezeExcitation(tf.keras.layers.Layer):
  """Creates a squeeze and excitation layer."""

  def __init__(self,
               in_filters,
...
...
@@ -84,25 +84,26 @@ class SqueezeExcitation(tf.keras.layers.Layer):
               activation='relu',
               gating_activation='sigmoid',
               **kwargs):
    """Initializes a squeeze and excitation layer.

    Args:
      in_filters: An `int` number of filters of the input tensor.
      out_filters: An `int` number of filters of the output tensor.
      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
        excitation layer.
      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
      use_3d_input: A `bool` of whether input is 2D or 3D image.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      activation: A `str` name of the activation function.
      gating_activation: A `str` name of the activation function for final
        gating function.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(SqueezeExcitation, self).__init__(**kwargs)
...
...
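A hedged sketch of the squeeze-and-excitation computation the layer above wraps: global pooling, a bottleneck projection, then a sigmoid gate. The function below is illustrative only; the real layer also supports 3D inputs, custom gating activations, and regularizers.

import tensorflow as tf

def squeeze_excite_sketch(inputs, in_filters, out_filters, se_ratio=0.25):
  """inputs: [batch, height, width, in_filters], with in_filters == out_filters."""
  se_filters = max(1, int(in_filters * se_ratio))
  squeezed = tf.reduce_mean(inputs, axis=[1, 2], keepdims=True)        # squeeze
  excited = tf.keras.layers.Conv2D(se_filters, 1, activation='relu')(squeezed)
  gate = tf.keras.layers.Conv2D(out_filters, 1, activation='sigmoid')(excited)
  return inputs * gate                                                 # gate

images = tf.random.normal([2, 32, 32, 64])
gated = squeeze_excite_sketch(images, in_filters=64, out_filters=64)   # [2, 32, 32, 64]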
@@ -183,9 +184,9 @@ def get_stochastic_depth_rate(init_rate, i, n):
"""Get drop connect rate for the ith block.
Args:
init_rate: `float` initial drop rate.
i: `int` order of the current block.
n: `int` total number of blocks.
init_rate:
A
`float`
of
initial drop rate.
i:
An
`int`
of
order of the current block.
n:
An
`int` total number of blocks.
Returns:
Drop rate of the ith block.
...
...
@@ -201,17 +202,17 @@ def get_stochastic_depth_rate(init_rate, i, n):
@tf.keras.utils.register_keras_serializable(package='Vision')
class StochasticDepth(tf.keras.layers.Layer):
  """Creates a stochastic depth layer."""

  def __init__(self, stochastic_depth_drop_rate, **kwargs):
    """Initializes a stochastic depth layer.

    Args:
      stochastic_depth_drop_rate: A `float` of drop rate.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      An output `tf.Tensor`, which should have the same shape as the input.
    """
    super(StochasticDepth, self).__init__(**kwargs)
    self._drop_rate = stochastic_depth_drop_rate
...
...
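The training-time behaviour of stochastic depth can be sketched as below. This is an assumption-labelled illustration of the usual drop-path rule (one Bernoulli draw per example, rescaled by the keep probability), not the library code; the per-block rate returned by get_stochastic_depth_rate typically grows linearly with block index.

import tensorflow as tf

def stochastic_depth_sketch(residual: tf.Tensor, drop_rate: float,
                            training: bool) -> tf.Tensor:
  """Randomly zeroes the residual branch per example during training."""
  if not training or drop_rate == 0.0:
    return residual
  keep_prob = 1.0 - drop_rate
  batch_size = tf.shape(residual)[0]
  # One Bernoulli draw per example, broadcast across the remaining dimensions.
  random_tensor = keep_prob + tf.random.uniform(
      [batch_size] + [1] * (residual.shape.rank - 1), dtype=residual.dtype)
  binary_mask = tf.floor(random_tensor)
  return residual / keep_prob * binary_mask

x = tf.random.normal([4, 8, 8, 32])
residual = tf.random.normal([4, 8, 8, 32])
out = x + stochastic_depth_sketch(residual, drop_rate=0.1, training=True)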
@@ -239,15 +240,15 @@ class StochasticDepth(tf.keras.layers.Layer):
@tf.keras.utils.register_keras_serializable(package='Vision')
def pyramid_feature_fusion(inputs, target_level):
  """Fuses all feature maps in the feature pyramid at the target level.

  Args:
    inputs: A dictionary containing the feature pyramid. The size of the input
      tensor needs to be fixed.
    target_level: An `int` of the target feature level for feature fusion.

  Returns:
    A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
      feature_channel].
  """
# Convert keys to int.
...
...
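A hedged sketch of what pyramid feature fusion does: resize every level to the spatial size of the target level and sum. The real function's body is elided in this diff, so the resizing method and key handling below are assumptions for illustration only.

import tensorflow as tf

def pyramid_feature_fusion_sketch(inputs, target_level: int) -> tf.Tensor:
  levels = {int(k): v for k, v in inputs.items()}
  target_size = tf.shape(levels[target_level])[1:3]
  resized = [
      tf.image.resize(feat, target_size, method='nearest')
      for feat in levels.values()
  ]
  return tf.add_n(resized)

pyramid = {
    '3': tf.random.normal([2, 64, 64, 16]),
    '4': tf.random.normal([2, 32, 32, 16]),
    '5': tf.random.normal([2, 16, 16, 16]),
}
fused = pyramid_feature_fusion_sketch(pyramid, target_level=3)  # [2, 64, 64, 16]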
@@ -279,8 +280,13 @@ def pyramid_feature_fusion(inputs, target_level):
class Scale(tf.keras.layers.Layer):
  """Scales the input by a trainable scalar weight.

  This is useful for applying ReZero to layers, which improves convergence
  speed. This implements the paper:
  Thomas Bachlechner, Bodhisattwa Prasad Majumder, Huanru Henry Mao,
  Garrison W. Cottrell, Julian McAuley.
  ReZero is All You Need: Fast Convergence at Large Depth.
  (https://arxiv.org/pdf/2003.04887.pdf).
  """

  def __init__(
...
...
@@ -288,15 +294,15 @@ class Scale(tf.keras.layers.Layer):
      initializer: tf.keras.initializers.Initializer = 'ones',
      regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes a scale layer.

    Args:
      initializer: A `str` of initializer for the scalar weight.
      regularizer: A `tf.keras.regularizers.Regularizer` for the scalar weight.
      **kwargs: Additional keyword arguments to be passed to this layer.

    Returns:
      An output `tf.Tensor`, which should have the same shape as the input.
    """
    super(Scale, self).__init__(**kwargs)
...
...
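The ReZero pattern enabled by Scale can be sketched as below. `ScaleSketch` mirrors the documented constructor arguments but is a hypothetical stand-in, and it defaults the scalar to zero (the ReZero choice) rather than the layer's documented 'ones' default.

import tensorflow as tf

class ScaleSketch(tf.keras.layers.Layer):
  """Multiplies its input by a single trainable scalar."""

  def __init__(self, initializer='zeros', regularizer=None, **kwargs):
    super().__init__(**kwargs)
    self._initializer = initializer
    self._regularizer = regularizer

  def build(self, input_shape):
    self.scale = self.add_weight(
        name='scale', shape=[], initializer=self._initializer,
        regularizer=self._regularizer, trainable=True)

  def call(self, inputs):
    return tf.cast(self.scale, inputs.dtype) * inputs

x = tf.random.normal([2, 8, 8, 16])
residual = tf.keras.layers.Conv2D(16, 3, padding='same')(x)
y = x + ScaleSketch()(residual)  # Identity mapping at initialization (scale == 0).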
@@ -328,11 +334,15 @@ class Scale(tf.keras.layers.Layer):
@tf.keras.utils.register_keras_serializable(package='Vision')
class TemporalSoftmaxPool(tf.keras.layers.Layer):
  """Creates a network layer corresponding to temporal softmax pooling.

  This is useful for multi-class logits (used in e.g., Charades). Modified from
  AssembleNet Charades evaluation from:
  Michael S. Ryoo, AJ Piergiovanni, Mingxing Tan, Anelia Angelova.
  AssembleNet: Searching for Multi-Stream Neural Connectivity in Video
  Architectures.
  (https://arxiv.org/pdf/1905.13209.pdf).
  """

  def call(self, inputs):
...
...
@@ -347,13 +357,16 @@ class TemporalSoftmaxPool(tf.keras.layers.Layer):
@tf.keras.utils.register_keras_serializable(package='Vision')
class PositionalEncoding(tf.keras.layers.Layer):
  """Creates a network layer that adds a sinusoidal positional encoding.

  Positional encoding is incremented across frames, and is added to the input.
  The positional encoding is first weighted at 0 so that the network can choose
  to ignore it.

  This implements:
  Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
  Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
  Attention Is All You Need.
  (https://arxiv.org/pdf/1706.03762.pdf).
  """

  def __init__(self,
...
...
@@ -363,15 +376,15 @@ class PositionalEncoding(tf.keras.layers.Layer):
"""Initializes positional encoding.
Args:
initializer: initializer for weighting the positional encoding.
cache_encoding:
i
f True, cache the positional encoding tensor
after
calling build. Otherwise, rebuild the tensor for every call.
Setting
this to False can be useful when we want to input a variable
number of
frames, so the positional encoding tensor can change shape.
**kwargs: keyword arguments to be passed to this layer.
initializer:
A `str` of
initializer for weighting the positional encoding.
cache_encoding:
A `bool`. I
f True, cache the positional encoding tensor
after
calling build. Otherwise, rebuild the tensor for every call.
Setting
this to False can be useful when we want to input a variable
number of
frames, so the positional encoding tensor can change shape.
**kwargs:
Additional
keyword arguments to be passed to this layer.
Returns:
A
n output t
ensor
,
which should have the same shape as input.
A
`tf.T
ensor
` of
which should have the same shape as input.
"""
super
(
PositionalEncoding
,
self
).
__init__
(
**
kwargs
)
self
.
_initializer
=
initializer
...
...
@@ -395,9 +408,9 @@ class PositionalEncoding(tf.keras.layers.Layer):
"""Creates a sequence of sinusoidal positional encoding vectors.
Args:
num_positions:
the
number of positions (frames).
hidden_size:
the
number of channels used for the hidden vectors.
dtype:
t
he dtype of the output tensor.
num_positions:
An `int` of
number of positions (frames).
hidden_size:
An `int` of
number of channels used for the hidden vectors.
dtype:
T
he dtype of the output tensor.
Returns:
The positional encoding tensor with shape [num_positions, hidden_size].
...
...
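The helper above produces a [num_positions, hidden_size] table; a hedged sketch of the standard sinusoidal formula it refers to follows. This reproduces the usual Transformer encoding, not necessarily the exact private implementation, which is elided in this diff.

import numpy as np
import tensorflow as tf

def sinusoidal_encoding_sketch(num_positions: int, hidden_size: int,
                               dtype=tf.float32) -> tf.Tensor:
  positions = np.arange(num_positions)[:, np.newaxis]               # [P, 1]
  dims = np.arange(hidden_size)[np.newaxis, :]                      # [1, H]
  angle_rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / hidden_size)
  angles = positions * angle_rates                                  # [P, H]
  # Even channels get sine, odd channels get cosine.
  encoding = np.where(dims % 2 == 0, np.sin(angles), np.cos(angles))
  return tf.constant(encoding, dtype=dtype)

encoding = sinusoidal_encoding_sketch(num_positions=16, hidden_size=64)
print(encoding.shape)  # (16, 64)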
@@ -430,10 +443,10 @@ class PositionalEncoding(tf.keras.layers.Layer):
"""Builds the layer with the given input shape.
Args:
input_shape:
t
he input shape.
input_shape:
T
he input shape.
Raises:
ValueError:
i
f using 'channels_first' data format.
ValueError:
I
f using 'channels_first' data format.
"""
if
tf
.
keras
.
backend
.
image_data_format
()
==
'channels_first'
:
raise
ValueError
(
'"channels_first" mode is unsupported.'
)
...
...
@@ -457,7 +470,7 @@ class PositionalEncoding(tf.keras.layers.Layer):
@tf.keras.utils.register_keras_serializable(package='Vision')
class GlobalAveragePool3D(tf.keras.layers.Layer):
  """Creates a global average pooling layer with causal mode.

  Implements causal mode, which runs a cumulative sum (with `tf.cumsum`) across
  frames in the time dimension, allowing the use of a stream buffer. Sums any
...
...
@@ -469,15 +482,16 @@ class GlobalAveragePool3D(tf.keras.layers.Layer):
               keepdims: bool = False,
               causal: bool = False,
               **kwargs):
    """Initializes a global average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      causal: A `bool` of whether to run in causal mode with a cumulative sum
        across frames.
      **kwargs: Additional keyword arguments to be passed to this layer.

    Returns:
      An output `tf.Tensor`.
    """
    super(GlobalAveragePool3D, self).__init__(**kwargs)
...
...
@@ -514,14 +528,14 @@ class GlobalAveragePool3D(tf.keras.layers.Layer):
"""Calls the layer with the given inputs.
Args:
inputs:
the
input
t
ensor.
states:
a
dict of states such that, if any of the keys match for this
layer, will overwrite the contents of the buffer(s).
output_states:
i
f True, returns the output tensor and output
states.
Returns just the output tensor otherwise.
inputs:
An
input
`tf.T
ensor
`
.
states:
A `
dict
`
of states such that, if any of the keys match for this
layer, will overwrite the contents of the buffer(s).
output_states:
A `bool`. I
f True, returns the output tensor and output
states.
Returns just the output tensor otherwise.
Returns:
the
output
t
ensor (and optionally the states if `output_states=True`).
An
output
`tf.T
ensor
`
(and optionally the states if `output_states=True`).
If `causal=True`, the output tensor will have shape
`[batch_size, num_frames, 1, 1, channels]` if `keepdims=True`. We keep
the frame dimension in this case to simulate a cumulative global average
...
...
@@ -531,7 +545,7 @@ class GlobalAveragePool3D(tf.keras.layers.Layer):
      buffer stored in `states`).

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    states = dict(states) if states is not None else {}
...
...
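The causal mode described above can be sketched as a cumulative spatial sum divided by the number of frames (and pixels) seen so far, so frame t only pools over frames [0, t]. This standalone function is illustrative; the real layer additionally manages stream-buffer states and `keepdims`.

import tensorflow as tf

def causal_global_average_sketch(video: tf.Tensor) -> tf.Tensor:
  """video: [batch, frames, height, width, channels] -> [batch, frames, 1, 1, channels]."""
  spatial_sum = tf.reduce_sum(video, axis=[2, 3], keepdims=True)
  cumulative = tf.cumsum(spatial_sum, axis=1)
  num_frames = tf.cast(tf.range(1, tf.shape(video)[1] + 1), video.dtype)
  pixels = tf.cast(tf.shape(video)[2] * tf.shape(video)[3], video.dtype)
  counts = tf.reshape(num_frames * pixels, [1, -1, 1, 1, 1])
  return cumulative / counts

clip = tf.random.normal([2, 8, 16, 16, 32])
pooled = causal_global_average_sketch(clip)  # [2, 8, 1, 1, 32]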
@@ -592,18 +606,17 @@ class GlobalAveragePool3D(tf.keras.layers.Layer):
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialAveragePool3D(tf.keras.layers.Layer):
  """Creates a global average pooling layer pooling across spatial dimensions."""

  def __init__(self, keepdims: bool = False, **kwargs):
    """Initializes a global average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      **kwargs: Additional keyword arguments to be passed to this layer.

    Returns:
      An output `tf.Tensor`.
    """
    super(SpatialAveragePool3D, self).__init__(**kwargs)
    self._keepdims = keepdims
...
...
@@ -650,10 +663,10 @@ class CausalConvMixin:
"""Calculates padding for 'causal' option for conv layers.
Args:
inputs: optional input
t
ensor to be padded.
use_buffered_input:
i
f True, use 'valid' padding along the time
dimension.
This should be set when applying the stream buffer.
time_axis: the axis of the time dimension
inputs:
An
optional input
`tf.T
ensor
`
to be padded.
use_buffered_input:
A `bool`. I
f True, use 'valid' padding along the time
dimension.
This should be set when applying the stream buffer.
time_axis:
An `int` of
the axis of the time dimension
.
Returns:
A list of paddings for `tf.pad`.
...
...
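For reference, causal padding along the time axis places all padding before the current frame, so a temporal convolution never sees the future. A minimal sketch follows (the helper name is hypothetical; the mixin's own padding computation is elided in this diff).

import tensorflow as tf

def causal_time_padding_sketch(video: tf.Tensor, kernel_size: int,
                               time_axis: int = 1) -> tf.Tensor:
  paddings = [[0, 0]] * video.shape.rank
  paddings[time_axis] = [kernel_size - 1, 0]  # pad only on the "past" side
  return tf.pad(video, paddings)

clip = tf.random.normal([2, 8, 16, 16, 3])                  # [batch, time, h, w, c]
padded = causal_time_padding_sketch(clip, kernel_size=3)    # time dim becomes 10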
@@ -719,14 +732,14 @@ class Conv2D(tf.keras.layers.Conv2D, CausalConvMixin):
"""Initializes conv2d.
Args:
*args:
a
rguments to be passed.
use_buffered_input:
i
f True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs: keyword arguments to be passed.
*args:
A
rguments to be passed.
use_buffered_input:
A `bool`. I
f True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs:
Additional
keyword arguments to be passed.
Returns:
A output
t
ensor of the Conv2D operation.
A
n
output
`tf.T
ensor
`
of the Conv2D operation.
"""
super
(
Conv2D
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
_use_buffered_input
=
use_buffered_input
...
...
@@ -767,14 +780,14 @@ class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, CausalConvMixin):
"""Initializes depthwise conv2d.
Args:
*args:
a
rguments to be passed.
use_buffered_input:
i
f True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs: keyword arguments to be passed.
*args:
A
rguments to be passed.
use_buffered_input:
A `bool`. I
f True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs:
Additional
keyword arguments to be passed.
Returns:
A output
t
ensor of the DepthwiseConv2D operation.
A
n
output
`tf.T
ensor
`
of the DepthwiseConv2D operation.
"""
super
(
DepthwiseConv2D
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
_use_buffered_input
=
use_buffered_input
...
...
@@ -829,14 +842,14 @@ class Conv3D(tf.keras.layers.Conv3D, CausalConvMixin):
"""Initializes conv3d.
Args:
*args:
a
rguments to be passed.
use_buffered_input:
i
f True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs: keyword arguments to be passed.
*args:
A
rguments to be passed.
use_buffered_input:
A `bool`. I
f True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
**kwargs:
Additional
keyword arguments to be passed.
Returns:
A output
t
ensor of the Conv3D operation.
A
n
output
`tf.T
ensor
`
of the Conv3D operation.
"""
super
(
Conv3D
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
_use_buffered_input
=
use_buffered_input
...
...
official/vision/beta/modeling/layers/roi_aligner.py View file @ 5b952c08
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ROI align."""
"""
Contains definitions of
ROI align
er
."""
import
tensorflow
as
tf
...
...
@@ -30,9 +30,9 @@ class MultilevelROIAligner(tf.keras.layers.Layer):
"""Initializes a ROI aligner.
Args:
crop_size:
int,
the output size of the cropped features.
sample_offset: float in [0, 1]
,
the subpixel sample offset.
**kwargs:
other
key
word arguments passed to Layer.
crop_size:
An `int` of
the output size of the cropped features.
sample_offset:
A `
float
`
in [0, 1]
of
the subpixel sample offset.
**kwargs:
Additional
keyword arguments passed to Layer.
"""
self
.
_config_dict
=
{
'crop_size'
:
crop_size
,
...
...
@@ -47,13 +47,13 @@ class MultilevelROIAligner(tf.keras.layers.Layer):
      features: A dictionary with key as pyramid level and value as features.
        The features are in shape of
        [batch_size, height_l, width_l, num_filters].
      boxes: A 3-D `tf.Tensor` of shape [batch_size, num_boxes, 4]. Each row
        represents a box with [y1, x1, y2, x2] in un-normalized coordinates
        from grid point.
      training: A `bool` of whether it is in training mode.

    Returns:
      roi_features: A 5-D `tf.Tensor` representing feature crop of shape
        [batch_size, num_boxes, crop_size, crop_size, num_filters].
    """
    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
...
...
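A single-level, hedged sketch of what the aligner produces: each box is cropped from a feature map and bilinearly resized to a fixed crop size. The real layer also selects an FPN level per box and honours `sample_offset`; the helper below is an illustration built on `tf.image.crop_and_resize`, not the library routine.

import tensorflow as tf

def roi_align_sketch(features: tf.Tensor, boxes: tf.Tensor,
                     crop_size: int = 7) -> tf.Tensor:
  """features: [batch, H, W, C]; boxes: [batch, num_boxes, 4] in pixel coords."""
  batch_size = tf.shape(boxes)[0]
  num_boxes = tf.shape(boxes)[1]
  height = tf.cast(tf.shape(features)[1], boxes.dtype)
  width = tf.cast(tf.shape(features)[2], boxes.dtype)
  # tf.image.crop_and_resize wants normalized [y1, x1, y2, x2] and a flat list.
  normalized = boxes / tf.stack([height, width, height, width])
  flat_boxes = tf.reshape(normalized, [-1, 4])
  box_indices = tf.repeat(tf.range(batch_size), num_boxes)
  crops = tf.image.crop_and_resize(features, flat_boxes, box_indices,
                                   crop_size=[crop_size, crop_size])
  return tf.reshape(crops, [batch_size, num_boxes, crop_size, crop_size, -1])

feature_map = tf.random.normal([1, 64, 64, 8])
rois = tf.constant([[[10., 10., 50., 50.], [5., 20., 40., 60.]]])
crops = roi_align_sketch(feature_map, rois)  # [1, 2, 7, 7, 8]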
official/vision/beta/modeling/layers/roi_generator.py View file @ 5b952c08
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ROI generator."""
"""
Contains definitions of
ROI generator."""
# Import libraries
import
tensorflow
as
tf
...
...
@@ -48,46 +48,48 @@ def _multilevel_propose_rois(raw_boxes,
3. Apply an overall top k to generate the final selected RoIs.
  Args:
    raw_boxes: A `dict` with keys representing FPN levels and values
      representing box tensors of shape
      [batch_size, feature_h, feature_w, num_anchors * 4].
    raw_scores: A `dict` with keys representing FPN levels and values
      representing logit tensors of shape
      [batch_size, feature_h, feature_w, num_anchors].
    anchor_boxes: A `dict` with keys representing FPN levels and values
      representing anchor box tensors of shape
      [batch_size, feature_h * feature_w * num_anchors, 4].
    image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last
      dimension are [height, width] of the scaled image.
    pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
      before applying NMS. Default: 2000.
    pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal
      box score to keep before applying NMS. This is often used as a
      pre-filtering step for better performance. Default: 0, no filtering is
      applied.
    pre_nms_min_size_threshold: A `float` representing the minimal box size in
      each side (w.r.t. the scaled image) to keep before applying NMS. This is
      often used as a pre-filtering step for better performance. Default: 0, no
      filtering is applied.
    nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold
      used for NMS. If 0.0, no NMS is applied. Default: 0.7.
    num_proposals: An `int` of top scoring RPN proposals *in total* to keep
      after applying NMS. Default: 1000.
    use_batched_nms: A `bool` indicating whether NMS is applied in batch using
      `tf.image.combined_non_max_suppression`. Currently only available in
      CPU/GPU. Default is False.
    decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
      using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
      `anchor_boxes`. Default is True.
    clip_boxes: A `bool` indicating whether boxes are first clipped to the
      scaled image size before applying NMS. If False, no clipping is applied
      and `image_shape` is ignored. Default is True.
    apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
      `raw_scores` before applying NMS. Default is True.

  Returns:
    selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
      representing the box coordinates of the selected proposals w.r.t. the
      scaled image.
    selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals, 1],
      representing the scores of the selected proposals.
  """
  with tf.name_scope('multilevel_propose_rois'):
...
...
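The per-level selection described above (top-k, then NMS, then an overall top-k) can be sketched for a single image and level as follows. Anchor decoding, score and size filtering, and cross-level merging are omitted, and the helper name is hypothetical.

import tensorflow as tf

def propose_rois_sketch(boxes: tf.Tensor, scores: tf.Tensor,
                        pre_nms_top_k: int = 2000,
                        nms_iou_threshold: float = 0.7,
                        num_proposals: int = 1000):
  """boxes: [N, 4] in [y1, x1, y2, x2]; scores: [N] logits for one image."""
  scores = tf.sigmoid(scores)
  top_k = tf.minimum(pre_nms_top_k, tf.shape(scores)[0])
  top_scores, top_indices = tf.math.top_k(scores, k=top_k)
  top_boxes = tf.gather(boxes, top_indices)
  keep = tf.image.non_max_suppression(
      top_boxes, top_scores, max_output_size=num_proposals,
      iou_threshold=nms_iou_threshold)
  return tf.gather(top_boxes, keep), tf.gather(top_scores, keep)

raw = tf.random.uniform([3000, 4], 0.0, 512.0)
boxes = tf.concat([tf.minimum(raw[:, :2], raw[:, 2:]),
                   tf.maximum(raw[:, :2], raw[:, 2:])], axis=-1)
scores = tf.random.normal([3000])
rois, roi_scores = propose_rois_sketch(boxes, scores)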
@@ -196,30 +198,31 @@ class MultilevelROIGenerator(tf.keras.layers.Layer):
    The ROI generator transforms the raw predictions from RPN to ROIs.

    Args:
      pre_nms_top_k: An `int` of the number of top scores proposals to be kept
        before applying NMS.
      pre_nms_score_threshold: A `float` of the score threshold to apply before
        applying NMS. Proposals whose scores are below this threshold are
        thrown away.
      pre_nms_min_size_threshold: A `float` of the threshold of each side of
        the box (w.r.t. the scaled image). Proposals whose sides are below this
        threshold are thrown away.
      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
      num_proposals: An `int` of the final number of proposals to generate.
      test_pre_nms_top_k: An `int` of the number of top scores proposals to be
        kept before applying NMS in testing.
      test_pre_nms_score_threshold: A `float` of the score threshold to apply
        before applying NMS in testing. Proposals whose scores are below this
        threshold are thrown away.
      test_pre_nms_min_size_threshold: A `float` of the threshold of each side
        of the box (w.r.t. the scaled image) in testing. Proposals whose sides
        are below this threshold are thrown away.
      test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
        testing.
      test_num_proposals: An `int` of the final number of proposals to generate
        in testing.
      use_batched_nms: A `bool` of whether or not to use
        `tf.image.combined_non_max_suppression`.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'pre_nms_top_k': pre_nms_top_k,
...
...
@@ -257,23 +260,24 @@ class MultilevelROIGenerator(tf.keras.layers.Layer):
    3. Apply an overall top k to generate the final selected RoIs.

    Args:
      raw_boxes: A `dict` with keys representing FPN levels and values
        representing box tensors of shape
        [batch, feature_h, feature_w, num_anchors * 4].
      raw_scores: A `dict` with keys representing FPN levels and values
        representing logit tensors of shape
        [batch, feature_h, feature_w, num_anchors].
      anchor_boxes: A `dict` with keys representing FPN levels and values
        representing anchor box tensors of shape
        [batch, feature_h * feature_w * num_anchors, 4].
      image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
        are [height, width] of the scaled image.
      training: A `bool` that indicates whether it is in training mode.

    Returns:
      roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed
        ROIs in the scaled image coordinate.
      roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the
        proposed ROIs.
    """
    roi_boxes, roi_scores = _multilevel_propose_rois(
        raw_boxes,
...
...
official/vision/beta/modeling/layers/roi_sampler.py View file @ 5b952c08
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ROI sampler."""
"""
Contains definitions of
ROI sampler."""
# Import libraries
import
tensorflow
as
tf
...
...
@@ -23,7 +23,7 @@ from official.vision.beta.modeling.layers import box_sampler
@tf.keras.utils.register_keras_serializable(package='Vision')
class ROISampler(tf.keras.layers.Layer):
  """Samples ROIs and assigns targets to the sampled ROIs."""

  def __init__(self,
               mix_gt_boxes=True,
...
...
@@ -36,20 +36,20 @@ class ROISampler(tf.keras.layers.Layer):
"""Initializes a ROI sampler.
Args:
mix_gt_boxes: bool
,
whether to mix the groundtruth boxes with
proposed
ROIs.
num_sampled_rois:
int,
the number of sampled ROIs per image.
foreground_fraction: float in [0, 1], what percentage of proposed ROIs
mix_gt_boxes:
A `
bool
` of
whether to mix the groundtruth boxes with
proposed
ROIs.
num_sampled_rois:
An `int` of
the number of sampled ROIs per image.
foreground_fraction:
A `
float
`
in [0, 1], what percentage of proposed ROIs
should be sampled from the foreground boxes.
foreground_iou_threshold: float
,
represent the IoU threshold for
a box to
be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold: float
,
represent the IoU threshold
for a
box to be considered as negative (if overlap in
foreground_iou_threshold:
A `
float
` that
represent
s
the IoU threshold for
a box to
be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold:
A `
float
` that
represent
s
the IoU threshold
for a
box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`]).
background_iou_low_threshold: float
,
represent the IoU threshold
for a box
to be considered as negative (if overlap in
background_iou_low_threshold:
A `
float
` that
represent
s
the IoU threshold
for a box
to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`])
**kwargs:
other
key
word arguments passed to Layer.
**kwargs:
Additional
keyword arguments passed to Layer.
"""
self
.
_config_dict
=
{
'mix_gt_boxes'
:
mix_gt_boxes
,
...
...
@@ -85,29 +85,30 @@ class ROISampler(tf.keras.layers.Layer):
    returns box_targets, class_targets, and RoIs.

    Args:
      boxes: A `tf.Tensor` of shape of [batch_size, N, 4]. N is the number of
        proposals before groundtruth assignment. The last dimension is the
        box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
        format.
      gt_boxes: A `tf.Tensor` of shape of [batch_size, MAX_NUM_INSTANCES, 4].
        The coordinates of gt_boxes are in the pixel coordinates of the scaled
        image. This tensor might have padding of values -1 indicating the
        invalid box coordinates.
      gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES].
        This tensor might have paddings with values of -1 indicating the
        invalid classes.

    Returns:
      sampled_rois: A `tf.Tensor` of shape of [batch_size, K, 4], representing
        the coordinates of the sampled RoIs, where K is the number of the
        sampled RoIs, i.e. K = num_samples_per_image.
      sampled_gt_boxes: A `tf.Tensor` of shape of [batch_size, K, 4], storing
        the box coordinates of the matched groundtruth boxes of the sampled
        RoIs.
      sampled_gt_classes: A `tf.Tensor` of shape of [batch_size, K], storing
        the classes of the matched groundtruth boxes of the sampled RoIs.
      sampled_gt_indices: A `tf.Tensor` of shape of [batch_size, K], storing
        the indices of the sampled groundtruth boxes in the original `gt_boxes`
        tensor, i.e.,
        gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
    """
    if self._config_dict['mix_gt_boxes']:
...
...
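The foreground/background rule documented above hinges on IoU between proposals and groundtruth boxes; a hedged sketch of that matching step follows (subsampling a fixed number of RoIs at the foreground fraction, and mixing in groundtruth boxes, are omitted).

import tensorflow as tf

def iou_matrix(boxes_a: tf.Tensor, boxes_b: tf.Tensor) -> tf.Tensor:
  """boxes_*: [N, 4] and [M, 4] in [ymin, xmin, ymax, xmax]. Returns [N, M]."""
  a = tf.expand_dims(boxes_a, 1)   # [N, 1, 4]
  b = tf.expand_dims(boxes_b, 0)   # [1, M, 4]
  inter_min = tf.maximum(a[..., :2], b[..., :2])
  inter_max = tf.minimum(a[..., 2:], b[..., 2:])
  inter_hw = tf.maximum(inter_max - inter_min, 0.0)
  inter = inter_hw[..., 0] * inter_hw[..., 1]
  area_a = (a[..., 2] - a[..., 0]) * (a[..., 3] - a[..., 1])
  area_b = (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])
  return inter / (area_a + area_b - inter + 1e-8)

proposals = tf.constant([[0., 0., 10., 10.], [20., 20., 40., 40.]])
gt_boxes = tf.constant([[0., 0., 12., 12.]])
best_iou = tf.reduce_max(iou_matrix(proposals, gt_boxes), axis=1)
is_foreground = best_iou >= 0.5   # foreground_iou_threshold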