Internal change

PiperOrigin-RevId: 329754787

Internal change
PiperOrigin-RevId: 329754787
cc748b2a · Abdullah Rashwan · A. Unique TensorFlower · 2f788e1d · cc748b2a · cc748b2a
Commit cc748b2a authored Sep 02, 2020 by Abdullah Rashwan Committed by A. Unique TensorFlower Sep 02, 2020
10 changed files
--- a/official/vision/beta/ops/spatial_transform_ops.py
+++ b/official/vision/beta/ops/spatial_transform_ops.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Spatial transform ops."""
+
+import tensorflow as tf
+
+# Small constant added to the box-size denominators in
+# `crop_mask_in_target_box` so zero-size boxes do not divide by zero.
+_EPSILON = 1e-8
+
+
+def _feature_bilinear_interpolation(features, kernel_y, kernel_x):
+  """Bilinearly interpolates gathered neighbor features into RoIAlign crops.
+
+  Every output pixel is a weighted sum of its four neighboring feature points:
+
+    f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
+
+  where the weights are the outer product of kernel_y = [hy, ly] and
+  kernel_x = [hx, lx]. The weighted sum over each 2x2 neighborhood is computed
+  as an element-wise multiply followed by a 2x2 average pool; the weights are
+  pre-multiplied by 4 to cancel the averaging.
+
+  Args:
+    features: Gathered features of shape [batch_size, num_boxes,
+      output_size * 2, output_size * 2, num_filters].
+    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
+    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
+
+  Returns:
+    A 5-D tensor representing feature crop of shape
+    [batch_size, num_boxes, output_size, output_size, num_filters].
+  """
+  static_shape = features.get_shape().as_list()
+  batch_size, num_boxes, gathered_size, _, num_filters = static_shape
+  if batch_size is None:
+    # Fall back to the dynamic batch dimension.
+    batch_size = tf.shape(features)[0]
+  output_size = gathered_size // 2
+  grid_size = output_size * 2
+
+  # Lay the y weights out as a column and the x weights as a row so that
+  # their broadcast product is the full 2-D interpolation kernel. The
+  # multiplier `4` compensates for the 2x2 average pooling below.
+  weights_y = tf.reshape(kernel_y, [batch_size, num_boxes, grid_size, 1])
+  weights_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, grid_size])
+  weights = weights_y * weights_x * 4
+  weights = tf.cast(tf.expand_dims(weights, axis=-1), dtype=features.dtype)
+
+  # Weight the gathered features, then pool each 2x2 neighborhood.
+  weighted = tf.reshape(
+      features * weights,
+      [batch_size * num_boxes, grid_size, grid_size, num_filters])
+  pooled = tf.nn.avg_pool(weighted, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
+  return tf.reshape(
+      pooled, [batch_size, num_boxes, output_size, output_size, num_filters])
+
+
+def _compute_grid_positions(boxes, boundaries, output_size, sample_offset):
+  """Computes sampling grid neighbors and bilinear weights for each box.
+
+  Args:
+    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
+      information of each box w.r.t. the corresponding feature map.
+      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
+      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
+      in terms of the number of pixels of the corresponding feature map size.
+    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
+      the boundary (in (y, x)) of the corresponding feature map for each box.
+      Any resampled grid points that go beyond the boundary will be clipped.
+    output_size: a scalar indicating the output crop size.
+    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
+      from grid point.
+
+  Returns:
+    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
+    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
+    box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
+    box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
+  """
+  batch_size, num_boxes, _ = boxes.get_shape().as_list()
+  if batch_size is None:
+    batch_size = tf.shape(boxes)[0]
+
+  # Place `output_size` evenly spaced sample points along each box axis.
+  grid_x = tf.stack([
+      boxes[:, :, 1] + (step + sample_offset) * boxes[:, :, 3] / output_size
+      for step in range(output_size)
+  ], axis=2)
+  grid_y = tf.stack([
+      boxes[:, :, 0] + (step + sample_offset) * boxes[:, :, 2] / output_size
+      for step in range(output_size)
+  ], axis=2)
+
+  max_x = tf.expand_dims(boundaries[:, :, 1], -1)
+  max_y = tf.expand_dims(boundaries[:, :, 0], -1)
+
+  # Lower neighbor: the floor of the sample point, clipped to [0, boundary].
+  x0 = tf.floor(grid_x)
+  x0 = tf.minimum(tf.maximum(tf.cast(0., dtype=x0.dtype), x0), max_x)
+  y0 = tf.floor(grid_y)
+  y0 = tf.minimum(tf.maximum(tf.cast(0., dtype=y0.dtype), y0), max_y)
+
+  # Upper neighbor: one past the lower neighbor, clipped to the boundary.
+  x1 = tf.minimum(x0 + 1, max_x)
+  y1 = tf.minimum(y0 + 1, max_y)
+
+  box_gridx0x1 = tf.stack([x0, x1], axis=-1)
+  box_gridy0y1 = tf.stack([y0, y1], axis=-1)
+
+  # Bilinear weights: `low_*` is the distance from the lower neighbor and is
+  # the weight of the upper neighbor; (1 - low_*) weights the lower neighbor.
+  #   f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (ly*lx)f11
+  low_y = grid_y - y0
+  low_x = grid_x - x0
+  kernel_shape = [batch_size, num_boxes, output_size, 2, 1]
+  kernel_y = tf.reshape(tf.stack([1.0 - low_y, low_y], axis=3), kernel_shape)
+  kernel_x = tf.reshape(tf.stack([1.0 - low_x, low_x], axis=3), kernel_shape)
+  return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
+
+
+def multilevel_crop_and_resize(features,
+                               boxes,
+                               output_size=7,
+                               sample_offset=0.5):
+  """Crop and resize on multilevel feature pyramid.
+
+  Generate the (output_size, output_size) set of pixels for each input box
+  by first locating the box into the correct feature level, and then cropping
+  and resizing it using the corresponding feature map of that level.
+
+  A box is assigned to level floor(log2(sqrt(box_area) / 224)) + 4, clamped to
+  [min_level, max_level]. Feature map boundaries are derived from the
+  min-level map size divided by 2 per level above min_level.
+
+  Args:
+    features: A dictionary with key as pyramid level and value as features. The
+      features are in shape of [batch_size, height_l, width_l, num_filters].
+    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
+      a box with [y1, x1, y2, x2] in un-normalized coordinates.
+    output_size: A scalar to indicate the output crop size.
+    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
+      from grid point.
+
+  Returns:
+    A 5-D tensor representing feature crop of shape
+    [batch_size, num_boxes, output_size, output_size, num_filters].
+  """
+
+  with tf.name_scope('multilevel_crop_and_resize'):
+    min_level = min(features.keys())
+    max_level = max(features.keys())
+    batch_size, max_feature_height, max_feature_width, num_filters = (
+        features[min_level].get_shape().as_list())
+    if batch_size is None:
+      batch_size = tf.shape(features[min_level])[0]
+    _, num_boxes, _ = boxes.get_shape().as_list()
+    grid_size = output_size * 2
+
+    # Flatten every level to [batch_size, height_l * width_l, num_filters],
+    # concatenate the levels, and collapse to a 2-D lookup table of
+    # [batch_size * total_pixels, num_filters].
+    pyramid = [features[level] for level in range(min_level, max_level + 1)]
+    level_shapes = [level_map.get_shape().as_list() for level_map in pyramid]
+    feature_heights = [shape[1] for shape in level_shapes]
+    feature_widths = [shape[2] for shape in level_shapes]
+    flattened = [
+        tf.reshape(level_map, [batch_size, -1, num_filters])
+        for level_map in pyramid
+    ]
+    features_r2 = tf.reshape(tf.concat(flattened, 1), [-1, num_filters])
+
+    # Number of pixels (height_l * width_l) in each level.
+    level_dim_sizes = [
+        width * height
+        for height, width in zip(feature_heights, feature_widths)
+    ]
+    # Start offset of each level inside one batch element: the running sum of
+    # the sizes of all preceding levels.
+    start_offsets = [0]
+    for size in level_dim_sizes[:-1]:
+      start_offsets.append(start_offsets[-1] + size)
+    # Total number of pixels per batch element across all levels.
+    batch_dim_size = start_offsets[-1] + level_dim_sizes[-1]
+    level_dim_offsets = tf.constant(start_offsets, tf.int32)
+    # Moving one row down in a level advances the flat index by that level's
+    # width.
+    row_strides = tf.constant(feature_widths, tf.int32)
+
+    # Assigns boxes to the right level.
+    box_width = boxes[:, :, 3] - boxes[:, :, 1]
+    box_height = boxes[:, :, 2] - boxes[:, :, 0]
+    areas_sqrt = tf.cast(tf.sqrt(box_height * box_width), tf.float32)
+    box_levels = tf.cast(
+        tf.math.floordiv(
+            tf.math.log(tf.divide(areas_sqrt, 224.0)),
+            tf.math.log(2.0)) + 4.0,
+        dtype=tf.int32)
+    # Maps levels between [min_level, max_level].
+    box_levels = tf.minimum(max_level, tf.maximum(box_levels, min_level))
+
+    # Projects box location and sizes to corresponding feature levels.
+    scale_to_level = tf.cast(
+        tf.pow(tf.constant(2.0), tf.cast(box_levels, tf.float32)),
+        dtype=boxes.dtype)
+    scaled_corners = boxes / tf.expand_dims(scale_to_level, axis=2)
+    scaled_width = box_width / scale_to_level
+    scaled_height = box_height / scale_to_level
+    # Re-encode each box as (y, x, h, w) in feature-map pixels.
+    boxes = tf.concat([
+        scaled_corners[:, :, 0:2],
+        tf.expand_dims(scaled_height, -1),
+        tf.expand_dims(scaled_width, -1),
+    ], axis=-1)
+
+    # Maps levels to [0, max_level-min_level].
+    box_levels -= min_level
+    level_strides = tf.pow([[2.0]], tf.cast(box_levels, tf.float32))
+    # Largest valid (y, x) index on the level each box was assigned to.
+    max_y = [[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1
+    max_x = [[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1
+    boundary = tf.cast(
+        tf.concat(
+            [tf.expand_dims(max_y, axis=-1),
+             tf.expand_dims(max_x, axis=-1)],
+            axis=-1), boxes.dtype)
+
+    # Compute grid positions.
+    kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = _compute_grid_positions(
+        boxes, boundary, output_size, sample_offset)
+
+    x_indices = tf.cast(
+        tf.reshape(box_gridx0x1, [batch_size, num_boxes, grid_size]),
+        dtype=tf.int32)
+    y_indices = tf.cast(
+        tf.reshape(box_gridy0y1, [batch_size, num_boxes, grid_size]),
+        dtype=tf.int32)
+
+    # Flat index into `features_r2` =
+    #   batch offset + level offset + y * level_width + x.
+    per_batch_start = tf.reshape(
+        tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1])
+    batch_size_offset = tf.tile(
+        per_batch_start, [1, num_boxes, grid_size, grid_size])
+
+    # Get level offset for each box. Each box belongs to one level.
+    per_box_level_start = tf.reshape(
+        tf.gather(level_dim_offsets, box_levels),
+        [batch_size, num_boxes, 1, 1])
+    levels_offset = tf.tile(per_box_level_start, [1, 1, grid_size, grid_size])
+
+    per_box_row_stride = tf.expand_dims(tf.gather(row_strides, box_levels), -1)
+    row_starts = tf.reshape(
+        y_indices * per_box_row_stride,
+        [batch_size, num_boxes, grid_size, 1])
+    y_indices_offset = tf.tile(row_starts, [1, 1, 1, grid_size])
+
+    columns = tf.reshape(x_indices, [batch_size, num_boxes, 1, grid_size])
+    x_indices_offset = tf.tile(columns, [1, 1, grid_size, 1])
+
+    indices = tf.reshape(
+        batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
+        [-1])
+
+    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
+    # performance.
+    gathered = tf.reshape(
+        tf.gather(features_r2, indices),
+        [batch_size, num_boxes, grid_size, grid_size, num_filters])
+
+    # Bilinear interpolation.
+    return _feature_bilinear_interpolation(gathered, kernel_y, kernel_x)
+
+
+def _selective_crop_and_resize(features,
+                               boxes,
+                               box_levels,
+                               boundaries,
+                               output_size=7,
+                               sample_offset=0.5,
+                               use_einsum_gather=False):
+  """Crop and resize boxes on a set of feature maps.
+
+  Given multiple features maps indexed by different levels, and a set of boxes
+  where each box is mapped to a certain level, it selectively crops and resizes
+  boxes from the corresponding feature maps to generate the box features.
+
+  We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
+  figure 3 for reference). Specifically, for each feature map, we select an
+  (output_size, output_size) set of pixels corresponding to the box location,
+  and then use bilinear interpolation to select the feature value for each
+  pixel.
+
+  For performance, we perform the gather and interpolation on all layers as a
+  single operation. In this op the multi-level features are first stacked and
+  gathered into [2*output_size, 2*output_size] feature points. Then bilinear
+  interpolation is performed on the gathered feature points to generate
+  [output_size, output_size] RoIAlign feature map.
+
+  Here is the step-by-step algorithm (gather path):
+    1. The multi-level features are gathered into a
+       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
+       Tensor. The Tensor contains four neighboring feature points for each
+       vertex in the output grid.
+    2. Compute the interpolation kernel of shape
+       [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
+       can be seen as stacking 2x2 interpolation kernels for all vertices in the
+       output grid.
+    3. Element-wise multiply the gathered features and interpolation kernel.
+       Then apply 2x2 average pooling to reduce spatial dimension to
+       output_size.
+
+  Note that the einsum path does not read `box_levels`: its einsum equations
+  share one index between axis 1 of `features` and the box axis, so box i is
+  always cropped from features[:, i].
+
+  Args:
+    features: a 5-D tensor of shape [batch_size, num_levels, max_height,
+      max_width, num_filters] where cropping and resizing are based.
+    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
+      information of each box w.r.t. the corresponding feature map.
+      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
+      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
+      in terms of the number of pixels of the corresponding feature map size.
+    box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
+      the 0-based corresponding feature level index of each box.
+    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
+      the boundary (in (y, x)) of the corresponding feature map for each box.
+      Any resampled grid points that go beyond the boundary will be clipped.
+    output_size: a scalar indicating the output crop size.
+    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
+      from grid point.
+    use_einsum_gather: use einsum to replace gather or not. Replacing gather
+      with einsum can improve performance when feature size is not large, einsum
+      is friendly with model partition as well. Gather's performance is better
+      when feature size is very large and there are multiple box levels.
+
+  Returns:
+    features_per_box: a 5-D tensor of shape
+      [batch_size, num_boxes, output_size, output_size, num_filters]
+      representing the cropped features.
+  """
+  (batch_size, num_levels, max_feature_height, max_feature_width,
+   num_filters) = features.get_shape().as_list()
+  if batch_size is None:
+    batch_size = tf.shape(features)[0]
+  _, num_boxes, _ = boxes.get_shape().as_list()
+
+  kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = _compute_grid_positions(
+      boxes, boundaries, output_size, sample_offset)
+
+  if use_einsum_gather:
+    # Bilinear interpolation is folded into two einsum contractions:
+    #        f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
+    #                              [f10, f11]]
+    # Each neighbor index is one-hot encoded along the spatial axis and
+    # scaled by its interpolation weight, so contracting against the
+    # features both selects and blends the neighbors.
+    neighbor_shape = [batch_size, num_boxes, output_size, 2]
+    y_neighbors = tf.cast(
+        tf.reshape(box_gridy0y1, neighbor_shape), dtype=tf.int32)
+    x_neighbors = tf.cast(
+        tf.reshape(box_gridx0x1, neighbor_shape), dtype=tf.int32)
+
+    # shape is [batch_size, num_boxes, output_size, 2, height]
+    grid_y_one_hot = tf.one_hot(
+        y_neighbors, max_feature_height, dtype=kernel_y.dtype)
+    # shape is [batch_size, num_boxes, output_size, 2, width]
+    grid_x_one_hot = tf.one_hot(
+        x_neighbors, max_feature_width, dtype=kernel_x.dtype)
+
+    # shape is [batch_size, num_boxes, output_size, height]
+    grid_y_weight = tf.reduce_sum(
+        tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
+    # shape is [batch_size, num_boxes, output_size, width]
+    grid_x_weight = tf.reduce_sum(
+        tf.multiply(grid_x_one_hot, kernel_x), axis=-2)
+
+    # Contract along y.
+    # shape is [batch_size, num_boxes, output_size, width, features]
+    rows_blended = tf.einsum('bmhwf,bmoh->bmowf', features,
+                             tf.cast(grid_y_weight, features.dtype))
+    # Contract along x.
+    # shape is [batch_size, num_boxes, output_size, output_size, features]
+    return tf.einsum('bmhwf,bmow->bmhof', rows_blended,
+                     tf.cast(grid_x_weight, features.dtype))
+
+  # Gather path: compute a flat index into the features viewed as
+  # [batch_size * num_levels * height * width, num_filters].
+  grid_size = output_size * 2
+  x_indices = tf.cast(
+      tf.reshape(box_gridx0x1, [batch_size, num_boxes, grid_size]),
+      dtype=tf.int32)
+  y_indices = tf.cast(
+      tf.reshape(box_gridy0y1, [batch_size, num_boxes, grid_size]),
+      dtype=tf.int32)
+
+  # Strides of the flattened layout, innermost to outermost.
+  height_dim_offset = max_feature_width
+  level_dim_offset = max_feature_height * height_dim_offset
+  batch_dim_offset = num_levels * level_dim_offset
+
+  per_batch_start = tf.reshape(
+      tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1])
+  batch_size_offset = tf.tile(
+      per_batch_start, [1, num_boxes, grid_size, grid_size])
+
+  per_box_level_start = tf.reshape(
+      box_levels * level_dim_offset, [batch_size, num_boxes, 1, 1])
+  box_levels_offset = tf.tile(
+      per_box_level_start, [1, 1, grid_size, grid_size])
+
+  row_starts = tf.reshape(
+      y_indices * height_dim_offset, [batch_size, num_boxes, grid_size, 1])
+  y_indices_offset = tf.tile(row_starts, [1, 1, 1, grid_size])
+
+  columns = tf.reshape(x_indices, [batch_size, num_boxes, 1, grid_size])
+  x_indices_offset = tf.tile(columns, [1, 1, grid_size, 1])
+
+  indices = tf.reshape(
+      batch_size_offset + box_levels_offset + y_indices_offset +
+      x_indices_offset, [-1])
+
+  flat_features = tf.reshape(features, [-1, num_filters])
+  # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
+  # performance.
+  gathered = tf.reshape(
+      tf.gather(flat_features, indices),
+      [batch_size, num_boxes, grid_size, grid_size, num_filters])
+  return _feature_bilinear_interpolation(gathered, kernel_y, kernel_x)
+
+
+def crop_mask_in_target_box(masks,
+                            boxes,
+                            target_boxes,
+                            output_size,
+                            sample_offset=0,
+                            use_einsum=True):
+  """Crop masks in target boxes.
+
+  Each mask is defined over the box in `boxes` that tightly encloses it. The
+  mask is zero-padded by 2 pixels on every side and then resampled over the
+  region covered by the matching box in `target_boxes`.
+
+  Args:
+    masks: A tensor with a shape of [batch_size, num_masks, height, width].
+    boxes: a float tensor representing box coordinates that tightly enclose
+      masks with a shape of [batch_size, num_masks, 4] in un-normalized
+      coordinates. A box is represented by [ymin, xmin, ymax, xmax].
+    target_boxes: a float tensor representing target box coordinates for masks
+      with a shape of [batch_size, num_masks, 4] in un-normalized coordinates. A
+      box is represented by [ymin, xmin, ymax, xmax].
+    output_size: A scalar to indicate the output crop size. It currently only
+      supports to output a square shape outputs.
+    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
+      from grid point.
+    use_einsum: Use einsum to replace gather in selective_crop_and_resize.
+
+  Returns:
+    A 4-D tensor representing the cropped masks of shape
+    [batch_size, num_masks, output_size, output_size].
+  """
+  with tf.name_scope('crop_mask_in_target_box'):
+    batch_size, num_masks, height, width = masks.get_shape().as_list()
+    if batch_size is None:
+      batch_size = tf.shape(masks)[0]
+    masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
+    # Pad zeros on the boundary of masks.
+    masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
+    masks = tf.reshape(masks, [batch_size, num_masks, height+4, width+4, 1])
+
+    # Projects target box locations and sizes to corresponding cropped
+    # mask coordinates.
+    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
+        value=boxes, num_or_size_splits=4, axis=2)
+    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
+        value=target_boxes, num_or_size_splits=4, axis=2)
+    # Vertical quantities scale by the mask height and horizontal quantities
+    # by the mask width. The `+ 2` shifts the origin past the zero padding.
+    y_transform = (bb_y_min - gt_y_min) * height / (
+        gt_y_max - gt_y_min + _EPSILON) + 2
+    x_transform = (bb_x_min - gt_x_min) * width / (
+        gt_x_max - gt_x_min + _EPSILON) + 2
+    h_transform = (bb_y_max - bb_y_min) * height / (
+        gt_y_max - gt_y_min + _EPSILON)
+    w_transform = (bb_x_max - bb_x_min) * width / (
+        gt_x_max - gt_x_min + _EPSILON)
+
+    # Largest valid (y, x) index in the padded mask.
+    boundaries = tf.concat(
+        [tf.ones_like(y_transform) * ((height + 4) - 1),
+         tf.ones_like(x_transform) * ((width + 4) - 1)],
+        axis=-1)
+    boundaries = tf.cast(boundaries, dtype=y_transform.dtype)
+
+    # Reshape tensors to have the right shape for selective_crop_and_resize.
+    transformed_boxes = tf.concat(
+        [y_transform, x_transform, h_transform, w_transform], -1)
+    # Each mask acts as its own "level": mask i is cropped from slice i.
+    levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]),
+                     [batch_size, 1])
+
+    cropped_masks = _selective_crop_and_resize(
+        masks,
+        transformed_boxes,
+        levels,
+        boundaries,
+        output_size,
+        sample_offset=sample_offset,
+        use_einsum_gather=use_einsum)
+    cropped_masks = tf.squeeze(cropped_masks, axis=-1)
+
+  return cropped_masks
+
+
+def nearest_upsampling(data, scale):
+  """Upsamples a feature map by repeating each pixel `scale` times per axis.
+
+  Args:
+    data: A tensor with a shape of [batch, height_in, width_in, channels].
+    scale: An integer multiple to scale resolution of input data.
+
+  Returns:
+    data_up: A tensor with a shape of
+      [batch, height_in*scale, width_in*scale, channels]. Same dtype as input
+      data.
+  """
+  with tf.name_scope('nearest_upsampling'):
+    batch, _, _, channels = data.get_shape().as_list()
+    if batch is None:
+      # Let reshape infer an unknown static batch dimension.
+      batch = -1
+    dynamic_shape = tf.shape(input=data)
+    height = dynamic_shape[1]
+    width = dynamic_shape[2]
+    # Insert singleton axes after height and width, then broadcast against
+    # ones along those axes so every pixel is repeated `scale` times in each
+    # direction; the final reshape merges the repeats into the spatial axes.
+    repeat = tf.ones([1, 1, scale, 1, scale, 1], dtype=data.dtype)
+    expanded = tf.reshape(data, [batch, height, 1, width, 1, channels])
+    upsampled = expanded * repeat
+    return tf.reshape(
+        upsampled, [batch, height * scale, width * scale, channels])
--- a/official/vision/beta/ops/spatial_transform_ops_test.py
+++ b/official/vision/beta/ops/spatial_transform_ops_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for spatial_transform_ops.py."""
+
+# Import libraries
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.beta.ops import spatial_transform_ops
+
+
+class MultiLevelCropAndResizeTest(tf.test.TestCase):
+  """Tests for `spatial_transform_ops.multilevel_crop_and_resize`."""
+
+  def _ramp_features(self, input_size, min_level, max_level, batch_size,
+                     num_filters):
+    """Builds a pyramid whose every level holds the values 0, 1, 2, ..."""
+    pyramid = {}
+    for level in range(min_level, max_level + 1):
+      feat_size = int(input_size / 2**level)
+      ramp = tf.range(
+          batch_size * feat_size * feat_size * num_filters, dtype=tf.float32)
+      pyramid[level] = tf.reshape(
+          ramp, [batch_size, feat_size, feat_size, num_filters])
+    return pyramid
+
+  def test_multilevel_crop_and_resize_square(self):
+    """Crops a square box from a single 4x4 level.
+
+    Input =
+    [
+      [0, 1, 2, 3],
+      [4, 5, 6, 7],
+      [8, 9, 10, 11],
+      [12, 13, 14, 15],
+    ]
+    output_size = 2x2
+    box =
+    [
+      [[0, 0, 2, 2]]
+    ]
+    Gathered data =
+    [
+      [0, 1, 1, 2],
+      [4, 5, 5, 6],
+      [4, 5, 5, 6],
+      [8, 9, 9, 10],
+    ]
+    Interpolation kernel =
+    [
+      [1, 1, 1, 1],
+      [1, 1, 1, 1],
+      [1, 1, 1, 1],
+      [1, 1, 1, 1],
+    ]
+    Output =
+    [
+      [2.5, 3.5],
+      [6.5, 7.5]
+    ]
+    """
+    batch_size = 1
+    output_size = 2
+    features = self._ramp_features(
+        input_size=4, min_level=0, max_level=0, batch_size=batch_size,
+        num_filters=1)
+    boxes = tf.constant([[[0, 0, 2, 2]]], dtype=tf.float32)
+
+    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        features, boxes, output_size).numpy()
+
+    expected = np.array([[2.5, 3.5], [6.5, 7.5]]).reshape(
+        [batch_size, 1, output_size, output_size, 1])
+    self.assertAllClose(roi_features, expected)
+
+  def test_multilevel_crop_and_resize_rectangle(self):
+    """Crops a non-square box from a single 4x4 level.
+
+    Input =
+    [
+      [0, 1, 2, 3],
+      [4, 5, 6, 7],
+      [8, 9, 10, 11],
+      [12, 13, 14, 15],
+    ]
+    output_size = 2x2
+    box =
+    [
+      [[0, 0, 2, 3]]
+    ]
+    Box vertices =
+    [
+      [[0.5, 0.75], [0.5, 2.25]],
+      [[1.5, 0.75], [1.5, 2.25]],
+    ]
+    Gathered data =
+    [
+      [0, 1, 2, 3],
+      [4, 5, 6, 7],
+      [4, 5, 6, 7],
+      [8, 9, 10, 11],
+    ]
+    Interpolation kernel =
+    [
+      [0.5 1.5 1.5 0.5],
+      [0.5 1.5 1.5 0.5],
+      [0.5 1.5 1.5 0.5],
+      [0.5 1.5 1.5 0.5],
+    ]
+    Output =
+    [
+      [2.75, 4.25],
+      [6.75, 8.25]
+    ]
+    """
+    batch_size = 1
+    output_size = 2
+    features = self._ramp_features(
+        input_size=4, min_level=0, max_level=0, batch_size=batch_size,
+        num_filters=1)
+    boxes = tf.constant([[[0, 0, 2, 3]]], dtype=tf.float32)
+
+    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        features, boxes, output_size).numpy()
+
+    expected = np.array([[2.75, 4.25], [6.75, 8.25]]).reshape(
+        [batch_size, 1, output_size, output_size, 1])
+    self.assertAllClose(roi_features, expected)
+
+  def test_multilevel_crop_and_resize_two_boxes(self):
+    """Crops the square and the rectangular box in a single call."""
+    batch_size = 1
+    output_size = 2
+    features = self._ramp_features(
+        input_size=4, min_level=0, max_level=0, batch_size=batch_size,
+        num_filters=1)
+    boxes = tf.constant([[[0, 0, 2, 2], [0, 0, 2, 3]]], dtype=tf.float32)
+
+    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        features, boxes, output_size).numpy()
+
+    expected = np.array([
+        [[2.5, 3.5], [6.5, 7.5]],
+        [[2.75, 4.25], [6.75, 8.25]],
+    ]).reshape([batch_size, 2, output_size, output_size, 1])
+    self.assertAllClose(roi_features, expected)
+
+  def test_multilevel_crop_and_resize_feature_level_assignment(self):
+    """Checks boxes of increasing size are read from the expected level."""
+    input_size = 640
+    min_level = 2
+    max_level = 5
+    batch_size = 1
+    output_size = 2
+    num_filters = 1
+    # Fill every level with its own level number so the crop values reveal
+    # which level each box was assigned to.
+    features = {}
+    for level in range(min_level, max_level + 1):
+      feat_size = int(input_size / 2**level)
+      features[level] = float(level) * tf.ones(
+          [batch_size, feat_size, feat_size, num_filters], dtype=tf.float32)
+    boxes = tf.constant(
+        [[
+            [0, 0, 111, 111],  # Level 2.
+            [0, 0, 113, 113],  # Level 3.
+            [0, 0, 223, 223],  # Level 3.
+            [0, 0, 225, 225],  # Level 4.
+            [0, 0, 449, 449],  # Level 5.
+        ]],
+        dtype=tf.float32)
+
+    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        features, boxes, output_size).numpy()
+
+    self.assertAllClose(roi_features[0, 0], 2 * np.ones((2, 2, 1)))
+    self.assertAllClose(roi_features[0, 1], 3 * np.ones((2, 2, 1)))
+    self.assertAllClose(roi_features[0, 2], 3 * np.ones((2, 2, 1)))
+    self.assertAllClose(roi_features[0, 3], 4 * np.ones((2, 2, 1)))
+    self.assertAllClose(roi_features[0, 4], 5 * np.ones((2, 2, 1)))
+
+  def test_multilevel_crop_and_resize_large_input(self):
+    """Test 512 boxes on TPU."""
+    input_size = 1408
+    min_level = 2
+    max_level = 6
+    batch_size = 2
+    num_boxes = 512
+    num_filters = 256
+    output_size = 7
+    strategy = tf.distribute.experimental.TPUStrategy()
+    with strategy.scope():
+      features = {}
+      for level in range(min_level, max_level + 1):
+        feat_size = int(input_size / 2**level)
+        ramp = np.arange(
+            batch_size * feat_size * feat_size * num_filters,
+            dtype=np.float32)
+        features[level] = tf.constant(
+            np.reshape(ramp, [batch_size, feat_size, feat_size, num_filters]),
+            dtype=tf.bfloat16)
+      # The same box repeated for every slot of every batch element.
+      single_box = np.array([[[0, 0, 256, 256]]], dtype=np.float32)
+      boxes = np.tile(single_box, [batch_size, num_boxes, 1])
+      tf_boxes = tf.constant(boxes, dtype=tf.float32)
+
+      # `output_size` is left at the function default, which is 7.
+      roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+          features, tf_boxes).numpy()
+      self.assertEqual(
+          roi_features.shape,
+          (batch_size, num_boxes, output_size, output_size, num_filters))
+
+
+class CropMaskInTargetBoxTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for `spatial_transform_ops.crop_mask_in_target_box`."""
+
+  @parameterized.parameters(False, True)
+  def test_crop_mask_in_target_box(self, use_einsum):
+    """Crops two all-ones 2x2 masks via both the gather and einsum paths."""
+    batch_size = 1
+    num_masks = 2
+    height = 2
+    width = 2
+    output_size = 2
+    strategy = tf.distribute.experimental.TPUStrategy()
+    with strategy.scope():
+      masks = tf.ones([batch_size, num_masks, height, width])
+      # Both masks occupy the unit box.
+      boxes = tf.reshape(
+          tf.constant([
+              [0., 0., 1., 1.],
+              [0., 0., 1., 1.],
+          ]), [batch_size, num_masks, 4])
+      # The first target equals its mask box; the second extends one unit
+      # above and to the left of it.
+      target_boxes = tf.reshape(
+          tf.constant([
+              [0., 0., 1., 1.],
+              [-1., -1., 1., 1.],
+          ]), [batch_size, num_masks, 4])
+      expected_outputs = np.array([[
+          [[1., 1.],
+           [1., 1.]],
+          [[0., 0.],
+           [0., 1.]],
+      ]])
+
+      cropped_masks = spatial_transform_ops.crop_mask_in_target_box(
+          masks, boxes, target_boxes, output_size,
+          use_einsum=use_einsum).numpy()
+      self.assertAllEqual(cropped_masks, expected_outputs)
+
+
+# Run the test suite when this file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/tasks/__init__.py
+++ b/official/vision/beta/tasks/__init__.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tasks package definition."""
+
+from official.vision.beta.tasks import image_classification
+from official.vision.beta.tasks import maskrcnn
+from official.vision.beta.tasks import retinanet
+from official.vision.beta.tasks import video_classification
--- a/official/vision/beta/tasks/image_classification.py
+++ b/official/vision/beta/tasks/image_classification.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Image classification task definition."""
+import tensorflow as tf
+from official.core import base_task
+from official.core import input_reader
+from official.core import task_factory
+from official.modeling import tf_utils
+from official.vision.beta.configs import image_classification as exp_cfg
+from official.vision.beta.dataloaders import classification_input
+from official.vision.beta.modeling import factory
+
+
+@task_factory.register_task_cls(exp_cfg.ImageClassificationTask)
+class ImageClassificationTask(base_task.Task):
+  """A task for image classification."""
+
+  def build_model(self):
+    """Builds classification model."""
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None] + self.task_config.model.input_size)
+
+    l2_weight_decay = self.task_config.losses.l2_weight_decay
+    # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss.
+    # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2)
+    # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss)
+    l2_regularizer = (tf.keras.regularizers.l2(
+        l2_weight_decay / 2.0) if l2_weight_decay else None)
+
+    model = factory.build_classification_model(
+        input_specs=input_specs,
+        model_config=self.task_config.model,
+        l2_regularizer=l2_regularizer)
+    return model
+
+  def build_inputs(self, params, input_context=None):
+    """Builds classification input."""
+
+    num_classes = self.task_config.model.num_classes
+    input_size = self.task_config.model.input_size
+
+    decoder = classification_input.Decoder()
+    parser = classification_input.Parser(
+        output_size=input_size[:2],
+        num_classes=num_classes,
+        dtype=params.dtype)
+
+    reader = input_reader.InputReader(
+        params,
+        dataset_fn=tf.data.TFRecordDataset,
+        decoder_fn=decoder.decode,
+        parser_fn=parser.parse_fn(params.is_training))
+
+    dataset = reader.read(input_context=input_context)
+
+    return dataset
+
+  def build_losses(self, labels, model_outputs, aux_losses=None):
+    """Sparse categorical cross entropy loss.
+
+    Args:
+      labels: labels.
+      model_outputs: Output logits of the classifier.
+      aux_losses: auxiliarly loss tensors, i.e. `losses` in keras.Model.
+
+    Returns:
+      The total loss tensor.
+    """
+    losses_config = self.task_config.losses
+    if losses_config.one_hot:
+      total_loss = tf.keras.losses.categorical_crossentropy(
+          labels,
+          model_outputs,
+          from_logits=True,
+          label_smoothing=losses_config.label_smoothing)
+    else:
+      total_loss = tf.keras.losses.sparse_categorical_crossentropy(
+          labels, model_outputs, from_logits=True)
+
+    total_loss = tf_utils.safe_mean(total_loss)
+    if aux_losses:
+      total_loss += tf.add_n(aux_losses)
+
+    return total_loss
+
+  def build_metrics(self, training=True):
+    """Gets streaming metrics for training/validation."""
+    if self.task_config.losses.one_hot:
+      metrics = [
+          tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
+          tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top_5_accuracy')]
+    else:
+      metrics = [
+          tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
+          tf.keras.metrics.SparseTopKCategoricalAccuracy(
+              k=5, name='top_5_accuracy')]
+    return metrics
+
+  def train_step(self, inputs, model, optimizer, metrics=None):
+    """Does forward and backward.
+
+    Args:
+      inputs: a `(features, labels)` pair.
+      model: the model, forward pass definition.
+      optimizer: the optimizer for this training step.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs holding the unscaled loss and, when metrics are
+      given (or the model has compiled metrics), each metric's current result.
+    """
+    features, labels = inputs
+    if self.task_config.losses.one_hot:
+      labels = tf.one_hot(labels, self.task_config.model.num_classes)
+
+    num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
+    with tf.GradientTape() as tape:
+      outputs = model(features, training=True)
+      # Casting output layer as float32 is necessary when mixed_precision is
+      # mixed_float16 or mixed_bfloat16 to ensure output is casted as float32.
+      outputs = tf.nest.map_structure(
+          lambda x: tf.cast(x, tf.float32), outputs)
+
+      # Computes per-replica loss.
+      loss = self.build_losses(
+          model_outputs=outputs, labels=labels, aux_losses=model.losses)
+      # Scales loss as the default gradients allreduce performs sum inside the
+      # optimizer.
+      scaled_loss = loss / num_replicas
+
+      # For mixed_precision policy, when LossScaleOptimizer is used, loss is
+      # scaled for numerical stability.
+      if isinstance(
+          optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+        scaled_loss = optimizer.get_scaled_loss(scaled_loss)
+
+    tvars = model.trainable_variables
+    grads = tape.gradient(scaled_loss, tvars)
+    # Scales back gradient before apply_gradients when LossScaleOptimizer is
+    # used.
+    if isinstance(
+        optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+      grads = optimizer.get_unscaled_gradients(grads)
+
+    # Apply gradient clipping.
+    if self.task_config.gradient_clip_norm > 0:
+      grads, _ = tf.clip_by_global_norm(
+          grads, self.task_config.gradient_clip_norm)
+    optimizer.apply_gradients(list(zip(grads, tvars)))
+
+    # The logged loss is the per-replica loss before any scaling.
+    logs = {self.loss: loss}
+    if metrics:
+      self.process_metrics(metrics, labels, outputs)
+      logs.update({m.name: m.result() for m in metrics})
+    elif model.compiled_metrics:
+      # Fall back to the model's compiled metrics when none are passed in.
+      self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
+      logs.update({m.name: m.result() for m in model.metrics})
+    return logs
+
+  def validation_step(self, inputs, model, metrics=None):
+    """Validatation step.
+
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs.
+    """
+    features, labels = inputs
+    if self.task_config.losses.one_hot:
+      labels = tf.one_hot(labels, self.task_config.model.num_classes)
+
+    outputs = self.inference_step(features, model)
+    outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
+    loss = self.build_losses(model_outputs=outputs, labels=labels,
+                             aux_losses=model.losses)
+
+    logs = {self.loss: loss}
+    if metrics:
+      self.process_metrics(metrics, labels, outputs)
+      logs.update({m.name: m.result() for m in metrics})
+    elif model.compiled_metrics:
+      self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
+      logs.update({m.name: m.result() for m in model.metrics})
+    return logs
+
+  def inference_step(self, inputs, model):
+    """Performs the forward step."""
+    return model(inputs, training=False)
--- a/official/vision/beta/tasks/image_classification_test.py
+++ b/official/vision/beta/tasks/image_classification_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for image classification task."""
+
+# pylint: disable=unused-import
+from absl.testing import parameterized
+import orbit
+import tensorflow as tf
+
+from official.core import exp_factory
+from official.modeling import optimization
+from official.vision import beta
+from official.vision.beta.tasks import image_classification as img_cls_task
+
+
+class ImageClassificationTaskTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters(('resnet_imagenet'),
+                            ('revnet_imagenet'))
+  def test_task(self, config_name):
+    config = exp_factory.get_exp_config(config_name)
+    config.task.train_data.global_batch_size = 2
+
+    task = img_cls_task.ImageClassificationTask(config.task)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    strategy = tf.distribute.get_strategy()
+
+    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
+                                                   config.task.train_data)
+
+    iterator = iter(dataset)
+    opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
+    optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
+    logs = task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    self.assertIn('loss', logs)
+    self.assertIn('accuracy', logs)
+    self.assertIn('top_5_accuracy', logs)
+    logs = task.validation_step(next(iterator), model, metrics=metrics)
+    self.assertIn('loss', logs)
+    self.assertIn('accuracy', logs)
+    self.assertIn('top_5_accuracy', logs)
+
+
+# Run the test suite when this file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/tasks/maskrcnn.py
+++ b/official/vision/beta/tasks/maskrcnn.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Mask R-CNN task definition."""
+
+from absl import logging
+import tensorflow as tf
+from official.core import base_task
+from official.core import input_reader
+from official.core import task_factory
+from official.vision.beta.configs import maskrcnn as exp_cfg
+from official.vision.beta.dataloaders import maskrcnn_input
+from official.vision.beta.dataloaders import tf_example_decoder
+from official.vision.beta.dataloaders import tf_example_label_map_decoder
+from official.vision.beta.evaluation import coco_evaluator
+from official.vision.beta.losses import maskrcnn_losses
+from official.vision.beta.modeling import factory
+
+
+@task_factory.register_task_cls(exp_cfg.MaskRCNNTask)
+class MaskRCNNTask(base_task.Task):
+  """A single-replica view of training procedure.
+
+  Mask R-CNN task provides artifacts for training/evaluation procedures,
+  including loading/iterating over Datasets, initializing the model, calculating
+  the loss, post-processing, and customized metrics with reduction.
+  """
+
+  def build_model(self):
+    """Build Mask R-CNN model."""
+
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None] + self.task_config.model.input_size)
+
+    l2_weight_decay = self.task_config.losses.l2_weight_decay
+    # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss.
+    # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2)
+    # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss)
+    l2_regularizer = (tf.keras.regularizers.l2(
+        l2_weight_decay / 2.0) if l2_weight_decay else None)
+
+    model = factory.build_maskrcnn(
+        input_specs=input_specs,
+        model_config=self.task_config.model,
+        l2_regularizer=l2_regularizer)
+    return model
+
+  def initialize(self, model: tf.keras.Model):
+    """Loading pretrained checkpoint."""
+    if not self.task_config.init_checkpoint:
+      return
+
+    ckpt_dir_or_file = self.task_config.init_checkpoint
+    if tf.io.gfile.isdir(ckpt_dir_or_file):
+      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
+
+    # Restoring checkpoint.
+    if self.task_config.init_checkpoint_modules == 'all':
+      ckpt = tf.train.Checkpoint(**model.checkpoint_items)
+      status = ckpt.read(ckpt_dir_or_file)
+      status.assert_consumed()
+    elif self.task_config.init_checkpoint_modules == 'backbone':
+      ckpt = tf.train.Checkpoint(backbone=model.backbone)
+      status = ckpt.read(ckpt_dir_or_file)
+      status.expect_partial().assert_existing_objects_matched()
+    else:
+      assert "Only 'all' or 'backbone' can be used to initialize the model."
+
+    logging.info('Finished loading pretrained checkpoint from %s',
+                 ckpt_dir_or_file)
+
+  def build_inputs(self, params, input_context=None):
+    """Build input dataset."""
+    decoder_cfg = params.decoder.get()
+    if params.decoder.type == 'simple_decoder':
+      decoder = tf_example_decoder.TfExampleDecoder(
+          include_mask=self._task_config.model.include_mask,
+          regenerate_source_id=decoder_cfg.regenerate_source_id)
+    elif params.decoder.type == 'label_map_decoder':
+      decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
+          label_map=decoder_cfg.label_map,
+          include_mask=self._task_config.model.include_mask,
+          regenerate_source_id=decoder_cfg.regenerate_source_id)
+    else:
+      raise ValueError('Unknown decoder type: {}!'.format(params.decoder.type))
+
+    parser = maskrcnn_input.Parser(
+        output_size=self.task_config.model.input_size[:2],
+        min_level=self.task_config.model.min_level,
+        max_level=self.task_config.model.max_level,
+        num_scales=self.task_config.model.anchor.num_scales,
+        aspect_ratios=self.task_config.model.anchor.aspect_ratios,
+        anchor_size=self.task_config.model.anchor.anchor_size,
+        dtype=params.dtype,
+        rpn_match_threshold=params.parser.rpn_match_threshold,
+        rpn_unmatched_threshold=params.parser.rpn_unmatched_threshold,
+        rpn_batch_size_per_im=params.parser.rpn_batch_size_per_im,
+        rpn_fg_fraction=params.parser.rpn_fg_fraction,
+        aug_rand_hflip=params.parser.aug_rand_hflip,
+        aug_scale_min=params.parser.aug_scale_min,
+        aug_scale_max=params.parser.aug_scale_max,
+        skip_crowd_during_training=params.parser.skip_crowd_during_training,
+        max_num_instances=params.parser.max_num_instances,
+        include_mask=self._task_config.model.include_mask,
+        mask_crop_size=params.parser.mask_crop_size)
+
+    reader = input_reader.InputReader(
+        params,
+        dataset_fn=tf.data.TFRecordDataset,
+        decoder_fn=decoder.decode,
+        parser_fn=parser.parse_fn(params.is_training))
+    dataset = reader.read(input_context=input_context)
+
+    return dataset
+
+  def build_losses(self, outputs, labels, aux_losses=None):
+    """Build Mask R-CNN losses."""
+    params = self.task_config
+
+    rpn_score_loss_fn = maskrcnn_losses.RpnScoreLoss(
+        tf.shape(outputs['box_outputs'])[1])
+    rpn_box_loss_fn = maskrcnn_losses.RpnBoxLoss(
+        params.losses.rpn_huber_loss_delta)
+    rpn_score_loss = tf.reduce_mean(
+        rpn_score_loss_fn(
+            outputs['rpn_scores'], labels['rpn_score_targets']))
+    rpn_box_loss = tf.reduce_mean(
+        rpn_box_loss_fn(
+            outputs['rpn_boxes'], labels['rpn_box_targets']))
+
+    frcnn_cls_loss_fn = maskrcnn_losses.FastrcnnClassLoss()
+    frcnn_box_loss_fn = maskrcnn_losses.FastrcnnBoxLoss(
+        params.losses.frcnn_huber_loss_delta)
+    frcnn_cls_loss = tf.reduce_mean(
+        frcnn_cls_loss_fn(
+            outputs['class_outputs'], outputs['class_targets']))
+    frcnn_box_loss = tf.reduce_mean(
+        frcnn_box_loss_fn(
+            outputs['box_outputs'],
+            outputs['class_targets'],
+            outputs['box_targets']))
+
+    if params.model.include_mask:
+      mask_loss_fn = maskrcnn_losses.MaskrcnnLoss()
+      mask_loss = tf.reduce_mean(
+          mask_loss_fn(
+              outputs['mask_outputs'],
+              outputs['mask_targets'],
+              outputs['mask_class_targets']))
+    else:
+      mask_loss = 0.0
+
+    model_loss = (
+        params.losses.rpn_score_weight * rpn_score_loss +
+        params.losses.rpn_box_weight * rpn_box_loss +
+        params.losses.frcnn_class_weight * frcnn_cls_loss +
+        params.losses.frcnn_box_weight * frcnn_box_loss +
+        params.losses.mask_weight * mask_loss)
+
+    total_loss = model_loss
+    if aux_losses:
+      reg_loss = tf.reduce_sum(aux_losses)
+      total_loss = model_loss + reg_loss
+
+    losses = {
+        'total_loss': total_loss,
+        'rpn_score_loss': rpn_score_loss,
+        'rpn_box_loss': rpn_box_loss,
+        'frcnn_cls_loss': frcnn_cls_loss,
+        'frcnn_box_loss': frcnn_box_loss,
+        'mask_loss': mask_loss,
+        'model_loss': model_loss,
+    }
+    return losses
+
+  def build_metrics(self, training=True):
+    """Build detection metrics."""
+    metrics = []
+    if training:
+      metric_names = [
+          'total_loss',
+          'rpn_score_loss',
+          'rpn_box_loss',
+          'frcnn_cls_loss',
+          'frcnn_box_loss',
+          'mask_loss',
+          'model_loss'
+      ]
+      for name in metric_names:
+        metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32))
+
+    else:
+      self.coco_metric = coco_evaluator.COCOEvaluator(
+          annotation_file=self._task_config.annotation_file,
+          include_mask=self._task_config.model.include_mask)
+
+    return metrics
+
+  def train_step(self, inputs, model, optimizer, metrics=None):
+    """Does forward and backward.
+
+    Args:
+      inputs: an `(images, labels)` pair, where `labels` is a dictionary of
+        input tensors.
+      model: the model, forward pass definition.
+      optimizer: the optimizer for this training step.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs holding the total loss and, when metrics are given,
+      each metric's current result.
+    """
+    images, labels = inputs
+    num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
+    with tf.GradientTape() as tape:
+      outputs = model(
+          images,
+          image_shape=labels['image_info'][:, 1, :],
+          anchor_boxes=labels['anchor_boxes'],
+          gt_boxes=labels['gt_boxes'],
+          gt_classes=labels['gt_classes'],
+          gt_masks=(labels['gt_masks'] if self.task_config.model.include_mask
+                    else None),
+          training=True)
+      # Cast every output to float32 before computing the losses.
+      outputs = tf.nest.map_structure(
+          lambda x: tf.cast(x, tf.float32), outputs)
+
+      # Computes per-replica loss.
+      losses = self.build_losses(
+          outputs=outputs, labels=labels, aux_losses=model.losses)
+      # Only the total loss is differentiated; it is divided by the number of
+      # replicas in sync first.
+      scaled_loss = losses['total_loss'] / num_replicas
+
+      # For mixed_precision policy, when LossScaleOptimizer is used, loss is
+      # scaled for numerical stability.
+      if isinstance(
+          optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+        scaled_loss = optimizer.get_scaled_loss(scaled_loss)
+
+    tvars = model.trainable_variables
+    grads = tape.gradient(scaled_loss, tvars)
+    # Scales back gradient when LossScaleOptimizer is used.
+    if isinstance(
+        optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+      grads = optimizer.get_unscaled_gradients(grads)
+
+    # Apply gradient clipping.
+    if self.task_config.gradient_clip_norm > 0:
+      grads, _ = tf.clip_by_global_norm(
+          grads, self.task_config.gradient_clip_norm)
+    optimizer.apply_gradients(list(zip(grads, tvars)))
+
+    logs = {self.loss: losses['total_loss']}
+
+    if metrics:
+      # Each metric is looked up in `losses` by its own name.
+      for m in metrics:
+        m.update_state(losses[m.name])
+        logs.update({m.name: m.result()})
+
+    return logs
+
+  def validation_step(self, inputs, model, metrics=None):
+    """Validatation step.
+
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs.
+    """
+    images, labels = inputs
+
+    outputs = model(
+        images,
+        anchor_boxes=labels['anchor_boxes'],
+        image_shape=labels['image_info'][:, 1, :],
+        training=False)
+
+    logs = {self.loss: 0}
+    coco_model_outputs = {
+        'detection_boxes': outputs['detection_boxes'],
+        'detection_scores': outputs['detection_scores'],
+        'detection_classes': outputs['detection_classes'],
+        'num_detections': outputs['num_detections'],
+        'source_id': labels['groundtruths']['source_id'],
+        'image_info': labels['image_info']
+    }
+    if self.task_config.model.include_mask:
+      coco_model_outputs.update({
+          'detection_masks': outputs['detection_masks'],
+      })
+    logs.update({
+        self.coco_metric.name: (labels['groundtruths'], coco_model_outputs)
+    })
+    return logs
+
+  def aggregate_logs(self, state=None, step_outputs=None):
+    if state is None:
+      self.coco_metric.reset_states()
+      state = self.coco_metric
+    self.coco_metric.update_state(
+        step_outputs[self.coco_metric.name][0],
+        step_outputs[self.coco_metric.name][1])
+    return state
+
+  def reduce_aggregated_logs(self, aggregated_logs):
+    return self.coco_metric.result()
--- a/official/vision/beta/tasks/maskrcnn_test.py
+++ b/official/vision/beta/tasks/maskrcnn_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MaskRCNN task."""
+# pylint: disable=unused-import
+from absl.testing import parameterized
+import orbit
+import tensorflow as tf
+
+from official.core import exp_factory
+from official.modeling import optimization
+from official.vision import beta
+from official.vision.beta.tasks import maskrcnn
+
+
+# NOTE(review): despite the class and method names, this test exercises
+# `maskrcnn.MaskRCNNTask`; the names look copied from the RetinaNet test.
+class RetinaNetTaskTest(parameterized.TestCase, tf.test.TestCase):
+  """Smoke test for one Mask R-CNN train or validation step."""
+
+  @parameterized.parameters(
+      ("fasterrcnn_resnetfpn_coco", True),
+      ("fasterrcnn_resnetfpn_coco", False),
+      ("maskrcnn_resnetfpn_coco", True),
+      ("maskrcnn_resnetfpn_coco", False),
+  )
+  def test_retinanet_task_train(self, test_config, is_training):
+    """Mask R-CNN task test for training and val using toy configs."""
+    config = exp_factory.get_exp_config(test_config)
+    # NOTE(review): this builds a Policy object and discards it, so it does
+    # not change the global policy. Presumably `set_policy` was intended;
+    # confirm before changing.
+    tf.keras.mixed_precision.experimental.Policy("mixed_bfloat16")
+    # modify config to suit local testing
+    config.trainer.steps_per_loop = 1
+    config.task.train_data.global_batch_size = 2
+    config.task.model.input_size = [384, 384, 3]
+    config.train_steps = 2
+    config.task.train_data.shuffle_buffer_size = 10
+    # NOTE(review): the input paths below are hard-coded absolute paths, so
+    # the test needs those files to be readable where it runs.
+    config.task.train_data.input_path = "/readahead/200M/placer/prod/home/snaggletooth/test/data/coco/train-00000-of-00256.tfrecord"
+    config.task.validation_data.global_batch_size = 2
+    config.task.validation_data.input_path = "/readahead/200M/placer/prod/home/snaggletooth/test/data/coco/val-00000-of-00032.tfrecord"
+
+    task = maskrcnn.MaskRCNNTask(config.task)
+    model = task.build_model()
+    metrics = task.build_metrics(training=is_training)
+
+    strategy = tf.distribute.get_strategy()
+
+    # Read from the split that matches the step under test.
+    data_config = config.task.train_data if is_training else config.task.validation_data
+    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
+                                                   data_config)
+    iterator = iter(dataset)
+    opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
+    optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
+
+    # The step only has to run without raising; its logs are not inspected.
+    if is_training:
+      task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    else:
+      task.validation_step(next(iterator), model, metrics=metrics)
+
+
+# Run the test suite when this file is executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/vision/beta/tasks/retinanet.py
+++ b/official/vision/beta/tasks/retinanet.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RetinaNet task definition."""
+
+from absl import logging
+import tensorflow as tf
+from official.core import base_task
+from official.core import input_reader
+from official.core import task_factory
+from official.vision.beta.configs import retinanet as exp_cfg
+from official.vision.beta.dataloaders import retinanet_input
+from official.vision.beta.dataloaders import tf_example_decoder
+from official.vision.beta.dataloaders import tf_example_label_map_decoder
+from official.vision.beta.evaluation import coco_evaluator
+from official.vision.beta.losses import retinanet_losses
+from official.vision.beta.modeling import factory
+
+
+@task_factory.register_task_cls(exp_cfg.RetinaNetTask)
+class RetinaNetTask(base_task.Task):
+  """A single-replica view of training procedure.
+
+  RetinaNet task provides artifacts for training/evaluation procedures,
+  including loading/iterating over Datasets, initializing the model,
+  calculating the loss, post-processing, and customized metrics with
+  reduction.
+  """
+
+  def build_model(self):
+    """Build RetinaNet model."""
+
+    input_specs = tf.keras.layers.InputSpec(
+        shape=[None] + self.task_config.model.input_size)
+
+    l2_weight_decay = self.task_config.losses.l2_weight_decay
+    # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss.
+    # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2)
+    # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss)
+    l2_regularizer = (tf.keras.regularizers.l2(
+        l2_weight_decay / 2.0) if l2_weight_decay else None)
+
+    model = factory.build_retinanet(
+        input_specs=input_specs,
+        model_config=self.task_config.model,
+        l2_regularizer=l2_regularizer)
+    return model
+
+  def initialize(self, model: tf.keras.Model):
+    """Loading pretrained checkpoint."""
+    if not self.task_config.init_checkpoint:
+      return
+
+    ckpt_dir_or_file = self.task_config.init_checkpoint
+    if tf.io.gfile.isdir(ckpt_dir_or_file):
+      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
+
+    # Restoring checkpoint.
+    if self.task_config.init_checkpoint_modules == 'all':
+      ckpt = tf.train.Checkpoint(**model.checkpoint_items)
+      status = ckpt.read(ckpt_dir_or_file)
+      status.assert_consumed()
+    elif self.task_config.init_checkpoint_modules == 'backbone':
+      ckpt = tf.train.Checkpoint(backbone=model.backbone)
+      status = ckpt.read(ckpt_dir_or_file)
+      status.expect_partial().assert_existing_objects_matched()
+    else:
+      assert "Only 'all' or 'backbone' can be used to initialize the model."
+
+    logging.info('Finished loading pretrained checkpoint from %s',
+                 ckpt_dir_or_file)
+
+  def build_inputs(self, params, input_context=None):
+    """Build input dataset."""
+    decoder_cfg = params.decoder.get()
+    if params.decoder.type == 'simple_decoder':
+      decoder = tf_example_decoder.TfExampleDecoder(
+          regenerate_source_id=decoder_cfg.regenerate_source_id)
+    elif params.decoder.type == 'label_map_decoder':
+      decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
+          label_map=decoder_cfg.label_map,
+          regenerate_source_id=decoder_cfg.regenerate_source_id)
+    else:
+      raise ValueError('Unknown decoder type: {}!'.format(params.decoder.type))
+    decoder_cfg = params.decoder.get()
+    if params.decoder.type == 'simple_decoder':
+      decoder = tf_example_decoder.TfExampleDecoder(
+          regenerate_source_id=decoder_cfg.regenerate_source_id)
+    elif params.decoder.type == 'label_map_decoder':
+      decoder = tf_example_decoder.TfExampleDecoderLabelMap(
+          label_map=decoder_cfg.label_map,
+          regenerate_source_id=decoder_cfg.regenerate_source_id)
+    else:
+      raise ValueError('Unknown decoder type: {}!'.format(params.decoder.type))
+    parser = retinanet_input.Parser(
+        output_size=self.task_config.model.input_size[:2],
+        min_level=self.task_config.model.min_level,
+        max_level=self.task_config.model.max_level,
+        num_scales=self.task_config.model.anchor.num_scales,
+        aspect_ratios=self.task_config.model.anchor.aspect_ratios,
+        anchor_size=self.task_config.model.anchor.anchor_size,
+        dtype=params.dtype,
+        match_threshold=params.parser.match_threshold,
+        unmatched_threshold=params.parser.unmatched_threshold,
+        aug_rand_hflip=params.parser.aug_rand_hflip,
+        aug_scale_min=params.parser.aug_scale_min,
+        aug_scale_max=params.parser.aug_scale_max,
+        skip_crowd_during_training=params.parser.skip_crowd_during_training,
+        max_num_instances=params.parser.max_num_instances)
+
+    reader = input_reader.InputReader(
+        params,
+        dataset_fn=tf.data.TFRecordDataset,
+        decoder_fn=decoder.decode,
+        parser_fn=parser.parse_fn(params.is_training))
+    dataset = reader.read(input_context=input_context)
+
+    return dataset
+
+  def build_losses(self, outputs, labels, aux_losses=None):
+    """Build RetinaNet losses."""
+    params = self.task_config
+    cls_loss_fn = retinanet_losses.FocalLoss(
+        alpha=params.losses.focal_loss_alpha,
+        gamma=params.losses.focal_loss_gamma,
+        num_classes=params.model.num_classes,
+        reduction=tf.keras.losses.Reduction.SUM)
+    box_loss_fn = retinanet_losses.RetinanetBoxLoss(
+        params.losses.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
+
+    # Sums all positives in a batch for normalization and avoids zero
+    # num_positives_sum, which would lead to inf loss during training
+    cls_sample_weight = labels['cls_weights']
+    box_sample_weight = labels['box_weights']
+    num_positives = tf.reduce_sum(box_sample_weight) + 1.0
+    cls_sample_weight = cls_sample_weight / num_positives
+    box_sample_weight = box_sample_weight / num_positives
+    cls_loss = cls_loss_fn(
+        y_true=labels['cls_targets'],
+        y_pred=outputs['cls_outputs'],
+        sample_weight=cls_sample_weight)
+    box_loss = box_loss_fn(
+        y_true=labels['box_targets'],
+        y_pred=outputs['box_outputs'],
+        sample_weight=box_sample_weight)
+
+    model_loss = cls_loss + params.losses.box_loss_weight * box_loss
+
+    total_loss = model_loss
+    if aux_losses:
+      reg_loss = tf.reduce_sum(aux_losses)
+      total_loss = model_loss + reg_loss
+
+    return total_loss, cls_loss, box_loss, model_loss
+
+  def build_metrics(self, training=True):
+    """Build detection metrics."""
+    metrics = []
+    metric_names = ['total_loss', 'cls_loss', 'box_loss', 'model_loss']
+    for name in metric_names:
+      metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32))
+
+    if not training:
+      self.coco_metric = coco_evaluator.COCOEvaluator(
+          annotation_file=None, include_mask=False)
+
+    return metrics
+
+  def train_step(self, inputs, model, optimizer, metrics=None):
+    """Does forward and backward.
+
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the model, forward pass definition.
+      optimizer: the optimizer for this training step.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs.
+    """
+    features, labels = inputs
+    num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
+    with tf.GradientTape() as tape:
+      outputs = model(features, training=True)
+      outputs = tf.nest.map_structure(
+          lambda x: tf.cast(x, tf.float32), outputs)
+
+      # Computes per-replica loss.
+      loss, cls_loss, box_loss, model_loss = self.build_losses(
+          outputs=outputs, labels=labels, aux_losses=model.losses)
+      scaled_loss = loss / num_replicas
+
+      # For mixed_precision policy, when LossScaleOptimizer is used, loss is
+      # scaled for numerical stability.
+      if isinstance(
+          optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+        scaled_loss = optimizer.get_scaled_loss(scaled_loss)
+
+    tvars = model.trainable_variables
+    grads = tape.gradient(scaled_loss, tvars)
+    # Scales back gradient when LossScaleOptimizer is used.
+    if isinstance(
+        optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+      grads = optimizer.get_unscaled_gradients(grads)
+
+    # Apply gradient clipping.
+    if self.task_config.gradient_clip_norm > 0:
+      grads, _ = tf.clip_by_global_norm(
+          grads, self.task_config.gradient_clip_norm)
+    optimizer.apply_gradients(list(zip(grads, tvars)))
+
+    logs = {self.loss: loss}
+
+    all_losses = {
+        'total_loss': loss,
+        'cls_loss': cls_loss,
+        'box_loss': box_loss,
+        'model_loss': model_loss,
+    }
+    if metrics:
+      for m in metrics:
+        m.update_state(all_losses[m.name])
+        logs.update({m.name: m.result()})
+
+    return logs
+
+  def validation_step(self, inputs, model, metrics=None):
+    """Validatation step.
+
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs.
+    """
+    features, labels = inputs
+
+    outputs = model(features, anchor_boxes=labels['anchor_boxes'],
+                    image_shape=labels['image_info'][:, 1, :],
+                    training=False)
+    loss, cls_loss, box_loss, model_loss = self.build_losses(
+        outputs=outputs, labels=labels, aux_losses=model.losses)
+    logs = {self.loss: loss}
+
+    all_losses = {
+        'total_loss': loss,
+        'cls_loss': cls_loss,
+        'box_loss': box_loss,
+        'model_loss': model_loss,
+    }
+
+    coco_model_outputs = {
+        'detection_boxes': outputs['detection_boxes'],
+        'detection_scores': outputs['detection_scores'],
+        'detection_classes': outputs['detection_classes'],
+        'num_detections': outputs['num_detections'],
+        'source_id': labels['groundtruths']['source_id'],
+        'image_info': labels['image_info']
+    }
+    logs.update({self.coco_metric.name: (labels['groundtruths'],
+                                         coco_model_outputs)})
+    if metrics:
+      for m in metrics:
+        m.update_state(all_losses[m.name])
+        logs.update({m.name: m.result()})
+    return logs
+
+  def aggregate_logs(self, state=None, step_outputs=None):
+    if state is None:
+      self.coco_metric.reset_states()
+      state = self.coco_metric
+    self.coco_metric.update_state(step_outputs[self.coco_metric.name][0],
+                                  step_outputs[self.coco_metric.name][1])
+    return state
+
+  def reduce_aggregated_logs(self, aggregated_logs):
+    return self.coco_metric.result()
--- a/official/vision/beta/tasks/retinanet_test.py
+++ b/official/vision/beta/tasks/retinanet_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RetinaNet task."""
+# pylint: disable=unused-import
+from absl.testing import parameterized
+import orbit
+import tensorflow as tf
+
+from official.core import exp_factory
+from official.modeling import optimization
+from official.vision import beta
+from official.vision.beta.tasks import retinanet
+
+
+class RetinaNetTaskTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      ("retinanet_resnetfpn_coco", True),
+      ("retinanet_spinenet_coco", True),
+  )
+  def test_retinanet_task_train(self, test_config, is_training):
+    """RetinaNet task test for training and val using toy configs."""
+    config = exp_factory.get_exp_config(test_config)
+    # modify config to suit local testing
+    config.trainer.steps_per_loop = 1
+    config.task.train_data.global_batch_size = 2
+    config.task.validation_data.global_batch_size = 2
+    config.task.train_data.shuffle_buffer_size = 4
+    config.task.validation_data.shuffle_buffer_size = 4
+    config.train_steps = 2
+
+    task = retinanet.RetinaNetTask(config.task)
+    model = task.build_model()
+    metrics = task.build_metrics()
+
+    strategy = tf.distribute.get_strategy()
+
+    data_config = config.task.train_data if is_training else config.task.validation_data
+    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
+                                                   data_config)
+    iterator = iter(dataset)
+    opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
+    optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
+
+    if is_training:
+      task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    else:
+      task.validation_step(next(iterator), model, metrics=metrics)
+
+
+# Run the test cases in this module when it is executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/vision/beta/train.py
+++ b/official/vision/beta/train.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TFM vision training driver."""
+
+from absl import app
+from absl import flags
+import gin
+
+from official.common import flags as tfm_flags
+from official.common import registry_imports  # pylint: disable=unused-import
+from official.core import task_factory
+from official.core import train_lib
+from official.core import train_utils
+from official.modeling import performance
+from official.utils.misc import distribution_utils
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
+  params = train_utils.parse_configuration(FLAGS)
+  model_dir = FLAGS.model_dir
+  if 'train' in FLAGS.mode:
+    # Pure eval modes do not output yaml files. Otherwise continuous eval job
+    # may race against the train job for writing the same file.
+    train_utils.serialize_config(params, model_dir)
+
+  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
+  # can have significant impact on model speeds by utilizing float16 in case of
+  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
+  # dtype is float16
+  if params.runtime.mixed_precision_dtype:
+    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
+                                           params.runtime.loss_scale)
+  distribution_strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=params.runtime.distribution_strategy,
+      all_reduce_alg=params.runtime.all_reduce_alg,
+      num_gpus=params.runtime.num_gpus,
+      tpu_address=params.runtime.tpu)
+  with distribution_strategy.scope():
+    task = task_factory.get_task(params.task, logging_dir=model_dir)
+
+  train_lib.run_experiment(
+      distribution_strategy=distribution_strategy,
+      task=task,
+      mode=FLAGS.mode,
+      params=params,
+      model_dir=model_dir)
+
+if __name__ == '__main__':
+  # Flags are defined before `app.run` is called, which then invokes `main`.
+  tfm_flags.define_flags()
+  app.run(main)