Merge pull request #10251 from PurdueDualityLab:loss_fn_pr

PiperOrigin-RevId: 396512110

Merge pull request #10251 from PurdueDualityLab:loss_fn_pr
PiperOrigin-RevId: 396512110
c6d7d57d · A. Unique TensorFlower · 31fb7a65 · 7f90664e · c6d7d57d · c6d7d57d
Commit c6d7d57d authored Sep 13, 2021 by A. Unique TensorFlower
14 changed files
--- a/official/vision/beta/projects/yolo/losses/__init__.py
+++ b/official/vision/beta/projects/yolo/losses/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/official/vision/beta/projects/yolo/losses/yolo_loss.py
+++ b/official/vision/beta/projects/yolo/losses/yolo_loss.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Yolo Loss function."""
+import abc
+import collections
+import functools
+import tensorflow as tf
+from official.vision.beta.projects.yolo.ops import box_ops
+from official.vision.beta.projects.yolo.ops import loss_utils
+from official.vision.beta.projects.yolo.ops import math_ops
+class YoloLossBase(object, metaclass=abc.ABCMeta):
+  """Parameters for the YOLO loss functions used at each detection generator.
+  This base class implements the base functionality required to implement a Yolo
+  Loss function.
+  """
+  def __init__(self,
+               classes,
+               mask,
+               anchors,
+               path_stride=1,
+               ignore_thresh=0.7,
+               truth_thresh=1.0,
+               loss_type='ciou',
+               iou_normalizer=1.0,
+               cls_normalizer=1.0,
+               obj_normalizer=1.0,
+               label_smoothing=0.0,
+               objectness_smooth=True,
+               update_on_repeat=False,
+               box_type='original',
+               scale_x_y=1.0,
+               max_delta=10):
+    """Loss Function Initialization.
+    Args:
+      classes: `int` for the number of classes.
+      mask: `List[int]` describing this specific model output level; its
+        length is stored as the number of entries handled at this level.
+        (Presumably the indexes into `anchors` used at this level; confirm
+        against `loss_utils.GridGenerator`.)
+      anchors: `List[List[int]]` for the anchor boxes that are used in the model
+        at all levels. For anchor free prediction set the anchor list to be the
+        same as the image resolution.
+      path_stride: `int` for how much to scale this level to get the original
+        input shape.
+      ignore_thresh: `float` for the IOU value over which the loss is not
+        propagated, and a detection is assumed to have been made.
+      truth_thresh: `float` for the IOU value over which the loss is propagated
+        despite a detection being made.
+      loss_type: `str` for the type of iou loss to use within {ciou, diou,
+        giou, iou}.
+      iou_normalizer: `float` for how much to scale the loss on the IOU or the
+        boxes.
+      cls_normalizer: `float` for how much to scale the loss on the classes.
+      obj_normalizer: `float` for how much to scale loss on the detection map.
+      label_smoothing: `float` for how much to smooth the loss on the classes.
+      objectness_smooth: `float` for how much to smooth the loss on the
+        detection map. A `bool` is also accepted; the value is converted with
+        `float()`.
+      update_on_repeat: `bool` for whether to replace with the newest or the
+        best value when an index is consumed by multiple objects.
+      box_type: `str` for which box scaling type to use.
+      scale_x_y: `float` value indicating how far each pixel can see outside
+        of its containment of 1.0. A value of 1.2 indicates there is a 20%
+        extended radius around each pixel that this specific pixel can
+        predict values for a center at. The center can range from 0 - value/2
+        to 1 + value/2. This value is set in the yolo filter and reused here.
+      max_delta: gradient clipping to apply to the box loss.
+    """
+    self._loss_type = loss_type
+    self._classes = tf.constant(tf.cast(classes, dtype=tf.int32))
+    self._num = tf.cast(len(mask), dtype=tf.int32)
+    self._truth_thresh = truth_thresh
+    self._ignore_thresh = ignore_thresh
+    self._masks = mask
+    self._anchors = anchors
+    self._iou_normalizer = iou_normalizer
+    self._cls_normalizer = cls_normalizer
+    self._obj_normalizer = obj_normalizer
+    self._scale_x_y = scale_x_y
+    self._max_delta = max_delta
+    self._label_smoothing = tf.cast(label_smoothing, tf.float32)
+    self._objectness_smooth = float(objectness_smooth)
+    self._update_on_repeat = update_on_repeat
+    self._box_type = box_type
+    self._path_stride = path_stride
+    # Bind the per-level decoding parameters once so that subclasses only need
+    # to supply the tensors when decoding boxes.
+    box_kwargs = dict(
+        stride=self._path_stride,
+        scale_xy=self._scale_x_y,
+        box_type=self._box_type,
+        max_delta=self._max_delta)
+    self._decode_boxes = functools.partial(
+        loss_utils.get_predicted_box, **box_kwargs)
+    # Default no-op search: returning None for the max IOU makes
+    # `_tiled_global_box_search` pass the confidence map through unchanged.
+    # Subclasses may replace it in `_build_per_path_attributes`.
+    self._search_pairs = lambda pred_boxes, pred_classes, boxes, classes, scale, yxyx: (None, None, None, None)  # pylint:disable=line-too-long
+    # Build the subclass specific attributes exactly once; the hook must run
+    # after every attribute above has been assigned.
+    self._build_per_path_attributes()
+  def box_loss(self, true_box, pred_box, darknet=False):
+    """Compute the IOU between two box sets and the matching box loss.
+    Args:
+      true_box: ground truth boxes handed to the selected `box_ops` function.
+      pred_box: predicted boxes handed to the selected `box_ops` function.
+      darknet: `bool` forwarded to `box_ops.compute_ciou` only.
+    Returns:
+      iou: the plain IOU value reported by the selected function.
+      liou: the IOU variant the loss is built from.
+      loss_box: `1 - liou`.
+    """
+    selected = self._loss_type
+    if selected == 'ciou':
+      iou, liou = box_ops.compute_ciou(true_box, pred_box, darknet=darknet)
+    elif selected == 'giou':
+      iou, liou = box_ops.compute_giou(true_box, pred_box)
+    else:
+      # Every other loss type falls back to the plain IOU.
+      iou = box_ops.compute_iou(true_box, pred_box)
+      liou = iou
+    return iou, liou, 1 - liou
+  def _tiled_global_box_search(self,
+                               pred_boxes,
+                               pred_classes,
+                               boxes,
+                               classes,
+                               true_conf,
+                               smoothed,
+                               scale=None):
+    """Search of all groundtruths to associate groundtruths to predictions.
+    Args:
+      pred_boxes: predicted boxes forwarded to `self._search_pairs`; its dtype
+        is used for the ignore mask.
+      pred_classes: predicted class scores forwarded to `self._search_pairs`.
+      boxes: ground truth boxes forwarded to `self._search_pairs`.
+      classes: ground truth classes forwarded to `self._search_pairs`.
+      true_conf: ground truth confidence map to update or pass through.
+      smoothed: `bool`, when True the confidence map is overwritten with the
+        smoothed max IOU wherever that IOU exceeds the ignore threshold; when
+        False an ignore mask is built instead.
+      scale: optional value forwarded to `self._search_pairs`.
+    Returns:
+      true_conf: the (possibly updated) confidence map.
+      obj_mask: mask for the confidence loss; all ones when no search result
+        is available or when `smoothed` is True.
+    """
+    # Search all predictions against ground truths to find matching boxes for
+    # each pixel.
+    _, _, iou_max, _ = self._search_pairs(
+        pred_boxes, pred_classes, boxes, classes, scale=scale, yxyx=True)
+    # The default search installed in `__init__` returns None: nothing to mask.
+    if iou_max is None:
+      return true_conf, tf.ones_like(true_conf)
+    # Find the exact indexes to ignore and keep.
+    ignore_mask = tf.cast(iou_max < self._ignore_thresh, pred_boxes.dtype)
+    iou_mask = iou_max > self._ignore_thresh
+    if not smoothed:
+      # Ignore all pixels where a box was not supposed to be predicted but a
+      # high confidence box was predicted.
+      obj_mask = true_conf + (1 - true_conf) * ignore_mask
+    else:
+      # Replace pixels in the true confidence map with the max iou predicted
+      # within that cell.
+      obj_mask = tf.ones_like(true_conf)
+      iou_ = (1 - self._objectness_smooth) + self._objectness_smooth * iou_max
+      iou_ = tf.where(iou_max > 0, iou_, tf.zeros_like(iou_))
+      true_conf = tf.where(iou_mask, iou_, true_conf)
+    # Stop gradient so while loop is not tracked.
+    obj_mask = tf.stop_gradient(obj_mask)
+    true_conf = tf.stop_gradient(true_conf)
+    return true_conf, obj_mask
+  def __call__(self, true_counts, inds, y_true, boxes, classes, y_pred):
+    """Call function to compute the loss and a set of metrics per FPN level.
+    Args:
+      true_counts: `Tensor` of shape [batchsize, height, width, num_anchors]
+        representing how many boxes are in a given pixel [j, i] in the output
+        map.
+      inds: `Tensor` of shape [batchsize, None, 3] indicating the location [j,
+        i] that a given box is associated with in the FPN prediction map.
+      y_true: `Tensor` of shape [batchsize, None, 8] indicating the actual box
+        associated with each index in the inds tensor list.
+      boxes: `Tensor` of shape [batchsize, None, 4] indicating the original
+        ground truth boxes for each image as they came from the decoder used for
+        bounding box search.
+      classes: `Tensor` of shape [batchsize, None, 1] indicating the original
+        ground truth classes for each image as they came from the decoder used
+        for bounding box search.
+      y_pred: `Tensor` of shape [batchsize, height, width, output_depth] holding
+        the models output at a specific FPN level.
+    Returns:
+      loss: `float` for the actual loss.
+      box_loss: `float` loss on the boxes used for metrics, rescaled by
+        `0.05 / iou_normalizer` with the gradient stopped.
+      conf_loss: `float` loss on the confidence used for metrics.
+      class_loss: `float` loss on the classes used for metrics.
+      mean_loss: `float` mean loss as returned by `_compute_loss`, used for
+        metrics.
+      avg_iou: `float` metric for the average iou between predictions and ground
+        truth.
+      avg_obj: `float` metric for the average confidence of the model for
+        predictions.
+    """
+    (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf, ind_mask,
+     grid_mask) = self._compute_loss(true_counts, inds, y_true, boxes, classes,
+                                     y_pred)
+    # Temporary metrics
+    box_loss = tf.stop_gradient(0.05 * box_loss / self._iou_normalizer)
+    # Metrics are computed here to save time and resources.
+    sigmoid_conf = tf.stop_gradient(tf.sigmoid(pred_conf))
+    iou = tf.stop_gradient(iou)
+    avg_iou = loss_utils.average_iou(
+        loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), iou))
+    avg_obj = loss_utils.average_iou(
+        tf.squeeze(sigmoid_conf, axis=-1) * grid_mask)
+    return (loss, box_loss, conf_loss, class_loss, mean_loss,
+            tf.stop_gradient(avg_iou), tf.stop_gradient(avg_obj))
+  @abc.abstractmethod
+  def _build_per_path_attributes(self):
+    """Additional initialization required for each YOLO loss version.
+    Invoked at the end of `__init__`, after all constructor arguments have been
+    stored on the instance.
+    """
+    ...
+  @abc.abstractmethod
+  def _compute_loss(self, true_counts, inds, y_true, boxes, classes, y_pred):
+    """The actual logic to apply to the raw model for optimization.
+    Implementations receive the same arguments as `__call__` and must return
+    the 9-tuple `(loss, box_loss, conf_loss, class_loss, mean_loss, iou,
+    pred_conf, ind_mask, grid_mask)` that `__call__` unpacks to build its
+    metrics.
+    """
+    ...
+  def post_path_aggregation(self, loss, ground_truths, predictions):  # pylint:disable=unused-argument
+    """This method allows for post processing of a loss value.
+    After the loss has been aggregated across all the FPN levels some post
+    processing may need to occur to properly scale the loss. The default
+    behavior is to pass the loss through with no alterations.
+    Args:
+      loss: `tf.float` scalar for the actual loss.
+      ground_truths: `Dict` holding all the ground truth tensors.
+      predictions: `Dict` holding all the predicted values.
+    Returns:
+      loss: `tf.float` scalar for the scaled loss.
+    """
+    return loss
+  @abc.abstractmethod
+  def cross_replica_aggregation(self, loss, num_replicas_in_sync):
+    """This controls how the loss should be aggregated across replicas.
+    Args:
+      loss: the loss value for one FPN level on this replica.
+      num_replicas_in_sync: the number of replicas taking part in training.
+    Returns:
+      The loss value rescaled as required by the concrete loss type.
+    """
+    ...
+@tf.custom_gradient
+def grad_sigmoid(values):
+  """Identity in the forward pass that injects a sigmoid into the backward pass.
+  This is used in the Darknet loss when the chosen box type is the scaled
+  coordinate type, so that the propagated gradient matches that of the Darknet
+  Yolov4 model. The forward value is left untouched; only the back propagation
+  gains an extra step.
+  Args:
+    values: A tensor of any shape.
+  Returns:
+    values: The unaltered input tensor.
+    A custom gradient function that multiplies the incoming gradient by the
+      derivative of the sigmoid evaluated at `values`.
+  """
+  def _sigmoid_backward(upstream):
+    # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)).
+    activated = tf.math.sigmoid(values)
+    return upstream * activated * (1 - activated)
+  return values, _sigmoid_backward
+class DarknetLoss(YoloLossBase):
+  """This class implements the full logic for the standard Yolo models."""
+  def _build_per_path_attributes(self):
+    """Create the grid generator and, when enabled, the pair wise search.
+    The objects built here are used for box decoding and for the dynamic
+    association of ground truths with predictions.
+    """
+    grid_kwargs = dict(
+        masks=self._masks,
+        anchors=self._anchors,
+        scale_anchors=self._path_stride)
+    self._anchor_generator = loss_utils.GridGenerator(**grid_kwargs)
+    # Without a positive ignore threshold the no-op search from the base class
+    # stays in place.
+    if not self._ignore_thresh > 0.0:
+      return
+    self._search_pairs = loss_utils.PairWiseSearch(
+        iou_type='iou', any_match=True, min_conf=0.25)
+  def _compute_loss(self, true_counts, inds, y_true, boxes, classes, y_pred):
+    """Per FPN path loss logic used for Yolov3, Yolov4, and Yolo-Tiny.
+    Takes the same arguments as `YoloLossBase.__call__` and returns the tuple
+    `(loss, box_loss, conf_loss, class_loss, loss, iou, pred_conf, ind_mask,
+    grid_mask)`; the batch-mean total loss is returned twice because it also
+    serves as the mean loss metric.
+    """
+    if self._box_type == 'scaled':
+      # Darknet Model Propagates a sigmoid once in back prop so we replicate
+      # that behaviour
+      y_pred = grad_sigmoid(y_pred)
+    # Generate and store constants and format output.
+    shape = tf.shape(true_counts)
+    batch_size, width, height, num = shape[0], shape[1], shape[2], shape[3]
+    fwidth = tf.cast(width, tf.float32)
+    fheight = tf.cast(height, tf.float32)
+    grid_points, anchor_grid = self._anchor_generator(
+        width, height, batch_size, dtype=tf.float32)
+    # Cast all input components to float32 and stop gradient to save memory.
+    boxes = tf.stop_gradient(tf.cast(boxes, tf.float32))
+    classes = tf.stop_gradient(tf.cast(classes, tf.float32))
+    y_true = tf.stop_gradient(tf.cast(y_true, tf.float32))
+    true_counts = tf.stop_gradient(tf.cast(true_counts, tf.float32))
+    true_conf = tf.stop_gradient(tf.clip_by_value(true_counts, 0.0, 1.0))
+    grid_points = tf.stop_gradient(grid_points)
+    anchor_grid = tf.stop_gradient(anchor_grid)
+    # Split all the ground truths to use as separate items in loss computation.
+    (true_box, ind_mask, true_class, _, _) = tf.split(
+        y_true, [4, 1, 1, 1, 1], axis=-1)
+    true_conf = tf.squeeze(true_conf, axis=-1)
+    true_class = tf.squeeze(true_class, axis=-1)
+    # Keep the un-smoothed confidence map; it masks the class loss below.
+    grid_mask = true_conf
+    # Splits all predictions.
+    y_pred = tf.cast(
+        tf.reshape(y_pred, [batch_size, width, height, num, -1]), tf.float32)
+    pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)
+    # Decode the boxes to be used for loss compute.
+    _, _, pred_box = self._decode_boxes(
+        fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=True)
+    # If the ignore threshold is enabled, search all boxes and ignore all
+    # IOU values larger than the ignore threshold that are not in the
+    # noted ground truth list.
+    if self._ignore_thresh != 0.0:
+      (true_conf, obj_mask) = self._tiled_global_box_search(
+          pred_box,
+          tf.stop_gradient(tf.sigmoid(pred_class)),
+          boxes,
+          classes,
+          true_conf,
+          smoothed=self._objectness_smooth > 0)
+    # Build the one hot class list that are used for class loss.
+    true_class = tf.one_hot(
+        tf.cast(true_class, tf.int32),
+        depth=tf.shape(pred_class)[-1],
+        dtype=pred_class.dtype)
+    true_classes = tf.stop_gradient(loss_utils.apply_mask(ind_mask, true_class))
+    # Reorganize the one hot class list as a grid.
+    true_class = loss_utils.build_grid(
+        inds, true_classes, pred_class, ind_mask, update=False)
+    true_class = tf.stop_gradient(true_class)
+    # Use the class mask to find the number of objects located in
+    # each predicted grid cell/pixel.
+    counts = true_class
+    counts = tf.reduce_sum(counts, axis=-1, keepdims=True)
+    reps = tf.gather_nd(counts, inds, batch_dims=1)
+    reps = tf.squeeze(reps, axis=-1)
+    # Replace zero counts with one so the division below leaves them unscaled.
+    reps = tf.stop_gradient(tf.where(reps == 0.0, tf.ones_like(reps), reps))
+    # Compute the loss for only the cells in which the boxes are located.
+    pred_box = loss_utils.apply_mask(ind_mask,
+                                     tf.gather_nd(pred_box, inds, batch_dims=1))
+    iou, _, box_loss = self.box_loss(true_box, pred_box, darknet=True)
+    box_loss = loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), box_loss)
+    box_loss = math_ops.divide_no_nan(box_loss, reps)
+    box_loss = tf.cast(tf.reduce_sum(box_loss, axis=1), dtype=y_pred.dtype)
+    # Compute the sigmoid binary cross entropy for the class maps.
+    class_loss = tf.reduce_mean(
+        loss_utils.sigmoid_bce(
+            tf.expand_dims(true_class, axis=-1),
+            tf.expand_dims(pred_class, axis=-1), self._label_smoothing),
+        axis=-1)
+    # Apply normalization to the class losses.
+    if self._cls_normalizer < 1.0:
+      # Build a mask based on the true class locations.
+      cls_norm_mask = true_class
+      # Apply the classes weight to class indexes where one_hot is one.
+      class_loss *= ((1 - cls_norm_mask) + cls_norm_mask * self._cls_normalizer)
+    # Mask to the class loss and compute the sum over all the objects.
+    class_loss = tf.reduce_sum(class_loss, axis=-1)
+    class_loss = loss_utils.apply_mask(grid_mask, class_loss)
+    class_loss = math_ops.rm_nan_inf(class_loss, val=0.0)
+    class_loss = tf.cast(
+        tf.reduce_sum(class_loss, axis=(1, 2, 3)), dtype=y_pred.dtype)
+    # Compute the sigmoid binary cross entropy for the confidence maps.
+    bce = tf.reduce_mean(
+        loss_utils.sigmoid_bce(
+            tf.expand_dims(true_conf, axis=-1), pred_conf, 0.0),
+        axis=-1)
+    # Mask the confidence loss and take the sum across all the grid cells.
+    # `obj_mask` only exists when the search above ran, hence the same guard.
+    if self._ignore_thresh != 0.0:
+      bce = loss_utils.apply_mask(obj_mask, bce)
+    conf_loss = tf.cast(tf.reduce_sum(bce, axis=(1, 2, 3)), dtype=y_pred.dtype)
+    # Apply the weights to each loss.
+    box_loss *= self._iou_normalizer
+    conf_loss *= self._obj_normalizer
+    # Add all the losses together then take the mean over the batches.
+    loss = box_loss + class_loss + conf_loss
+    loss = tf.reduce_mean(loss)
+    # Reduce the mean of the losses to use as a metric.
+    box_loss = tf.reduce_mean(box_loss)
+    conf_loss = tf.reduce_mean(conf_loss)
+    class_loss = tf.reduce_mean(class_loss)
+    return (loss, box_loss, conf_loss, class_loss, loss, iou, pred_conf,
+            ind_mask, grid_mask)
+  def cross_replica_aggregation(self, loss, num_replicas_in_sync):
+    """Divide the loss by the number of replicas in sync.
+    This method is not specific to each loss path, but each loss type.
+    Args:
+      loss: the loss value for one FPN level on this replica.
+      num_replicas_in_sync: the number of replicas taking part in training.
+    Returns:
+      `loss / num_replicas_in_sync`.
+    """
+    return loss / num_replicas_in_sync
+class ScaledLoss(YoloLossBase):
+  """This class implements the full logic for the scaled Yolo models."""
+  def _build_per_path_attributes(self):
+    """Create the grid generator and, when enabled, the pair wise search.
+    The objects built here are used for box decoding and for the dynamic
+    association of ground truths with predictions. The search uses the same
+    IOU type as the box loss.
+    """
+    grid_kwargs = dict(
+        masks=self._masks,
+        anchors=self._anchors,
+        scale_anchors=self._path_stride)
+    self._anchor_generator = loss_utils.GridGenerator(**grid_kwargs)
+    # Without a positive ignore threshold the no-op search from the base class
+    # stays in place.
+    if not self._ignore_thresh > 0.0:
+      return
+    self._search_pairs = loss_utils.PairWiseSearch(
+        iou_type=self._loss_type, any_match=False, min_conf=0.25)
+  def _compute_loss(self, true_counts, inds, y_true, boxes, classes, y_pred):
+    """Per FPN path loss logic for Yolov4-csp, Yolov4-Large, and Yolov5.
+    Takes the same arguments as `YoloLossBase.__call__` and returns the tuple
+    `(loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf,
+    ind_mask, grid_mask)`, where `loss` is `mean_loss` multiplied by the batch
+    size.
+    """
+    # Generate shape constants.
+    shape = tf.shape(true_counts)
+    batch_size, width, height, num = shape[0], shape[1], shape[2], shape[3]
+    fwidth = tf.cast(width, tf.float32)
+    fheight = tf.cast(height, tf.float32)
+    # Cast the ground truth inputs to float32.
+    y_true = tf.cast(y_true, tf.float32)
+    true_counts = tf.cast(true_counts, tf.float32)
+    true_conf = tf.clip_by_value(true_counts, 0.0, 1.0)
+    grid_points, anchor_grid = self._anchor_generator(
+        width, height, batch_size, dtype=tf.float32)
+    # Split the y_true list.
+    (true_box, ind_mask, true_class, _, _) = tf.split(
+        y_true, [4, 1, 1, 1, 1], axis=-1)
+    grid_mask = true_conf = tf.squeeze(true_conf, axis=-1)
+    true_class = tf.squeeze(true_class, axis=-1)
+    # The box and class losses below are normalized by this object count.
+    num_objs = tf.cast(tf.reduce_sum(ind_mask), dtype=y_pred.dtype)
+    # Split up the predictions.
+    y_pred = tf.cast(
+        tf.reshape(y_pred, [batch_size, width, height, num, -1]), tf.float32)
+    pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)
+    # Decode the boxes for loss compute.
+    scale, pred_box, _ = self._decode_boxes(
+        fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=False)
+    # If the ignore threshold is enabled, search all boxes and ignore all
+    # IOU values larger than the ignore threshold that are not in the
+    # noted ground truth list.
+    if self._ignore_thresh != 0.0:
+      (_, obj_mask) = self._tiled_global_box_search(
+          pred_box,
+          tf.stop_gradient(tf.sigmoid(pred_class)),
+          boxes,
+          classes,
+          true_conf,
+          smoothed=False,
+          scale=scale)
+    # Scale and shift and select the ground truth boxes
+    # and predictions to the prediction domain.
+    offset = tf.cast(
+        tf.gather_nd(grid_points, inds, batch_dims=1), true_box.dtype)
+    offset = tf.concat([offset, tf.zeros_like(offset)], axis=-1)
+    true_box = loss_utils.apply_mask(ind_mask, (scale * true_box) - offset)
+    pred_box = loss_utils.apply_mask(ind_mask,
+                                     tf.gather_nd(pred_box, inds, batch_dims=1))
+    # Select the correct/used prediction classes.
+    true_class = tf.one_hot(
+        tf.cast(true_class, tf.int32),
+        depth=tf.shape(pred_class)[-1],
+        dtype=pred_class.dtype)
+    true_class = loss_utils.apply_mask(ind_mask, true_class)
+    pred_class = loss_utils.apply_mask(
+        ind_mask, tf.gather_nd(pred_class, inds, batch_dims=1))
+    # Compute the box loss. The second value returned by `box_loss` (the IOU
+    # variant used for the loss) is the one kept as `iou` here.
+    _, iou, box_loss = self.box_loss(true_box, pred_box, darknet=False)
+    box_loss = loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), box_loss)
+    box_loss = math_ops.divide_no_nan(tf.reduce_sum(box_loss), num_objs)
+    # Use the box IOU to build the map for confidence loss computation.
+    iou = tf.maximum(tf.stop_gradient(iou), 0.0)
+    smoothed_iou = ((
+        (1 - self._objectness_smooth) * tf.cast(ind_mask, iou.dtype)) +
+                    self._objectness_smooth * tf.expand_dims(iou, axis=-1))
+    smoothed_iou = loss_utils.apply_mask(ind_mask, smoothed_iou)
+    true_conf = loss_utils.build_grid(
+        inds, smoothed_iou, pred_conf, ind_mask, update=self._update_on_repeat)
+    true_conf = tf.squeeze(true_conf, axis=-1)
+    # Compute the cross entropy loss for the confidence map.
+    bce = tf.keras.losses.binary_crossentropy(
+        tf.expand_dims(true_conf, axis=-1), pred_conf, from_logits=True)
+    # `obj_mask` only exists when the search above ran, hence the same guard.
+    if self._ignore_thresh != 0.0:
+      bce = loss_utils.apply_mask(obj_mask, bce)
+    conf_loss = tf.reduce_mean(bce)
+    # Compute the cross entropy loss for the class maps.
+    class_loss = tf.keras.losses.binary_crossentropy(
+        true_class,
+        pred_class,
+        label_smoothing=self._label_smoothing,
+        from_logits=True)
+    class_loss = loss_utils.apply_mask(
+        tf.squeeze(ind_mask, axis=-1), class_loss)
+    class_loss = math_ops.divide_no_nan(tf.reduce_sum(class_loss), num_objs)
+    # Apply the weights to each loss.
+    box_loss *= self._iou_normalizer
+    class_loss *= self._cls_normalizer
+    conf_loss *= self._obj_normalizer
+    # Add all the losses together, then scale the mean loss by the batch size.
+    mean_loss = box_loss + class_loss + conf_loss
+    loss = mean_loss * tf.cast(batch_size, mean_loss.dtype)
+    return (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf,
+            ind_mask, grid_mask)
+  def post_path_aggregation(self, loss, ground_truths, predictions):
+    """Rescale the loss to the magnitude of a model with 3 FPN levels.
+    By default the model will have about 3 FPN levels {3, 4, 5}. On larger
+    models that have 4 or 5 FPN levels the loss needs to be scaled such that
+    the total update has the same effective magnitude as the model with 3 FPN
+    levels. This helps to prevent gradient explosions.
+    Args:
+      loss: `tf.float` scalar for the actual loss.
+      ground_truths: `Dict` holding all the ground truth tensors.
+      predictions: `Dict` holding all the predicted values.
+    Returns:
+      loss: `tf.float` scalar for the scaled loss.
+    """
+    level_names = list(predictions.keys())
+    ratio = 3 / len(level_names)
+    return loss * tf.stop_gradient(ratio)
+  def cross_replica_aggregation(self, loss, num_replicas_in_sync):
+    """In the scaled loss, take the sum of the loss across replicas.
+    Args:
+      loss: the loss value for one FPN level on this replica.
+      num_replicas_in_sync: unused; the loss is not divided by the number of
+        replicas.
+    Returns:
+      The unmodified `loss`.
+    """
+    return loss
+class YoloLoss:
+  """This class implements the aggregated loss across YOLO model FPN levels."""
+  def __init__(self,
+               keys,
+               classes,
+               anchors,
+               masks=None,
+               path_strides=None,
+               truth_thresholds=None,
+               ignore_thresholds=None,
+               loss_types=None,
+               iou_normalizers=None,
+               cls_normalizers=None,
+               obj_normalizers=None,
+               objectness_smooths=None,
+               box_types=None,
+               scale_xys=None,
+               max_deltas=None,
+               label_smoothing=0.0,
+               use_scaled_loss=False,
+               update_on_repeat=True):
+    """Builds one per-level loss object for every FPN path in `keys`.
+    Every per-path argument below is indexed with each entry of `keys`.
+    Args:
+      keys: `List[str]` indicating the name of the FPN paths that need to be
+        optimized.
+      classes: `int` for the number of classes.
+      anchors: `List[List[int]]` for the anchor boxes that are used in the model
+        at all levels. For anchor free prediction set the anchor list to be the
+        same as the image resolution.
+      masks: `Dict[List[int]]` holding the mask of each FPN path.
+      path_strides: `Dict[int]` for how much to scale this level to get the
+        original input shape for each FPN path.
+      truth_thresholds: `Dict[float]` for the IOU value over which the loss is
+        propagated despite a detection being made for each FPN path.
+      ignore_thresholds: `Dict[float]` for the IOU value over which the loss is
+        not propagated, and a detection is assumed to have been made for each
+        FPN path.
+      loss_types: `Dict[str]` for the type of iou loss to use within {ciou,
+        diou, giou, iou} for each FPN path.
+      iou_normalizers: `Dict[float]` for how much to scale the loss on the IOU
+        or the boxes for each FPN path.
+      cls_normalizers: `Dict[float]` for how much to scale the loss on the
+        classes for each FPN path.
+      obj_normalizers: `Dict[float]` for how much to scale loss on the detection
+        map for each FPN path.
+      objectness_smooths: `Dict[float]` for how much to smooth the loss on the
+        detection map for each FPN path.
+      box_types: `Dict[str]` for which scaling type to use for each FPN path.
+      scale_xys: `Dict[float]` values indicating how far each pixel can see
+        outside of its containment of 1.0. A value of 1.2 indicates there is a
+        20% extended radius around each pixel that this specific pixel can
+        predict values for a center at. The center can range from 0 - value/2
+        to 1 + value/2. This value is set in the yolo filter and reused here.
+        One value for each FPN path.
+      max_deltas: `Dict[float]` for gradient clipping to apply to the box loss
+        for each FPN path.
+      label_smoothing: `float` for how much to smooth the loss on the classes;
+        the same value is handed to every FPN path.
+      use_scaled_loss: `bool` for whether to use the scaled loss or the
+        traditional loss.
+      update_on_repeat: `bool` for whether to replace with the newest or the
+        best value when an index is consumed by multiple objects.
+    """
+    # Pick the per-level implementation once; every path shares the same type.
+    loss_cls = ScaledLoss if use_scaled_loss else DarknetLoss
+    self._loss_dict = {}
+    for key in keys:
+      level_kwargs = dict(
+          classes=classes,
+          anchors=anchors,
+          mask=masks[key],
+          truth_thresh=truth_thresholds[key],
+          ignore_thresh=ignore_thresholds[key],
+          loss_type=loss_types[key],
+          iou_normalizer=iou_normalizers[key],
+          cls_normalizer=cls_normalizers[key],
+          obj_normalizer=obj_normalizers[key],
+          box_type=box_types[key],
+          objectness_smooth=objectness_smooths[key],
+          max_delta=max_deltas[key],
+          path_stride=path_strides[key],
+          scale_x_y=scale_xys[key],
+          update_on_repeat=update_on_repeat,
+          label_smoothing=label_smoothing)
+      self._loss_dict[key] = loss_cls(**level_kwargs)
+  def __call__(self, ground_truth, predictions, use_reduced_logs=True):
+    """Sums the per-level losses and collects metrics for every FPN level.
+    Args:
+      ground_truth: `Dict` with the entries 'true_conf', 'inds' and 'upds'
+        (each indexed by FPN level key) plus 'bbox' and 'classes'.
+      predictions: `Dict` mapping each FPN level key to the model output.
+      use_reduced_logs: `bool`, when False the per-level box, class and
+        confidence losses are added to the metrics as well.
+    Returns:
+      loss_val: the total loss to optimize.
+      metric_loss: the sum of the per-level mean losses, gradient stopped.
+      metric_dict: `Dict` of metrics per FPN level plus the 'net' totals.
+    """
+    num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync
+    metric_dict = collections.defaultdict(dict)
+    metric_dict['net'] = {'box': 0, 'class': 0, 'conf': 0}
+    loss_val = 0
+    metric_loss = 0
+    for key, prediction in predictions.items():
+      level_loss_fn = self._loss_dict[key]
+      (loss, loss_box, loss_conf, loss_class, mean_loss, avg_iou,
+       avg_obj) = level_loss_fn(ground_truth['true_conf'][key],
+                                ground_truth['inds'][key],
+                                ground_truth['upds'][key],
+                                ground_truth['bbox'],
+                                ground_truth['classes'],
+                                prediction)
+      # Scale the loss as needed for aggregation across FPN levels first, and
+      # only then for merging the loss across replicas.
+      loss = level_loss_fn.post_path_aggregation(
+          loss, ground_truth, predictions)
+      loss = level_loss_fn.cross_replica_aggregation(
+          loss, num_replicas_in_sync)
+      loss_val += loss
+      # Everything below is a metric: detach it so that none of it contributes
+      # to the gradient from this point forwards.
+      mean_loss = tf.stop_gradient(mean_loss)
+      loss_box = tf.stop_gradient(loss_box)
+      loss_class = tf.stop_gradient(loss_class)
+      loss_conf = tf.stop_gradient(loss_conf)
+      metric_loss += mean_loss
+      level_metrics = metric_dict[key]
+      level_metrics['loss'] = mean_loss
+      level_metrics['avg_iou'] = tf.stop_gradient(avg_iou)
+      level_metrics['avg_obj'] = tf.stop_gradient(avg_obj)
+      if not use_reduced_logs:
+        level_metrics['conf_loss'] = loss_conf
+        level_metrics['box_loss'] = loss_box
+        level_metrics['class_loss'] = loss_class
+      net_metrics = metric_dict['net']
+      net_metrics['box'] += loss_box
+      net_metrics['class'] += loss_class
+      net_metrics['conf'] += loss_conf
+    return loss_val, metric_loss, metric_dict
--- a/official/vision/beta/projects/yolo/losses/yolo_loss_test.py
+++ b/official/vision/beta/projects/yolo/losses/yolo_loss_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for yolo loss."""
+from absl.testing import parameterized
+import tensorflow as tf
+from official.vision.beta.projects.yolo.losses import yolo_loss
+class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
+  """Smoke test for building and calling `yolo_loss.YoloLoss`."""
+  @parameterized.parameters(True, False)
+  def test_loss_init(self, scaled):
+    """Builds the loss for three FPN levels and runs it on all-ones inputs."""
+    def ones_by_key(shapes, dtype=tf.float32):
+      return {key: tf.ones(shape, dtype=dtype) for key, shape in shapes.items()}
+    tf.keras.backend.set_image_data_format('channels_last')
+    keys = ['3', '4', '5']
+    def per_level(value):
+      return {key: value for key in keys}
+    prediction_shapes = {
+        '3': [1, 52, 52, 255],
+        '4': [1, 26, 26, 255],
+        '5': [1, 13, 13, 255]
+    }
+    num_classes = 80
+    masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
+    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
+               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
+               [348.0, 340.0]]
+    loss = yolo_loss.YoloLoss(
+        keys,
+        num_classes,
+        anchors,
+        masks=masks,
+        path_strides={key: 2**int(key) for key in keys},
+        truth_thresholds=per_level(1.0),
+        ignore_thresholds=per_level(0.7),
+        loss_types=per_level('ciou'),
+        iou_normalizers=per_level(0.05),
+        cls_normalizers=per_level(0.5),
+        obj_normalizers=per_level(1.0),
+        objectness_smooths=per_level(1.0),
+        box_types=per_level('scaled'),
+        scale_xys=per_level(2.0),
+        max_deltas=per_level(30.0),
+        label_smoothing=0.0,
+        use_scaled_loss=scaled,
+        update_on_repeat=True)
+    ground_truth = {
+        'true_conf': ones_by_key({
+            '3': [1, 52, 52, 3, 1],
+            '4': [1, 26, 26, 3, 1],
+            '5': [1, 13, 13, 3, 1]
+        }),
+        'inds': ones_by_key(per_level([1, 300, 3]), tf.int32),
+        'upds': ones_by_key(per_level([1, 300, 8])),
+        'bbox': tf.ones([1, 300, 4], dtype=tf.float32),
+        'classes': tf.ones([1, 300], dtype=tf.float32)
+    }
+    # The call must succeed and return exactly three values.
+    _, _, _ = loss(ground_truth, ones_by_key(prediction_shapes))
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/projects/yolo/modeling/backbones/darknet.py
+++ b/official/vision/beta/projects/yolo/modeling/backbones/darknet.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Contains definitions of Darknet Backbone Networks.
   The models are inspired by ResNet and CSPNet.
@@ -390,7 +389,7 @@ class Darknet(tf.keras.Model):
      norm_momentum=0.99,
      norm_epsilon=0.001,
      dilate=False,
-      kernel_initializer='glorot_uniform',
+      kernel_initializer='VarianceScaling',
      kernel_regularizer=None,
      bias_regularizer=None,
      **kwargs):
@@ -507,10 +506,12 @@ class Darknet(tf.keras.Model):
    self._default_dict['name'] = f'{name}_csp_down'
    if self._dilate:
      self._default_dict['dilation_rate'] = config.dilation_rate
+      degrid = int(tf.math.log(float(config.dilation_rate)) / tf.math.log(2.))
    else:
      self._default_dict['dilation_rate'] = 1
+      degrid = 0
-    # swap/add dilation
+    # swap/add dilation
    x, x_route = nn_blocks.CSPRoute(
        filters=config.filters,
        filter_scale=csp_filter_scale,
@@ -518,7 +519,7 @@ class Darknet(tf.keras.Model):
        **self._default_dict)(
            inputs)
-    dilated_reps = config.repetitions - self._default_dict['dilation_rate'] // 2
+    dilated_reps = config.repetitions - degrid
    for i in range(dilated_reps):
      self._default_dict['name'] = f'{name}_{i}'
      x = nn_blocks.DarkResidual(
@@ -528,8 +529,8 @@ class Darknet(tf.keras.Model):
              x)
    for i in range(dilated_reps, config.repetitions):
-      self._default_dict[
+      self._default_dict['dilation_rate'] = max(
-          'dilation_rate'] = self._default_dict['dilation_rate'] // 2
+          1, self._default_dict['dilation_rate'] // 2)
      self._default_dict[
          'name'] = f"{name}_{i}_degridded_{self._default_dict['dilation_rate']}"
      x = nn_blocks.DarkResidual(
@@ -592,8 +593,8 @@ class Darknet(tf.keras.Model):
        filters=config.filters, downsample=True, **self._default_dict)(
            inputs)
-    dilated_reps = config.repetitions - (
+    dilated_reps = config.repetitions - self._default_dict[
-        self._default_dict['dilation_rate'] // 2) - 1
+        'dilation_rate'] // 2 - 1
    for i in range(dilated_reps):
      self._default_dict['name'] = f'{name}_{i}'
      x = nn_blocks.DarkResidual(
@@ -661,12 +662,13 @@ class Darknet(tf.keras.Model):
 @factory.register_backbone_builder('darknet')
 def build_darknet(
    input_specs: tf.keras.layers.InputSpec,
-    backbone_config: hyperparams.Config,
+    backbone_cfg: hyperparams.Config,
    norm_activation_config: hyperparams.Config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
  """Builds darknet."""
-  backbone_cfg = backbone_config.get()
+  backbone_cfg = backbone_cfg.get()
  model = Darknet(
      model_id=backbone_cfg.model_id,
      min_level=backbone_cfg.min_level,

--- a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py
+++ b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Feature Pyramid Network and Path Aggregation variants used in YOLO."""
 import tensorflow as tf
@@ -39,7 +38,7 @@ class YoloFPN(tf.keras.layers.Layer):
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
@@ -184,7 +183,7 @@ class YoloPAN(tf.keras.layers.Layer):
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               fpn_input=True,
@@ -206,7 +205,7 @@ class YoloPAN(tf.keras.layers.Layer):
        by zero.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      fpn_input: `bool`, for whether the input into this fucntion is an FPN or
        a backbone.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
@@ -374,7 +373,7 @@ class YoloDecoder(tf.keras.Model):
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
@@ -389,8 +388,8 @@ class YoloDecoder(tf.keras.Model):
      use_fpn: `bool`, use the FPN found in the YoloV4 model.
      use_spatial_attention: `bool`, use the spatial attention module.
      csp_stack: `bool`, CSPize the FPN.
-      fpn_depth: `int`, number of layers ot use in each FPN path
+      fpn_depth: `int`, number of layers to use in each FPN path if you choose
-        if you choose to use an FPN.
+        to use an FPN.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
      path_process_len: `int`, number of layers ot use in each Decoder path.
      max_level_process_len: `int`, number of layers ot use in the largest

--- a/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py
+++ b/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Yolo heads."""
 import tensorflow as tf
@@ -30,10 +29,11 @@ class YoloHead(tf.keras.layers.Layer):
               output_extras=0,
               norm_momentum=0.99,
               norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation=None,
+               smart_bias=False,
               **kwargs):
    """Yolo Prediction Head initialization function.
@@ -52,6 +52,7 @@ class YoloHead(tf.keras.layers.Layer):
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
      activation: `str`, the activation function to use typically leaky or mish.
+      smart_bias: `bool`, whether or not to use smart bias.
      **kwargs: keyword arguments to be passed.
    """
@@ -68,6 +69,7 @@ class YoloHead(tf.keras.layers.Layer):
    self._output_extras = output_extras
    self._output_conv = (classes + output_extras + 5) * boxes_per_level
+    self._smart_bias = smart_bias
    self._base_config = dict(
        activation=activation,
@@ -85,10 +87,29 @@ class YoloHead(tf.keras.layers.Layer):
        use_bn=False,
        **self._base_config)
+  def bias_init(self, scale, inshape, isize=640, no_per_conf=8):
+    """Builds the bias initializer used for one output convolution.
+
+    Args:
+      scale: divisor applied to `isize` in the confidence bias offset; `build`
+        passes `2**int(key)` for each level key.
+      inshape: not read by this method.
+      isize: reference size used in the confidence bias offset.
+      no_per_conf: numerator of the confidence bias offset.
+
+    Returns:
+      A callable `bias(shape, dtype)` that produces the initial bias values.
+    """
+    def bias(shape, dtype):
+      # start from an all-zero bias; returned unchanged when smart bias is off
+      init = tf.keras.initializers.Zeros()
+      base = init(shape, dtype=dtype)
+      if self._smart_bias:
+        # view the bias as one row per box, then split each row into 4 box
+        # terms, 1 confidence term and the remaining terms
+        base = tf.reshape(base, [self._boxes_per_level, -1])
+        box, conf, classes = tf.split(base, [4, 1, -1], axis=-1)
+        # shift the confidence term by log(no_per_conf / (isize / scale)^2)
+        conf += tf.math.log(no_per_conf / ((isize / scale)**2))
+        # shift the remaining terms by log(0.6 / (self._classes - 0.99))
+        classes += tf.math.log(0.6 / (self._classes - 0.99))
+        # reassemble the rows and flatten to a 1-D tensor
+        base = tf.concat([box, conf, classes], axis=-1)
+        base = tf.reshape(base, [-1])
+      return base
+    return bias
  def build(self, input_shape):
    self._head = dict()
    for key in self._key_list:
-      self._head[key] = nn_blocks.ConvBN(**self._conv_config)
+      scale = 2**int(key)
+      self._head[key] = nn_blocks.ConvBN(
+          bias_initializer=self.bias_init(scale, input_shape[key][-1]),
+          **self._conv_config)
  def call(self, inputs):
    outputs = dict()
@@ -107,6 +128,10 @@ class YoloHead(tf.keras.layers.Layer):
          'Model has to be built before number of boxes can be determined.')
    return (self._max_level - self._min_level + 1) * self._boxes_per_level
+  @property
+  def num_heads(self):
+    """Number of levels from min_level to max_level, inclusive."""
+    return self._max_level - self._min_level + 1
  def get_config(self):
    config = dict(
        min_level=self._min_level,

--- a/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py
+++ b/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py
@@ -15,7 +15,10 @@
 """Contains common building blocks for yolo layer (detection layer)."""
 import tensorflow as tf
+from official.vision.beta.modeling.layers import detection_generator
+from official.vision.beta.projects.yolo.losses import yolo_loss
 from official.vision.beta.projects.yolo.ops import box_ops
+from official.vision.beta.projects.yolo.ops import loss_utils
 @tf.keras.utils.register_keras_serializable(package='yolo')
@@ -36,11 +39,11 @@ class YoloLayer(tf.keras.Model):
               cls_normalizer=1.0,
               obj_normalizer=1.0,
               use_scaled_loss=False,
-               darknet=None,
+               update_on_repeat=False,
               pre_nms_points=5000,
               label_smoothing=0.0,
               max_boxes=200,
-               new_cords=False,
+               box_type='original',
               path_scale=None,
               scale_xy=None,
               nms_type='greedy',
@@ -70,14 +73,25 @@ class YoloLayer(tf.keras.Model):
      obj_normalizer: `float` for how much to scale loss on the detection map.
      use_scaled_loss: `bool` for whether to use the scaled loss
        or the traditional loss.
-      darknet: `bool` for whether to use the DarkNet or PyTorch loss function
+      update_on_repeat: `bool` indicating how you would like to handle repeated
-        implementation.
+        indexes in a given [j, i] index. Setting this to True will give more
+        consistent MAP, setting it to False will improve recall by 1-2% but will
+        sacrifice some MAP.
      pre_nms_points: `int` number of top candidate detections per class before
        NMS.
      label_smoothing: `float` for how much to smooth the loss on the classes.
      max_boxes: `int` for the maximum number of boxes retained over all
        classes.
-      new_cords: `bool` for using the ScaledYOLOv4 coordinates.
+      box_type: `dict` of `str` keyed by path key. There are 3 different box
+        types that will affect training differently {original, scaled and
+        anchor_free}. The original method
+        decodes the boxes by applying an exponential to the model width and
+        height maps, then scaling the maps by the anchor boxes. This method is
+        used in Yolo-v4, Yolo-v3, and all its counterparts. The Scale method
+        squares the width and height and scales both by a fixed factor of 4.
+        This method is used in the Scale Yolo models, as well as Yolov4-CSP.
+        Finally, anchor_free is like the original method but will not apply an
+        activation function to the boxes, this is used for some of the newer
+        anchor free versions of YOLO.
      path_scale: `dict` for the size of the input tensors. Defaults to
        precalulated values from the `mask`.
      scale_xy: dictionary `float` values inidcating how far each pixel can see
@@ -91,18 +105,6 @@ class YoloLayer(tf.keras.Model):
      objectness_smooth: `float` for how much to smooth the loss on the
        detection map.
      **kwargs: Addtional keyword arguments.
-    Return:
-      loss: `float` for the actual loss.
-      box_loss: `float` loss on the boxes used for metrics.
-      conf_loss: `float` loss on the confidence used for metrics.
-      class_loss: `float` loss on the classes used for metrics.
-      avg_iou: `float` metric for the average iou between predictions
-        and ground truth.
-      avg_obj: `float` metric for the average confidence of the model
-        for predictions.
-      recall50: `float` metric for how accurate the model is.
-      precision50: `float` metric for how precise the model is.
    """
    super().__init__(**kwargs)
    self._masks = masks
@@ -121,29 +123,18 @@ class YoloLayer(tf.keras.Model):
    self._loss_type = loss_type
    self._use_scaled_loss = use_scaled_loss
-    self._darknet = darknet
+    self._update_on_repeat = update_on_repeat
    self._pre_nms_points = pre_nms_points
    self._label_smoothing = label_smoothing
    self._keys = list(masks.keys())
    self._len_keys = len(self._keys)
-    self._new_cords = new_cords
+    self._box_type = box_type
    self._path_scale = path_scale or {
        key: 2**int(key) for key, _ in masks.items()
    }
-    self._nms_types = {
+    self._nms_type = nms_type
-        'greedy': 1,
-        'iou': 2,
-        'giou': 3,
-        'ciou': 4,
-        'diou': 5,
-        'class_independent': 6,
-        'weighted_diou': 7
-    }
-    self._nms_type = self._nms_types[nms_type]
    self._scale_xy = scale_xy or {key: 1.0 for key, _ in masks.items()}
    self._generator = {}
@@ -156,27 +147,33 @@ class YoloLayer(tf.keras.Model):
    return
  def get_generators(self, anchors, path_scale, path_key):
-    return None
+    anchor_generator = loss_utils.GridGenerator(
+        anchors, scale_anchors=path_scale)
-  def rm_nan_inf(self, x, val=0.0):
+    return anchor_generator
-    x = tf.where(tf.math.is_nan(x), tf.cast(val, dtype=x.dtype), x)
-    x = tf.where(tf.math.is_inf(x), tf.cast(val, dtype=x.dtype), x)
-    return x
  def parse_prediction_path(self, key, inputs):
+    shape_ = tf.shape(inputs)
    shape = inputs.get_shape().as_list()
-    height, width = shape[1], shape[2]
+    batchsize, height, width = shape_[0], shape[1], shape[2]
+    if height is None or width is None:
+      height, width = shape_[1], shape_[2]
+    generator = self._generator[key]
    len_mask = self._len_mask[key]
+    scale_xy = self._scale_xy[key]
    # reshape the yolo output to (batchsize,
    #                             width,
    #                             height,
    #                             number_anchors,
    #                             remaining_points)
    data = tf.reshape(inputs, [-1, height, width, len_mask, self._classes + 5])
+    # use the grid generator to get the formatted anchor boxes and grid points
+    # in shape [1, height, width, 2]
+    centers, anchors = generator(height, width, batchsize, dtype=data.dtype)
    # split the yolo detections into boxes, object score map, classes
    boxes, obns_scores, class_scores = tf.split(
        data, [4, 1, self._classes], axis=-1)
@@ -184,25 +181,32 @@ class YoloLayer(tf.keras.Model):
    # determine the number of classes
    classes = class_scores.get_shape().as_list()[-1]
+    # configurable to use the new coordinates in scaled Yolo v4 or not
+    _, _, boxes = loss_utils.get_predicted_box(
+        tf.cast(height, data.dtype),
+        tf.cast(width, data.dtype),
+        boxes,
+        anchors,
+        centers,
+        scale_xy,
+        stride=self._path_scale[key],
+        darknet=False,
+        box_type=self._box_type[key])
    # convert boxes from yolo(x, y, w. h) to tensorflow(ymin, xmin, ymax, xmax)
    boxes = box_ops.xcycwh_to_yxyx(boxes)
    # activate and detection map
    obns_scores = tf.math.sigmoid(obns_scores)
-    # threshold the detection map
-    obns_mask = tf.cast(obns_scores > self._thresh, obns_scores.dtype)
    # convert detection map to class detection probabailities
-    class_scores = tf.math.sigmoid(class_scores) * obns_mask * obns_scores
+    class_scores = tf.math.sigmoid(class_scores) * obns_scores
-    class_scores *= tf.cast(class_scores > self._thresh, class_scores.dtype)
-    fill = height * width * len_mask
    # platten predictions to [batchsize, N, -1] for non max supression
+    fill = height * width * len_mask
    boxes = tf.reshape(boxes, [-1, fill, 4])
    class_scores = tf.reshape(class_scores, [-1, fill, classes])
    obns_scores = tf.reshape(obns_scores, [-1, fill])
    return obns_scores, boxes, class_scores
  def call(self, inputs):
@@ -224,26 +228,49 @@ class YoloLayer(tf.keras.Model):
    # colate all predicitons
    boxes = tf.concat(boxes, axis=1)
-    object_scores = tf.keras.backend.concatenate(object_scores, axis=1)
+    object_scores = tf.concat(object_scores, axis=1)
-    class_scores = tf.keras.backend.concatenate(class_scores, axis=1)
+    class_scores = tf.concat(class_scores, axis=1)
-    # greedy NMS
+    # get masks to threshold all the predictions
-    boxes = tf.cast(boxes, dtype=tf.float32)
+    object_mask = tf.cast(object_scores > self._thresh, object_scores.dtype)
-    class_scores = tf.cast(class_scores, dtype=tf.float32)
+    class_mask = tf.cast(class_scores > self._thresh, class_scores.dtype)
-    nms_items = tf.image.combined_non_max_suppression(
-        tf.expand_dims(boxes, axis=-2),
+    # apply threshold masks to all the predictions
-        class_scores,
+    object_scores *= object_mask
-        self._pre_nms_points,
+    class_scores *= (tf.expand_dims(object_mask, axis=-1) * class_mask)
-        self._max_boxes,
-        iou_threshold=self._nms_thresh,
+    # apply nms
-        score_threshold=self._thresh)
+    if self._nms_type == 'greedy':
-    # cast the boxes and predicitons abck to original datatype
+      # greedy NMS
-    boxes = tf.cast(nms_items.nmsed_boxes, object_scores.dtype)
+      boxes = tf.cast(boxes, dtype=tf.float32)
-    class_scores = tf.cast(nms_items.nmsed_classes, object_scores.dtype)
+      class_scores = tf.cast(class_scores, dtype=tf.float32)
-    object_scores = tf.cast(nms_items.nmsed_scores, object_scores.dtype)
+      boxes, object_scores_, class_scores, num_detections = (
+          tf.image.combined_non_max_suppression(
-    # compute the number of valid detections
+              tf.expand_dims(boxes, axis=-2),
-    num_detections = tf.math.reduce_sum(tf.math.ceil(object_scores), axis=-1)
+              class_scores,
+              self._pre_nms_points,
+              self._max_boxes,
+              iou_threshold=self._nms_thresh,
+              score_threshold=self._thresh))
+      # cast the boxes and predictions back to the original datatype
+      boxes = tf.cast(boxes, object_scores.dtype)
+      class_scores = tf.cast(class_scores, object_scores.dtype)
+      object_scores = tf.cast(object_scores_, object_scores.dtype)
+    else:
+      # TPU NMS
+      boxes = tf.cast(boxes, dtype=tf.float32)
+      class_scores = tf.cast(class_scores, dtype=tf.float32)
+      (boxes, confidence, classes,
+       num_detections) = detection_generator._generate_detections_v2(  # pylint:disable=protected-access
+           tf.expand_dims(boxes, axis=-2),
+           class_scores,
+           pre_nms_top_k=self._pre_nms_points,
+           max_num_detections=self._max_boxes,
+           nms_iou_threshold=self._nms_thresh,
+           pre_nms_score_threshold=self._thresh)
+      boxes = tf.cast(boxes, object_scores.dtype)
+      class_scores = tf.cast(classes, object_scores.dtype)
+      object_scores = tf.cast(confidence, object_scores.dtype)
    # format and return
    return {
@@ -258,9 +285,28 @@ class YoloLayer(tf.keras.Model):
    """Generates a dictionary of losses to apply to each path.
    Done in the detection generator because all parameters are the same
-    across both loss and detection generator.
+    across both loss and detection generator.
    """
-    return None
+    loss = yolo_loss.YoloLoss(
+        keys=self._keys,
+        classes=self._classes,
+        anchors=self._anchors,
+        masks=self._masks,
+        path_strides=self._path_scale,
+        truth_thresholds=self._truth_thresh,
+        ignore_thresholds=self._ignore_thresh,
+        loss_types=self._loss_type,
+        iou_normalizers=self._iou_normalizer,
+        cls_normalizers=self._cls_normalizer,
+        obj_normalizers=self._obj_normalizer,
+        objectness_smooths=self._objectness_smooth,
+        box_types=self._box_type,
+        max_deltas=self._max_delta,
+        scale_xys=self._scale_xy,
+        use_scaled_loss=self._use_scaled_loss,
+        update_on_repeat=self._update_on_repeat,
+        label_smoothing=self._label_smoothing)
+    return loss
  def get_config(self):
    return {

--- a/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py
+++ b/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py
@@ -39,7 +39,10 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
               [348.0, 340.0]]
-    layer = dg.YoloLayer(masks, anchors, classes, max_boxes=10)
+    box_type = {key: 'scaled' for key in masks.keys()}
+    layer = dg.YoloLayer(
+        masks, anchors, classes, box_type=box_type, max_boxes=10)
    inputs = {}
    for key in input_shape:

--- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py
+++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Contains common building blocks for yolo neural networks."""
-from typing import Callable, List
 import tensorflow as tf
 from official.modeling import tf_utils
 from official.vision.beta.ops import spatial_transform_ops
@@ -48,7 +46,7 @@ class ConvBN(tf.keras.layers.Layer):
               strides=(1, 1),
               padding='same',
               dilation_rate=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -97,7 +95,14 @@ class ConvBN(tf.keras.layers.Layer):
    self._strides = strides
    self._padding = padding
    self._dilation_rate = dilation_rate
-    self._kernel_initializer = kernel_initializer
+    if kernel_initializer == 'VarianceScaling':
+      # to match pytorch initialization method
+      self._kernel_initializer = tf.keras.initializers.VarianceScaling(
+          scale=1 / 3, mode='fan_in', distribution='uniform')
+    else:
+      self._kernel_initializer = kernel_initializer
    self._bias_initializer = bias_initializer
    self._kernel_regularizer = kernel_regularizer
@@ -194,7 +199,7 @@ class DarkResidual(tf.keras.layers.Layer):
               filters=1,
               filter_scale=2,
               dilation_rate=1,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               kernel_regularizer=None,
               bias_regularizer=None,
@@ -366,7 +371,7 @@ class CSPTiny(tf.keras.layers.Layer):
  def __init__(self,
               filters=1,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -532,7 +537,7 @@ class CSPRoute(tf.keras.layers.Layer):
               filters,
               filter_scale=2,
               activation='mish',
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -661,7 +666,7 @@ class CSPConnect(tf.keras.layers.Layer):
               drop_first=False,
               activation='mish',
               kernel_size=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -761,122 +766,6 @@ class CSPConnect(tf.keras.layers.Layer):
    return x
-class CSPStack(tf.keras.layers.Layer):
-  """CSP Stack layer.
-  CSP full stack, combines the route and the connect in case you dont want to
-  jsut quickly wrap an existing callable or list of layers to
-  make it a cross stage partial. Added for ease of use. you should be able
-  to wrap any layer stack with a CSP independent of wether it belongs
-  to the Darknet family. if filter_scale = 2, then the blocks in the stack
-  passed into the the CSP stack should also have filters = filters/filter_scale
-  Cross Stage Partial networks (CSPNets) were proposed in:
-  [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu,
-        Ping-Yang Chen, Jun-Wei Hsieh
-      CSPNet: A New Backbone that can Enhance Learning Capability of CNN.
-        arXiv:1911.11929
-  """
-  def __init__(self,
-               filters,
-               model_to_wrap=None,
-               filter_scale=2,
-               activation='mish',
-               kernel_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               bias_regularizer=None,
-               kernel_regularizer=None,
-               downsample=True,
-               use_bn=True,
-               use_sync_bn=False,
-               norm_momentum=0.99,
-               norm_epsilon=0.001,
-               **kwargs):
-    """CSPStack layer initializer.
-    Args:
-      filters: integer for output depth, or the number of features to learn.
-      model_to_wrap: callable Model or a list of callable objects that will
-        process the output of CSPRoute, and be input into CSPConnect.
-        list will be called sequentially.
-      filter_scale: integer dictating (filters//2) or the number of filters in
-        the partial feature stack.
-      activation: string for activation function to use in layer.
-      kernel_initializer: string to indicate which function to use to initialize
-        weights.
-      bias_initializer: string to indicate which function to use to initialize
-        bias.
-      bias_regularizer: string to indicate which function to use to regularizer
-        bias.
-      kernel_regularizer: string to indicate which function to use to
-        regularizer weights.
-      downsample: down_sample the input.
-      use_bn: boolean for whether to use batch normalization.
-      use_sync_bn: boolean for whether sync batch normalization statistics
-        of all batch norm layers to the models global statistics
-        (across all input batches).
-      norm_momentum: float for moment to use for batch normalization.
-      norm_epsilon: float for batch normalization epsilon.
-      **kwargs: Keyword Arguments.
-    Raises:
-      TypeError: model_to_wrap is not a layer or a list of layers
-    """
-    super().__init__(**kwargs)
-    # layer params
-    self._filters = filters
-    self._filter_scale = filter_scale
-    self._activation = activation
-    self._downsample = downsample
-    # convoultion params
-    self._kernel_initializer = kernel_initializer
-    self._bias_initializer = bias_initializer
-    self._kernel_regularizer = kernel_regularizer
-    self._bias_regularizer = bias_regularizer
-    self._use_bn = use_bn
-    self._use_sync_bn = use_sync_bn
-    self._norm_momentum = norm_momentum
-    self._norm_epsilon = norm_epsilon
-    if model_to_wrap is None:
-      self._model_to_wrap = []
-    elif isinstance(model_to_wrap, Callable):
-      self._model_to_wrap = [model_to_wrap]
-    elif isinstance(model_to_wrap, List):
-      self._model_to_wrap = model_to_wrap
-    else:
-      raise TypeError(
-          'the input to the CSPStack must be a list of layers that we can' +
-          'iterate through, or \n a callable')
-  def build(self, input_shape):
-    dark_conv_args = {
-        'filters': self._filters,
-        'filter_scale': self._filter_scale,
-        'activation': self._activation,
-        'kernel_initializer': self._kernel_initializer,
-        'bias_initializer': self._bias_initializer,
-        'bias_regularizer': self._bias_regularizer,
-        'use_bn': self._use_bn,
-        'use_sync_bn': self._use_sync_bn,
-        'norm_momentum': self._norm_momentum,
-        'norm_epsilon': self._norm_epsilon,
-        'kernel_regularizer': self._kernel_regularizer,
-    }
-    self._route = CSPRoute(downsample=self._downsample, **dark_conv_args)
-    self._connect = CSPConnect(**dark_conv_args)
-  def call(self, inputs, training=None):
-    x, x_route = self._route(inputs)
-    for layer in self._model_to_wrap:
-      x = layer(x)
-    x = self._connect([x, x_route])
-    return x
 @tf.keras.utils.register_keras_serializable(package='yolo')
 class PathAggregationBlock(tf.keras.layers.Layer):
  """Path Aggregation block."""
@@ -884,7 +773,7 @@ class PathAggregationBlock(tf.keras.layers.Layer):
  def __init__(self,
               filters=1,
               drop_final=True,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -1120,7 +1009,7 @@ class SAM(tf.keras.layers.Layer):
               strides=(1, 1),
               padding='same',
               dilation_rate=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -1192,7 +1081,7 @@ class CAM(tf.keras.layers.Layer):
  def __init__(self,
               reduction_ratio=1.0,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -1285,7 +1174,7 @@ class CBAM(tf.keras.layers.Layer):
               strides=(1, 1),
               padding='same',
               dilation_rate=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
               bias_regularizer=None,
               kernel_regularizer=None,
@@ -1354,27 +1243,26 @@ class DarkRouteProcess(tf.keras.layers.Layer):
                          insert_spp = False)(x)
  """
-  def __init__(
+  def __init__(self,
-      self,
+               filters=2,
-      filters=2,
+               repetitions=2,
-      repetitions=2,
+               insert_spp=False,
-      insert_spp=False,
+               insert_sam=False,
-      insert_sam=False,
+               insert_cbam=False,
-      insert_cbam=False,
+               csp_stack=0,
-      csp_stack=0,
+               csp_scale=2,
-      csp_scale=2,
+               kernel_initializer='VarianceScaling',
-      kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
-      bias_initializer='zeros',
+               bias_regularizer=None,
-      bias_regularizer=None,
+               kernel_regularizer=None,
-      kernel_regularizer=None,
+               use_sync_bn=False,
-      use_sync_bn=False,
+               norm_momentum=0.99,
-      norm_momentum=0.99,
+               norm_epsilon=0.001,
-      norm_epsilon=0.001,
+               block_invert=False,
-      block_invert=False,
+               activation='leaky',
-      activation='leaky',
+               leaky_alpha=0.1,
-      leaky_alpha=0.1,
+               spp_keys=None,
-      spp_keys=None,
+               **kwargs):
-      **kwargs):
    """DarkRouteProcess initializer.
    Args:

--- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py
+++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py
@@ -106,86 +106,6 @@ class CSPRouteTest(tf.test.TestCase, parameterized.TestCase):
    self.assertNotIn(None, grad)
-class CSPStackTest(tf.test.TestCase, parameterized.TestCase):
-  def build_layer(self, layer_type, filters, filter_scale, count, stack_type,
-                  downsample):
-    if stack_type is not None:
-      layers = []
-      if layer_type == 'residual':
-        for _ in range(count):
-          layers.append(
-              nn_blocks.DarkResidual(
-                  filters=filters // filter_scale, filter_scale=filter_scale))
-      else:
-        for _ in range(count):
-          layers.append(nn_blocks.ConvBN(filters=filters))
-      if stack_type == 'model':
-        layers = tf.keras.Sequential(layers=layers)
-    else:
-      layers = None
-    stack = nn_blocks.CSPStack(
-        filters=filters,
-        filter_scale=filter_scale,
-        downsample=downsample,
-        model_to_wrap=layers)
-    return stack
-  @parameterized.named_parameters(
-      ('no_stack', 224, 224, 64, 2, 'residual', None, 0, True),
-      ('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True),
-      ('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False),
-      ('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False))
-  def test_pass_through(self, width, height, filters, mod, layer_type,
-                        stack_type, count, downsample):
-    x = tf.keras.Input(shape=(width, height, filters))
-    test_layer = self.build_layer(layer_type, filters, mod, count, stack_type,
-                                  downsample)
-    outx = test_layer(x)
-    print(outx)
-    print(outx.shape.as_list())
-    if downsample:
-      self.assertAllEqual(outx.shape.as_list(),
-                          [None, width // 2, height // 2, filters])
-    else:
-      self.assertAllEqual(outx.shape.as_list(), [None, width, height, filters])
-  @parameterized.named_parameters(
-      ('no_stack', 224, 224, 64, 2, 'residual', None, 0, True),
-      ('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True),
-      ('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False),
-      ('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False))
-  def test_gradient_pass_though(self, width, height, filters, mod, layer_type,
-                                stack_type, count, downsample):
-    loss = tf.keras.losses.MeanSquaredError()
-    optimizer = tf.keras.optimizers.SGD()
-    init = tf.random_normal_initializer()
-    x = tf.Variable(
-        initial_value=init(shape=(1, width, height, filters), dtype=tf.float32))
-    if not downsample:
-      y = tf.Variable(
-          initial_value=init(
-              shape=(1, width, height, filters), dtype=tf.float32))
-    else:
-      y = tf.Variable(
-          initial_value=init(
-              shape=(1, width // 2, height // 2, filters), dtype=tf.float32))
-    test_layer = self.build_layer(layer_type, filters, mod, count, stack_type,
-                                  downsample)
-    with tf.GradientTape() as tape:
-      x_hat = test_layer(x)
-      grad_loss = loss(x_hat, y)
-    grad = tape.gradient(grad_loss, test_layer.trainable_variables)
-    optimizer.apply_gradients(zip(grad, test_layer.trainable_variables))
-    self.assertNotIn(None, grad)
 class ConvBNTest(tf.test.TestCase, parameterized.TestCase):
  @parameterized.named_parameters(

--- a/official/vision/beta/projects/yolo/modeling/yolo_model.py
+++ b/official/vision/beta/projects/yolo/modeling/yolo_model.py
@@ -17,7 +17,7 @@
 import tensorflow as tf
-# Static base Yolo Models that do not require configuration
+# static base Yolo Models that do not require configuration
 # similar to a backbone model id.
 # this is done greatly simplify the model config
@@ -85,26 +85,27 @@ class Yolo(tf.keras.Model):
    """Detection initialization function.
    Args:
-      backbone: `tf.keras.Model`, a backbone network.
+      backbone: `tf.keras.Model` a backbone network.
-      decoder: `tf.keras.Model`, a decoder network.
+      decoder: `tf.keras.Model` a decoder network.
-      head: `YoloHead`, the YOLO head.
+      head: `RetinaNetHead`, the RetinaNet head.
-      detection_generator: `tf.keras.Model`, the detection generator.
+      detection_generator: the detection generator.
      **kwargs: keyword arguments to be passed.
    """
-    super().__init__(**kwargs)
+    super(Yolo, self).__init__(**kwargs)
    self._config_dict = {
        "backbone": backbone,
        "decoder": decoder,
        "head": head,
-        "detection_generator": detection_generator
+        "filter": detection_generator
    }
    # model components
    self._backbone = backbone
    self._decoder = decoder
    self._head = head
-    self._detection_generator = detection_generator
+    self._filter = detection_generator
+    return
  def call(self, inputs, training=False):
    maps = self._backbone(inputs)
@@ -114,7 +115,7 @@ class Yolo(tf.keras.Model):
      return {"raw_output": raw_predictions}
    else:
      # Post-processing.
-      predictions = self._detection_generator(raw_predictions)
+      predictions = self._filter(raw_predictions)
      predictions.update({"raw_output": raw_predictions})
      return predictions
@@ -131,8 +132,8 @@ class Yolo(tf.keras.Model):
    return self._head
  @property
-  def detection_generator(self):
+  def filter(self):
-    return self._detection_generator
+    return self._filter
  def get_config(self):
    return self._config_dict
@@ -140,3 +141,29 @@ class Yolo(tf.keras.Model):
  @classmethod
  def from_config(cls, config):
    return cls(**config)
+  def get_weight_groups(self, train_vars):
+    """Split trainable variables into weight, bias and other groups by name.
+    Args:
+      train_vars: a list of tf.Variables to sort; each variable is assigned to
+        a group by substring matching on `var.name`.
+    Returns:
+      weights: variables whose name contains "kernel" or "weight".
+      bias: variables whose name contains "bias" or "beta".
+      other: all remaining variables.
+    """
+    bias = []
+    weights = []
+    other = []
+    for var in train_vars:
+      if "bias" in var.name:
+        bias.append(var)
+      elif "beta" in var.name:
+        bias.append(var)
+      elif "kernel" in var.name or "weight" in var.name:
+        weights.append(var)
+      else:
+        other.append(var)
+    return weights, bias, other
--- a/official/vision/beta/projects/yolo/ops/box_ops.py
+++ b/official/vision/beta/projects/yolo/ops/box_ops.py
@@ -38,51 +38,26 @@ def yxyx_to_xcycwh(box: tf.Tensor):
  return box
-@tf.custom_gradient
+def xcycwh_to_yxyx(box: tf.Tensor):
-def _xcycwh_to_yxyx(box: tf.Tensor, scale):
-  """Private function to allow custom gradients with defaults."""
-  with tf.name_scope('xcycwh_to_yxyx'):
-    xy, wh = tf.split(box, 2, axis=-1)
-    xy_min = xy - wh / 2
-    xy_max = xy + wh / 2
-    x_min, y_min = tf.split(xy_min, 2, axis=-1)
-    x_max, y_max = tf.split(xy_max, 2, axis=-1)
-    box = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
-    def delta(dbox):
-      # y_min = top, x_min = left, y_max = bottom, x_max = right
-      dt, dl, db, dr = tf.split(dbox, 4, axis=-1)
-      dx = dl + dr
-      dy = dt + db
-      dw = (dr - dl) / scale
-      dh = (db - dt) / scale
-      dbox = tf.concat([dx, dy, dw, dh], axis=-1)
-      return dbox, 0.0
-  return box, delta
-def xcycwh_to_yxyx(box: tf.Tensor, darknet=False):
  """Converts boxes from x_center, y_center, width, height to yxyx format.
  Args:
    box: any `Tensor` whose last dimension is 4 representing the coordinates of
      boxes in x_center, y_center, width, height.
-    darknet: `bool`, if True a scale of 1.0 is used.
  Returns:
    box: a `Tensor` whose shape is the same as `box` in new format.
  """
-  if darknet:
+  with tf.name_scope('xcycwh_to_yxyx'):
-    scale = 1.0
+    xy, wh = tf.split(box, 2, axis=-1)
-  else:
+    xy_min = xy - wh / 2
-    scale = 2.0
+    xy_max = xy + wh / 2
-  box = _xcycwh_to_yxyx(box, scale)
+    x_min, y_min = tf.split(xy_min, 2, axis=-1)
+    x_max, y_max = tf.split(xy_max, 2, axis=-1)
+    box = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
  return box
-# IOU
 def intersect_and_union(box1, box2, yxyx=False):
  """Calculates the intersection and union between box1 and box2.
@@ -98,8 +73,9 @@ def intersect_and_union(box1, box2, yxyx=False):
    intersection: a `Tensor` who represents the intersection.
    union: a `Tensor` who represents the union.
  """
  if not yxyx:
+    box1_area = tf.reduce_prod(tf.split(box1, 2, axis=-1)[-1], axis=-1)
+    box2_area = tf.reduce_prod(tf.split(box2, 2, axis=-1)[-1], axis=-1)
    box1 = xcycwh_to_yxyx(box1)
    box2 = xcycwh_to_yxyx(box2)
@@ -110,13 +86,14 @@ def intersect_and_union(box1, box2, yxyx=False):
  intersect_wh = tf.math.maximum(intersect_maxes - intersect_mins, 0.0)
  intersection = tf.reduce_prod(intersect_wh, axis=-1)
-  box1_area = tf.reduce_prod(b1ma - b1mi, axis=-1)
+  if yxyx:
-  box2_area = tf.reduce_prod(b2ma - b2mi, axis=-1)
+    box1_area = tf.reduce_prod(b1ma - b1mi, axis=-1)
+    box2_area = tf.reduce_prod(b2ma - b2mi, axis=-1)
  union = box1_area + box2_area - intersection
  return intersection, union
-def smallest_encompassing_box(box1, box2, yxyx=False):
+def smallest_encompassing_box(box1, box2, yxyx=False, clip=False):
  """Calculates the smallest box that encompasses box1 and box2.
  Args:
@@ -126,6 +103,7 @@ def smallest_encompassing_box(box1, box2, yxyx=False):
      boxes.
    yxyx: a `bool` indicating whether the input box is of the format x_center
      y_center, width, height or y_min, x_min, y_max, x_max.
+    clip: a `bool`, whether or not to clip boxes.
  Returns:
    box_c: a `Tensor` whose last dimension is 4 representing the coordinates of
@@ -141,15 +119,15 @@ def smallest_encompassing_box(box1, box2, yxyx=False):
  bcmi = tf.math.minimum(b1mi, b2mi)
  bcma = tf.math.maximum(b1ma, b2ma)
-  bca = tf.reduce_prod(bcma - bcmi, keepdims=True, axis=-1)
  box_c = tf.concat([bcmi, bcma], axis=-1)
  if not yxyx:
    box_c = yxyx_to_xcycwh(box_c)
-  box_c = tf.where(bca == 0.0, tf.zeros_like(box_c), box_c)
+  if clip:
-  return box_c
+    bca = tf.reduce_prod(bcma - bcmi, keepdims=True, axis=-1)
+    box_c = tf.where(bca <= 0.0, tf.zeros_like(box_c), box_c)
+  return bcmi, bcma, box_c
 def compute_iou(box1, box2, yxyx=False):
@@ -166,15 +144,13 @@ def compute_iou(box1, box2, yxyx=False):
  Returns:
    iou: a `Tensor` who represents the intersection over union.
  """
-  # get box corners
  with tf.name_scope('iou'):
    intersection, union = intersect_and_union(box1, box2, yxyx=yxyx)
    iou = math_ops.divide_no_nan(intersection, union)
-    iou = math_ops.rm_nan_inf(iou, val=0.0)
  return iou
-def compute_giou(box1, box2, yxyx=False, darknet=False):
+def compute_giou(box1, box2, yxyx=False):
  """Calculates the General intersection over union between box1 and box2.
  Args:
@@ -184,38 +160,30 @@ def compute_giou(box1, box2, yxyx=False, darknet=False):
      boxes.
    yxyx: a `bool` indicating whether the input box is of the format x_center
      y_center, width, height or y_min, x_min, y_max, x_max.
-    darknet: a `bool` indicating whether the calling function is the YOLO
-      darknet loss.
  Returns:
    giou: a `Tensor` who represents the General intersection over union.
  """
  with tf.name_scope('giou'):
-    # get IOU
    if not yxyx:
-      box1 = xcycwh_to_yxyx(box1, darknet=darknet)
+      yxyx1 = xcycwh_to_yxyx(box1)
-      box2 = xcycwh_to_yxyx(box2, darknet=darknet)
+      yxyx2 = xcycwh_to_yxyx(box2)
-      yxyx = True
+    else:
+      yxyx1, yxyx2 = box1, box2
-    intersection, union = intersect_and_union(box1, box2, yxyx=yxyx)
+    cmi, cma, _ = smallest_encompassing_box(yxyx1, yxyx2, yxyx=True)
+    intersection, union = intersect_and_union(yxyx1, yxyx2, yxyx=True)
    iou = math_ops.divide_no_nan(intersection, union)
-    iou = math_ops.rm_nan_inf(iou, val=0.0)
-    # find the smallest box to encompase both box1 and box2
+    bcwh = cma - cmi
-    boxc = smallest_encompassing_box(box1, box2, yxyx=yxyx)
+    c = tf.math.reduce_prod(bcwh, axis=-1)
-    if yxyx:
-      boxc = yxyx_to_xcycwh(boxc)
-    _, cwch = tf.split(boxc, 2, axis=-1)
-    c = tf.math.reduce_prod(cwch, axis=-1)
-    # compute giou
    regularization = math_ops.divide_no_nan((c - union), c)
    giou = iou - regularization
-    giou = tf.clip_by_value(giou, clip_value_min=-1.0, clip_value_max=1.0)
  return iou, giou
-def compute_diou(box1, box2, beta=1.0, yxyx=False, darknet=False):
+def compute_diou(box1, box2, beta=1.0, yxyx=False):
  """Calculates the distance intersection over union between box1 and box2.
  Args:
@@ -227,8 +195,6 @@ def compute_diou(box1, box2, beta=1.0, yxyx=False, darknet=False):
      regularization term.
    yxyx: a `bool` indicating whether the input box is of the format x_center
      y_center, width, height or y_min, x_min, y_max, x_max.
-    darknet: a `bool` indicating whether the calling function is the YOLO
-      darknet loss.
  Returns:
    diou: a `Tensor` who represents the distance intersection over union.
@@ -236,30 +202,27 @@ def compute_diou(box1, box2, beta=1.0, yxyx=False, darknet=False):
  with tf.name_scope('diou'):
    # compute center distance
    if not yxyx:
-      box1 = xcycwh_to_yxyx(box1, darknet=darknet)
+      xycc1, xycc2 = box1, box2
-      box2 = xcycwh_to_yxyx(box2, darknet=darknet)
+      yxyx1 = xcycwh_to_yxyx(box1)
-      yxyx = True
+      yxyx2 = xcycwh_to_yxyx(box2)
+    else:
-    intersection, union = intersect_and_union(box1, box2, yxyx=yxyx)
+      yxyx1, yxyx2 = box1, box2
-    boxc = smallest_encompassing_box(box1, box2, yxyx=yxyx)
+      xycc1 = yxyx_to_xcycwh(box1)
+      xycc2 = yxyx_to_xcycwh(box2)
+    cmi, cma, _ = smallest_encompassing_box(yxyx1, yxyx2, yxyx=True)
+    intersection, union = intersect_and_union(yxyx1, yxyx2, yxyx=True)
    iou = math_ops.divide_no_nan(intersection, union)
-    iou = math_ops.rm_nan_inf(iou, val=0.0)
-    if yxyx:
-      boxc = yxyx_to_xcycwh(boxc)
-      box1 = yxyx_to_xcycwh(box1)
-      box2 = yxyx_to_xcycwh(box2)
-    b1xy, _ = tf.split(box1, 2, axis=-1)
+    b1xy, _ = tf.split(xycc1, 2, axis=-1)
-    b2xy, _ = tf.split(box2, 2, axis=-1)
+    b2xy, _ = tf.split(xycc2, 2, axis=-1)
-    _, bcwh = tf.split(boxc, 2, axis=-1)
+    bcwh = cma - cmi
    center_dist = tf.reduce_sum((b1xy - b2xy)**2, axis=-1)
    c_diag = tf.reduce_sum(bcwh**2, axis=-1)
    regularization = math_ops.divide_no_nan(center_dist, c_diag)
    diou = iou - regularization**beta
-    diou = tf.clip_by_value(diou, clip_value_min=-1.0, clip_value_max=1.0)
  return iou, diou
@@ -280,33 +243,48 @@ def compute_ciou(box1, box2, yxyx=False, darknet=False):
    ciou: a `Tensor` who represents the complete intersection over union.
  """
  with tf.name_scope('ciou'):
-    # compute DIOU and IOU
+    if not yxyx:
-    iou, diou = compute_diou(box1, box2, yxyx=yxyx, darknet=darknet)
+      xycc1, xycc2 = box1, box2
+      yxyx1 = xcycwh_to_yxyx(box1)
-    if yxyx:
+      yxyx2 = xcycwh_to_yxyx(box2)
-      box1 = yxyx_to_xcycwh(box1)
+    else:
-      box2 = yxyx_to_xcycwh(box2)
+      yxyx1, yxyx2 = box1, box2
+      xycc1 = yxyx_to_xcycwh(box1)
-    _, _, b1w, b1h = tf.split(box1, 4, axis=-1)
+      xycc2 = yxyx_to_xcycwh(box2)
-    _, _, b2w, b2h = tf.split(box1, 4, axis=-1)
+    # Build the smallest encompassing box.
-    # computer aspect ratio consistency
+    cmi, cma, _ = smallest_encompassing_box(yxyx1, yxyx2, yxyx=True)
-    terma = tf.cast(math_ops.divide_no_nan(b1w, b1h), tf.float32)
+    intersection, union = intersect_and_union(yxyx1, yxyx2, yxyx=True)
-    termb = tf.cast(math_ops.divide_no_nan(b2w, b2h), tf.float32)
+    iou = math_ops.divide_no_nan(intersection, union)
-    arcterm = tf.square(tf.math.atan(terma) - tf.math.atan(termb))
-    v = tf.squeeze(4 * arcterm / (math.pi**2), axis=-1)
+    b1xy, b1w, b1h = tf.split(xycc1, [2, 1, 1], axis=-1)
-    v = tf.cast(v, b1w.dtype)
+    b2xy, b2w, b2h = tf.split(xycc2, [2, 1, 1], axis=-1)
+    bchw = cma - cmi
-    a = tf.stop_gradient(math_ops.divide_no_nan(v, ((1 - iou) + v)))
-    ciou = diou - (v * a)
+    # Center regularization
-    ciou = tf.clip_by_value(ciou, clip_value_min=-1.0, clip_value_max=1.0)
+    center_dist = tf.reduce_sum((b1xy - b2xy)**2, axis=-1)
+    c_diag = tf.reduce_sum(bchw**2, axis=-1)
+    regularization = math_ops.divide_no_nan(center_dist, c_diag)
+    # Compute the aspect ratio consistency
+    terma = math_ops.divide_no_nan(b1w, b1h)  # gt
+    termb = math_ops.divide_no_nan(b2w, b2h)  # pred
+    arcterm = tf.squeeze(
+        tf.math.pow(tf.math.atan(termb) - tf.math.atan(terma), 2), axis=-1)
+    v = (4 / math.pi**2) * arcterm
+    # Compute the aspect ratio weight, should be treated as a constant
+    a = tf.stop_gradient(math_ops.divide_no_nan(v, 1 - iou + v))
+    if darknet:
+      grad_scale = tf.stop_gradient(tf.square(b2w) + tf.square(b2h))
+      v *= tf.squeeze(grad_scale, axis=-1)
+    ciou = iou - regularization - (v * a)
  return iou, ciou
-def aggregated_comparitive_iou(boxes1,
+def aggregated_comparitive_iou(boxes1, boxes2=None, iou_type=0, beta=0.6):
-                               boxes2=None,
-                               iou_type=0,
-                               beta=0.6):
  """Calculates the IOU between two set of boxes.
  Similar to bbox_overlap but far more versitile.
@@ -333,11 +311,11 @@ def aggregated_comparitive_iou(boxes1,
  else:
    boxes2 = tf.transpose(boxes1, perm=(0, 2, 1, 3))
-  if iou_type == 0:  # diou
+  if iou_type == 0 or iou_type == 'diou':  # diou
    _, iou = compute_diou(boxes1, boxes2, beta=beta, yxyx=True)
-  elif iou_type == 1:  # giou
+  elif iou_type == 1 or iou_type == 'giou':  # giou
    _, iou = compute_giou(boxes1, boxes2, yxyx=True)
-  elif iou_type == 2:  # ciou
+  elif iou_type == 2 or iou_type == 'ciou':  # ciou
    _, iou = compute_ciou(boxes1, boxes2, yxyx=True)
  else:
    iou = compute_iou(boxes1, boxes2, yxyx=True)

--- a/official/vision/beta/projects/yolo/ops/loss_utils.py
+++ b/official/vision/beta/projects/yolo/ops/loss_utils.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Yolo loss utility functions."""
+import numpy as np
+import tensorflow as tf
+from official.vision.beta.projects.yolo.ops import box_ops
+from official.vision.beta.projects.yolo.ops import math_ops
+@tf.custom_gradient
+def sigmoid_bce(y, x_prime, label_smoothing):
+  """Applies the Sigmoid Cross Entropy Loss.
+  Implements the same derivative as that found in the Darknet C library.
+  The derivative of this method is not the same as the standard binary cross
+  entropy with logits function.
+  The BCE with logits function equation is as follows:
+    x = 1 / (1 + exp(-x_prime))
+    bce = -ylog(x) - (1 - y)log(1 - x)
+  The standard BCE with logits function derivative is as follows:
+    dloss = -y/x + (1-y)/(1-x)
+    dsigmoid = x * (1 - x)
+    dx = dloss * dsigmoid
+  This derivative can be reduced simply to:
+    dx = (-y + x)
+  This simplification is used by the darknet library in order to improve
+  training stability. The gradient is almost the same
+  as tf.keras.losses.binary_crossentropy but varies slightly and
+  yields different performance.
+  Args:
+    y: `Tensor` holding ground truth data.
+    x_prime: `Tensor` holding the predictions prior to application of the
+      sigmoid operation.
+    label_smoothing: float value between 0.0 and 1.0 indicating the amount of
+      smoothing to apply to the data.
+  Returns:
+    bce: `Tensor` of the element-wise loss values.
+    delta: callable function indicating the custom gradient for this operation.
+  """
+  # small constant added inside both logs to avoid evaluating log(0)
+  eps = 1e-9
+  x = tf.math.sigmoid(x_prime)
+  # pull the labels toward 0.5 by label_smoothing and treat them as constants
+  y = tf.stop_gradient(y * (1 - label_smoothing) + 0.5 * label_smoothing)
+  bce = -y * tf.math.log(x + eps) - (1 - y) * tf.math.log(1 - x + eps)
+  def delta(dpass):
+    # one gradient is returned per input: zeros for y, (sigmoid(x_prime) - y)
+    # scaled by the upstream gradient for x_prime, and 0.0 for label_smoothing
+    x = tf.math.sigmoid(x_prime)
+    dx = (-y + x) * dpass
+    dy = tf.zeros_like(y)
+    return dy, dx, 0.0
+  return bce, delta
+def apply_mask(mask, x, value=0):
+  """This function is used for gradient masking.
+  The YOLO loss function makes extensive use of dynamically shaped tensors.
+  To allow this use case on the TPU while preserving the gradient correctly
+  for back propagation we use this masking function to use a tf.where operation
+  to hard set masked locations to a constant (`value`, zero by default)
+  instead of removing them from the tensor.
+  Args:
+    mask: A `Tensor` with the same shape as x used to select values of
+      importance; it is cast to bool before use.
+    x: A `Tensor` with the same shape as mask that will be getting masked.
+    value: constant written to the locations where mask is False.
+  Returns:
+    masked: A `Tensor` with the same shape as x.
+  """
+  mask = tf.cast(mask, tf.bool)
+  # zeros_like(x) + value builds the constant fill tensor in the shape of x
+  masked = tf.where(mask, x, tf.zeros_like(x) + value)
+  return masked
+def build_grid(indexes, truths, preds, ind_mask, update=False, grid=None):
+  """This function is used to broadcast elements into the output shape.
+  This function is used to broadcast a list of truths into the correct index
+  in the output shape. This is used for the ground truth map construction in
+  the scaled loss and the classification map in the darknet loss.
+  Args:
+    indexes: A `Tensor` for the indexes; a batch index is prepended to its
+      last dimension and the result is flattened to rows of 4.
+    truths: A `Tensor` for the ground truth values to scatter.
+    preds: A `Tensor` for the predictions; supplies the size of the last
+      dimension and the shape of the default grid.
+    ind_mask: A `Tensor` for the index masks, cast to the dtype of indexes.
+    update: A `bool`; if True values are written with
+      tf.tensor_scatter_nd_update, otherwise with tf.tensor_scatter_nd_max.
+    grid: A `Tensor` for the grid to scatter into; a zero grid shaped like
+      preds is used when None.
+  Returns:
+    grid: A `Tensor` representing the augmented grid.
+  """
+  # this function is used to broadcast all the truths into the correct
+  # location of the ground truth mask, used for the iou detection map
+  # in the scaled loss and the classification mask in the darknet loss
+  num_flatten = tf.shape(preds)[-1]
+  # is there a way to verify that we are not on the CPU?
+  ind_mask = tf.cast(ind_mask, indexes.dtype)
+  # find all the batch indexes using the cumulative sum of a ones tensor
+  # cumsum(ones) - 1 yields the zero indexed batches
+  bhep = tf.reduce_max(tf.ones_like(indexes), axis=-1, keepdims=True)
+  bhep = tf.math.cumsum(bhep, axis=0) - 1
+  # concatenate the batch indexes to the indexes
+  indexes = tf.concat([bhep, indexes], axis=-1)
+  # zero the masked-out entries, then shift them to -1: (ind_mask - 1) is 0
+  # where the mask is 1 and -1 where it is 0 (assumes a 0/1 mask; presumably
+  # -1 marks entries the scatter should skip -- TODO confirm)
+  indexes = apply_mask(tf.cast(ind_mask, indexes.dtype), indexes)
+  indexes = (indexes + (ind_mask - 1))
+  # reshape the indexes into the correct shape for the loss,
+  # just flatten all indexes but the last
+  indexes = tf.reshape(indexes, [-1, 4])
+  # also flatten the ground truth value on all axes but the last
+  truths = tf.reshape(truths, [-1, num_flatten])
+  # build a zero grid in the same shape as the predictions
+  if grid is None:
+    grid = tf.zeros_like(preds)
+  # remove invalid values from the truths that may have
+  # come up from computation, invalid = nan and inf
+  truths = math_ops.rm_nan_inf(truths)
+  # scatter the truths into the grid
+  if update:
+    grid = tf.tensor_scatter_nd_update(grid, indexes, truths)
+  else:
+    grid = tf.tensor_scatter_nd_max(grid, indexes, truths)
+  # NOTE(review): no stop_gradient is applied in this function; callers that
+  # need a constant grid must wrap the result themselves
+  return grid
+class GridGenerator:
+  """Grid generator that generates anchor grids for box decoding."""
+  def __init__(self, anchors, masks=None, scale_anchors=None):
+    """Initialize Grid Generator.
+    Args:
+      anchors: A `List[List[int]]` for the anchor boxes that are used in the
+        model at all levels.
+      masks: A `List[int]` of indexes into `anchors` selecting the anchors
+        used by this output level; all anchors are kept when None.
+      scale_anchors: An `int` for how much to scale this level to get the
+        original input shape; the anchors are divided by it in `__call__`.
+    """
+    self.dtype = tf.keras.backend.floatx()
+    if masks is not None:
+      self._num = len(masks)
+    else:
+      self._num = tf.shape(anchors)[0]
+    if masks is not None:
+      anchors = [anchors[mask] for mask in masks]
+    # NOTE(review): scale_anchors defaults to None but is cast and used as a
+    # divisor in __call__ -- confirm callers always provide it
+    self._scale_anchors = scale_anchors
+    self._anchors = tf.convert_to_tensor(anchors)
+    return
+  def _build_grid_points(self, lwidth, lheight, anchors, dtype):
+    """Generate a grid of fixed grid edges for box center decoding.
+    Returns a tensor of shape [1, lwidth, lheight, num_anchors, 2] cast to
+    `dtype`, holding the integer grid coordinates of every cell.
+    """
+    with tf.name_scope('center_grid'):
+      y = tf.range(0, lheight)
+      x = tf.range(0, lwidth)
+      num = tf.shape(anchors)[0]
+      # NOTE(review): x_left is built from the lheight range and y_left from
+      # the lwidth range -- confirm the intended axis order with the callers
+      x_left = tf.tile(
+          tf.transpose(tf.expand_dims(y, axis=-1), perm=[1, 0]), [lwidth, 1])
+      y_left = tf.tile(tf.expand_dims(x, axis=-1), [1, lheight])
+      x_y = tf.stack([x_left, y_left], axis=-1)
+      x_y = tf.cast(x_y, dtype=dtype)
+      # repeat the grid once per anchor and add a leading batch axis of 1
+      x_y = tf.expand_dims(
+          tf.tile(tf.expand_dims(x_y, axis=-2), [1, 1, num, 1]), axis=0)
+    return x_y
+  def _build_anchor_grid(self, anchors, dtype):
+    """Cast the anchors to `dtype` and reshape them to [1, 1, 1, num, 2]."""
+    with tf.name_scope('anchor_grid'):
+      num = tf.shape(anchors)[0]
+      anchors = tf.cast(anchors, dtype=dtype)
+      anchors = tf.reshape(anchors, [1, 1, 1, num, 2])
+    return anchors
+  def _extend_batch(self, grid, batch_size):
+    """Tile a rank 5 grid `batch_size` times along its first axis."""
+    return tf.tile(grid, [batch_size, 1, 1, 1, 1])
+  def __call__(self, width, height, batch_size, dtype=None):
+    """Build the grid points and the scaled anchor grid for one level.
+    Args:
+      width: grid width passed to `_build_grid_points` as `lwidth`.
+      height: grid height passed to `_build_grid_points` as `lheight`.
+      batch_size: number of times both grids are tiled along the first axis.
+      dtype: dtype of the outputs; the Keras floatx is used when None.
+    Returns:
+      grid_points: the tiled grid coordinates.
+      anchor_grid: the tiled anchors divided by `scale_anchors`.
+    """
+    if dtype is None:
+      self.dtype = tf.keras.backend.floatx()
+    else:
+      self.dtype = dtype
+    grid_points = self._build_grid_points(width, height, self._anchors,
+                                          self.dtype)
+    anchor_grid = self._build_anchor_grid(
+        tf.cast(self._anchors, self.dtype) /
+        tf.cast(self._scale_anchors, self.dtype), self.dtype)
+    grid_points = self._extend_batch(grid_points, batch_size)
+    anchor_grid = self._extend_batch(anchor_grid, batch_size)
+    return grid_points, anchor_grid
+# Number of ground truth boxes PairWiseSearch slices per search iteration.
+TILE_SIZE = 50
+class PairWiseSearch:
+  """Apply a pairwise search between the ground truth and the labels.
+  The goal is to indicate the locations where the predictions overlap with
+  ground truth for dynamic ground truth associations. The ground truth boxes
+  are visited in slices of TILE_SIZE inside a tf.while_loop while a running
+  maximum IOU is kept for every prediction.
+  """
+  def __init__(self,
+               iou_type='iou',
+               any_match=True,
+               min_conf=0.0,
+               track_boxes=False,
+               track_classes=False):
+    """Initialization of Pair Wise Search.
+    Args:
+      iou_type: A `str` for the iou type to use; 'giou' and 'ciou' select the
+        matching box_ops function, any other value uses the plain iou.
+      any_match: A `bool` for any match(no class match).
+      min_conf: A `float` for the minimum confidence threshold; class based
+        masking of the iou is only applied when it is greater than 0.
+      track_boxes: A `bool` dynamic box assignment.
+      track_classes: A `bool` dynamic class assignment.
+    """
+    self.iou_type = iou_type
+    self._any = any_match
+    self._min_conf = min_conf
+    self._track_boxes = track_boxes
+    self._track_classes = track_classes
+    return
+  def box_iou(self, true_box, pred_box):
+    """Compute the overlap between boxes using the configured iou type."""
+    # based on the type of loss, compute the iou loss for a box
+    # compute_<name> indicates the type of iou to use
+    if self.iou_type == 'giou':
+      _, iou = box_ops.compute_giou(true_box, pred_box)
+    elif self.iou_type == 'ciou':
+      _, iou = box_ops.compute_ciou(true_box, pred_box)
+    else:
+      iou = box_ops.compute_iou(true_box, pred_box)
+    return iou
+  def _search_body(self, pred_box, pred_class, boxes, classes, running_boxes,
+                   running_classes, max_iou, idx):
+    """Main search fn.
+    Body of the tf.while_loop: compares the predictions with tile `idx` of
+    the ground truth and updates the running maximum iou, and optionally the
+    best matching box and class. Returns the loop variables with idx + 1.
+    """
+    # capture the batch size to be used, and gather a slice of
+    # boxes from the ground truth. currently TILE_SIZE = 50, to
+    # save memory
+    batch_size = tf.shape(boxes)[0]
+    box_slice = tf.slice(boxes, [0, idx * TILE_SIZE, 0],
+                         [batch_size, TILE_SIZE, 4])
+    # match the dimensions of the slice to the model predictions
+    # shape: [batch_size, 1, 1, 1, TILE_SIZE, 4], broadcast against box_grid
+    box_slice = tf.expand_dims(box_slice, axis=1)
+    box_slice = tf.expand_dims(box_slice, axis=1)
+    box_slice = tf.expand_dims(box_slice, axis=1)
+    box_grid = tf.expand_dims(pred_box, axis=-2)
+    # capture the classes
+    class_slice = tf.slice(classes, [0, idx * TILE_SIZE],
+                           [batch_size, TILE_SIZE])
+    class_slice = tf.expand_dims(class_slice, axis=1)
+    class_slice = tf.expand_dims(class_slice, axis=1)
+    class_slice = tf.expand_dims(class_slice, axis=1)
+    iou = self.box_iou(box_slice, box_grid)
+    # when a confidence threshold is set, scale the iou by a class based mask
+    if self._min_conf > 0.0:
+      if not self._any:
+        class_grid = tf.expand_dims(pred_class, axis=-2)
+        class_mask = tf.one_hot(
+            tf.cast(class_slice, tf.int32),
+            depth=tf.shape(pred_class)[-1],
+            dtype=pred_class.dtype)
+        class_mask = tf.reduce_any(tf.equal(class_mask, class_grid), axis=-1)
+      else:
+        # any class counts: take the largest entry of pred_class per location
+        class_mask = tf.reduce_max(pred_class, axis=-1, keepdims=True)
+      class_mask = tf.cast(class_mask, iou.dtype)
+      iou *= class_mask
+    # put the previous running maximum in front of this tile's ious so that
+    # an argmax of 0 means the earlier best match is kept
+    max_iou_ = tf.concat([max_iou, iou], axis=-1)
+    max_iou = tf.reduce_max(max_iou_, axis=-1, keepdims=True)
+    ind = tf.expand_dims(tf.argmax(max_iou_, axis=-1), axis=-1)
+    if self._track_boxes:
+      # candidates are [previous best box, boxes of this tile]; pick by ind
+      running_boxes = tf.expand_dims(running_boxes, axis=-2)
+      box_slice = tf.zeros_like(running_boxes) + box_slice
+      box_slice = tf.concat([running_boxes, box_slice], axis=-2)
+      running_boxes = tf.gather_nd(box_slice, ind, batch_dims=4)
+    if self._track_classes:
+      # same selection for the class of the best matching ground truth
+      running_classes = tf.expand_dims(running_classes, axis=-1)
+      class_slice = tf.zeros_like(running_classes) + class_slice
+      class_slice = tf.concat([running_classes, class_slice], axis=-1)
+      running_classes = tf.gather_nd(class_slice, ind, batch_dims=4)
+    return (pred_box, pred_class, boxes, classes, running_boxes,
+            running_classes, max_iou, idx + 1)
+  def __call__(self,
+               pred_boxes,
+               pred_classes,
+               boxes,
+               classes,
+               scale=None,
+               yxyx=True,
+               clip_thresh=0.0):
+    """Search the ground truth for the best match of every prediction.
+    Args:
+      pred_boxes: A `Tensor` of predicted boxes; passed to the iou functions
+        unchanged.
+      pred_classes: A `Tensor` of predicted class scores; binarized against
+        min_conf when min_conf is greater than 0.
+      boxes: A `Tensor` of ground truth boxes, sliced along its second to
+        last axis in tiles of TILE_SIZE.
+      classes: A `Tensor` of ground truth classes, sliced like boxes.
+      scale: optional multiplier applied to the ground truth boxes.
+      yxyx: A `bool`; when True the ground truth boxes are converted with
+        box_ops.yxyx_to_xcycwh before the search.
+      clip_thresh: results are zeroed wherever the best iou is not greater
+        than this value.
+    Returns:
+      A tuple (running_boxes, running_classes, max_iou, mask), each wrapped
+      in tf.stop_gradient.
+    """
+    num_boxes = tf.shape(boxes)[-2]
+    # NOTE(review): with this bound the loop runs at most
+    # num_boxes // TILE_SIZE - 1 times, e.g. zero iterations when
+    # num_boxes == TILE_SIZE -- confirm that skipping the last tile is intended
+    num_tiles = (num_boxes // TILE_SIZE) - 1
+    if yxyx:
+      boxes = box_ops.yxyx_to_xcycwh(boxes)
+    if scale is not None:
+      boxes = boxes * tf.stop_gradient(scale)
+    if self._min_conf > 0.0:
+      pred_classes = tf.cast(pred_classes > self._min_conf, pred_classes.dtype)
+    def _loop_cond(unused_pred_box, unused_pred_class, boxes, unused_classes,
+                   unused_running_boxes, unused_running_classes, unused_max_iou,
+                   idx):
+      # keep looping while tiles remain and the current slice of boxes has a
+      # positive sum (presumably meaning it is not all zero padding)
+      batch_size = tf.shape(boxes)[0]
+      box_slice = tf.slice(boxes, [0, idx * TILE_SIZE, 0],
+                           [batch_size, TILE_SIZE, 4])
+      return tf.logical_and(idx < num_tiles,
+                            tf.math.greater(tf.reduce_sum(box_slice), 0))
+    # running results start at zero and share the shape of the predictions
+    running_boxes = tf.zeros_like(pred_boxes)
+    running_classes = tf.zeros_like(tf.reduce_sum(running_boxes, axis=-1))
+    max_iou = tf.zeros_like(tf.reduce_sum(running_boxes, axis=-1))
+    max_iou = tf.expand_dims(max_iou, axis=-1)
+    (pred_boxes, pred_classes, boxes, classes, running_boxes, running_classes,
+     max_iou, _) = tf.while_loop(_loop_cond, self._search_body, [
+         pred_boxes, pred_classes, boxes, classes, running_boxes,
+         running_classes, max_iou,
+         tf.constant(0)
+     ])
+    # zero every output where the best iou does not exceed clip_thresh
+    mask = tf.cast(max_iou > clip_thresh, running_boxes.dtype)
+    running_boxes *= mask
+    running_classes *= tf.squeeze(mask, axis=-1)
+    max_iou *= mask
+    max_iou = tf.squeeze(max_iou, axis=-1)
+    mask = tf.squeeze(mask, axis=-1)
+    return (tf.stop_gradient(running_boxes), tf.stop_gradient(running_classes),
+            tf.stop_gradient(max_iou), tf.stop_gradient(mask))
+def average_iou(iou):
+  """Computes the average intersection over union, ignoring zero entries.
+  Every axis except the leading one is reduced: the iou values are summed
+  and divided by the number of non-zero entries, and the resulting values
+  are then averaged into a single number.
+  Args:
+    iou: A `Tensor` representing the iou values.
+  Returns:
+    A scalar `Tensor` representing the average intersection over union, with
+    its gradient stopped.
+  """
+  # all axes but the first take part in the per-entry reduction
+  reduce_axes = tf.range(1, tf.rank(iou))
+  nonzero_counts = tf.math.count_nonzero(iou, axis=reduce_axes)
+  nonzero_counts = tf.cast(nonzero_counts, iou.dtype)
+  totals = tf.reduce_sum(iou, axis=reduce_axes)
+  per_entry_avg = math_ops.divide_no_nan(totals, nonzero_counts)
+  return tf.stop_gradient(tf.reduce_mean(per_entry_avg))
+def _scale_boxes(encoded_boxes, width, height, anchor_grid, grid_points,
+                 scale_xy):
+  """Decodes boxes using sigmoid centers and exponential width and height.
+  Args:
+    encoded_boxes: A `Tensor` whose last axis holds the encoded
+      [x, y, w, h] values in positions 0:2 and 2:4.
+    width: The value placed in the width slots of the returned scaler.
+    height: The value placed in the height slots of the returned scaler.
+    anchor_grid: A `Tensor` multiplied into the exponentiated width/height.
+    grid_points: A `Tensor` added to the decoded center offsets.
+    scale_xy: A scalar, cast to the dtype of `encoded_boxes`, that widens the
+      range of the center offsets.
+  Returns:
+    A tuple `(scaler, scaled_box, pred_box)`: the
+    `[height, width, height, width]` scaler, the box built from the center
+    offsets (without the grid points) and the sizes, and the grid-shifted box
+    divided by the scaler.
+  """
+  scale_xy = tf.cast(scale_xy, encoded_boxes.dtype)
+  # tensor used to normalize the decoded boxes by the layer dimensions
+  scaler = tf.convert_to_tensor([height, width, height, width])
+  raw_xy = encoded_boxes[..., 0:2]
+  raw_wh = encoded_boxes[..., 2:4]
+  # squash the centers with a sigmoid, then stretch them by scale_xy while
+  # keeping the offset centered on its own cell
+  offset_xy = tf.math.sigmoid(raw_xy) * scale_xy - 0.5 * (scale_xy - 1)
+  # widths and heights are exponentiated and tied to the anchor boxes
+  size_wh = tf.math.exp(raw_wh) * anchor_grid
+  # adding the grid points places each box relative to the whole layer
+  absolute_box = tf.concat([grid_points + offset_xy, size_wh], axis=-1)
+  pred_box = absolute_box / scaler
+  # the scaled box keeps the per-cell offsets rather than the shifted centers
+  scaled_box = tf.concat([offset_xy, size_wh], axis=-1)
+  return (scaler, scaled_box, pred_box)
+@tf.custom_gradient
+def _darknet_boxes(encoded_boxes, width, height, anchor_grid, grid_points,
+                   max_delta, scale_xy):
+  """Decodes boxes with `_scale_boxes` and supplies a clipped custom gradient.
+  The forward pass is exactly `_scale_boxes`. The backward pass sums the
+  gradients that arrive on `scaled_box` and `pred_box`, multiplies the
+  width/height half by `exp` of the encoded width/height, removes nan/inf
+  values and clips the result to `[-max_delta, max_delta]`. Only
+  `encoded_boxes` receives a gradient; every other input gets 0.0.
+  """
+  decoded = _scale_boxes(encoded_boxes, width, height, anchor_grid,
+                         grid_points, scale_xy)
+  def grad(unused_dy_scaler, dy_scaled, dy):
+    box_dxy, box_dwh = tf.split(dy, 2, axis=-1)
+    scaled_dxy, scaled_dwh = tf.split(dy_scaled, 2, axis=-1)
+    # merge the gradients coming from both decoded outputs, then carry the
+    # width/height gradient through the exponential used in the forward pass
+    # so that it has the correct magnitude
+    dwh = (box_dwh + scaled_dwh) * tf.math.exp(encoded_boxes[..., 2:4])
+    dxy = box_dxy + scaled_dxy
+    # sanitize and clip the combined xy/wh gradient
+    dbox = math_ops.rm_nan_inf(tf.concat([dxy, dwh], axis=-1))
+    limit = tf.cast(max_delta, dbox.dtype)
+    dbox = tf.clip_by_value(dbox, -limit, limit)
+    return dbox, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+  return decoded, grad
+def _new_coord_scale_boxes(encoded_boxes, width, height, anchor_grid,
+                           grid_points, scale_xy):
+  """Decodes boxes using sigmoid centers and squared-sigmoid width and height.
+  Args:
+    encoded_boxes: A `Tensor` whose last axis holds the encoded
+      [x, y, w, h] values in positions 0:2 and 2:4.
+    width: The value placed in the width slots of the returned scaler.
+    height: The value placed in the height slots of the returned scaler.
+    anchor_grid: A `Tensor` multiplied into the decoded width/height.
+    grid_points: A `Tensor` added to the decoded center offsets.
+    scale_xy: A scalar, cast to the dtype of the encoded centers, that widens
+      the range of the center offsets.
+  Returns:
+    A tuple `(scaler, scaled_box, pred_box)`: the
+    `[height, width, height, width]` scaler, the box built from the center
+    offsets (without the grid points) and the sizes, and the grid-shifted box
+    divided by the scaler.
+  """
+  raw_xy = encoded_boxes[..., 0:2]
+  raw_wh = encoded_boxes[..., 2:4]
+  scale_xy = tf.cast(scale_xy, raw_xy.dtype)
+  # tensor used to normalize the decoded boxes by the layer dimensions
+  scaler = tf.convert_to_tensor([height, width, height, width])
+  # centers: sigmoid, then stretched by scale_xy around the cell center
+  offset_xy = tf.math.sigmoid(raw_xy) * scale_xy - 0.5 * (scale_xy - 1)
+  # sizes: twice the sigmoid, squared, then tied to the anchor boxes
+  size_wh = (2 * tf.math.sigmoid(raw_wh))**2 * anchor_grid
+  # adding the grid points places each box relative to the whole layer
+  absolute_box = tf.concat([grid_points + offset_xy, size_wh], axis=-1)
+  pred_box = absolute_box / scaler
+  # the scaled box keeps the per-cell offsets rather than the shifted centers
+  scaled_box = tf.concat([offset_xy, size_wh], axis=-1)
+  return (scaler, scaled_box, pred_box)
+@tf.custom_gradient
+def _darknet_new_coord_boxes(encoded_boxes, width, height, anchor_grid,
+                             grid_points, max_delta, scale_xy):
+  """Decodes boxes with `_new_coord_scale_boxes` and a clipped gradient.
+  The forward pass is exactly `_new_coord_scale_boxes`. The backward pass
+  sums the gradients that arrive on `scaled_box` and `pred_box`, removes
+  nan/inf values and clips the result to `[-max_delta, max_delta]`. Only
+  `encoded_boxes` receives a gradient; every other input gets 0.0.
+  """
+  decoded = _new_coord_scale_boxes(encoded_boxes, width, height, anchor_grid,
+                                   grid_points, scale_xy)
+  def grad(unused_dy_scaler, dy_scaled, dy):
+    box_dxy, box_dwh = tf.split(dy, 2, axis=-1)
+    scaled_dxy, scaled_dwh = tf.split(dy_scaled, 2, axis=-1)
+    # merge the gradients coming from both decoded outputs
+    dwh = box_dwh + scaled_dwh
+    dxy = box_dxy + scaled_dxy
+    # sanitize and clip the combined xy/wh gradient
+    dbox = math_ops.rm_nan_inf(tf.concat([dxy, dwh], axis=-1))
+    limit = tf.cast(max_delta, dbox.dtype)
+    dbox = tf.clip_by_value(dbox, -limit, limit)
+    return dbox, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+  return decoded, grad
+def _anchor_free_scale_boxes(encoded_boxes, width, height, stride, grid_points,
+                             scale_xy):
+  """Decodes boxes using the stride instead of anchor boxes.
+  Args:
+    encoded_boxes: A `Tensor` whose last axis holds the encoded
+      [x, y, w, h] values in positions 0:2 and 2:4.
+    width: The value placed in the width slots of the returned scaler.
+    height: The value placed in the height slots of the returned scaler.
+    stride: The factor multiplied into both the shifted centers and the
+      exponentiated width/height.
+    grid_points: A `Tensor` added to the center offsets before striding.
+    scale_xy: A scalar, cast to the dtype of `encoded_boxes`, that widens the
+      range of the center offsets.
+  Returns:
+    A tuple `(scaler, scaled_box, pred_box)`: the
+    `[height, width, height, width]` scaler, the strided box, and that same
+    box divided by the scaler.
+  """
+  scale_xy = tf.cast(scale_xy, encoded_boxes.dtype)
+  # tensor used to normalize the decoded boxes
+  scaler = tf.convert_to_tensor([height, width, height, width])
+  raw_xy = encoded_boxes[..., 0:2]
+  raw_wh = encoded_boxes[..., 2:4]
+  # no sigmoid here: the raw centers are only stretched by scale_xy
+  offset_xy = raw_xy * scale_xy - 0.5 * (scale_xy - 1)
+  # shift by the grid points, then scale centers and sizes by the stride
+  center_xy = (grid_points + offset_xy) * stride
+  size_wh = tf.math.exp(raw_wh) * stride
+  scaled_box = tf.concat([center_xy, size_wh], axis=-1)
+  return (scaler, scaled_box, scaled_box / scaler)
+def get_predicted_box(width,
+                      height,
+                      encoded_boxes,
+                      anchor_grid,
+                      grid_points,
+                      scale_xy,
+                      stride,
+                      darknet=False,
+                      box_type='original',
+                      max_delta=np.inf):
+  """Decodes the predicted boxes from the model format to a usable format.
+  The model outputs are decoded into the [x, y, w, h] format so they can be
+  used by the loss function and by the detection generator. The decoder is
+  chosen from `box_type` and `darknet`: 'anchor_free' always uses the stride
+  based decoder; otherwise 'scaled' selects the new-coordinate decoder and
+  any other value the original one, with `darknet` switching to the variant
+  that carries a custom, clipped gradient.
+  Args:
+    width: A `float` scalar indicating the width of the prediction layer.
+    height: A `float` scalar indicating the height of the prediction layer.
+    encoded_boxes: A `Tensor` of shape [..., height, width, 4] holding encoded
+      boxes.
+    anchor_grid: A `Tensor` of shape [..., 1, 1, 2] holding the anchor boxes
+      organized for box decoding, box width and height. Unused when
+      `box_type` is 'anchor_free'.
+    grid_points: A `Tensor` of shape [..., height, width, 2] holding the grid
+      points used for decoding the box centers.
+    scale_xy: A `float` scaler used to indicate the range for each center
+      outside of its given [..., i, j, 4] index, where i and j are indexing
+      pixels along the width and height of the predicted output map.
+    stride: An `int` defining the amount of down stride relative to the input
+      image. Only used when `box_type` is 'anchor_free'.
+    darknet: A `bool` used to select between custom gradient and default
+      autograd.
+    box_type: A `str` indicating the type of box encoding that is being used.
+    max_delta: A `float` scaler used for gradient clipping in back
+      propagation. Only used by the custom gradient decoders.
+  Returns:
+    scaler: A `Tensor` of shape [4] returned to allow the scaling of the ground
+      truth boxes to be of the same magnitude as the decoded predicted boxes.
+    scaled_box: A `Tensor` of shape [..., height, width, 4] with the predicted
+      boxes.
+    pred_box: A `Tensor` of shape [..., height, width, 4] with the predicted
+      boxes divided by the scaler parameter used to put all boxes in the [0, 1]
+      range.
+  """
+  if box_type == 'anchor_free':
+    decoded = _anchor_free_scale_boxes(encoded_boxes, width, height, stride,
+                                       grid_points, scale_xy)
+  else:
+    use_new_coord = box_type == 'scaled'
+    if darknet:
+      # with the darknet loss the decoding itself should not be propagated
+      # by autograd, so the custom gradient decoders are used
+      decode_fn = _darknet_new_coord_boxes if use_new_coord else _darknet_boxes
+      decoded = decode_fn(encoded_boxes, width, height, anchor_grid,
+                          grid_points, max_delta, scale_xy)
+    else:
+      # with the scaled loss the decoding of the boxes is propagated
+      decode_fn = _new_coord_scale_boxes if use_new_coord else _scale_boxes
+      decoded = decode_fn(encoded_boxes, width, height, anchor_grid,
+                          grid_points, scale_xy)
+  # pylint:disable=unbalanced-tuple-unpacking
+  (scaler, scaled_box, pred_box) = decoded
+  return (scaler, scaled_box, pred_box)
--- a/official/vision/beta/projects/yolo/ops/math_ops.py
+++ b/official/vision/beta/projects/yolo/ops/math_ops.py
@@ -58,25 +58,4 @@ def divide_no_nan(a, b):
  Returns:
    a `Tensor` representing a divided by b, with all nan values removed.
  """
-  zero = tf.cast(0.0, b.dtype)
+  return a / (b + 1e-9)
-  return tf.where(b == zero, zero, a / b)
-def mul_no_nan(x, y):
-  """Nan safe multiply operation.
-  Built to allow model compilation in tflite and
-  to allow one tensor to mask another. Where ever x is zero the
-  multiplication is not computed and the value is replaced with a zero. This is
-  required because 0 * nan = nan. This can make computation unstable in some
-  cases where the intended behavior is for zero to mean ignore.
-  Args:
-    x: any `Tensor` of any type.
-    y: any `Tensor` of any type with the same shape as tensor x.
-  Returns:
-    a `Tensor` representing x times y, where x is used to safely mask the
-    tensor y.
-  """
-  return tf.where(x == 0, tf.cast(0, x.dtype), x * y)