"docs/vscode:/vscode.git/clone" did not exist on "326bbf034e90c51c33607ed251508f365bb1e6bb"
Commit 8e9db0f1 authored by Vishnu Banna's avatar Vishnu Banna
Browse files

loss function and init/run test

parent a1df6e20
import tensorflow as tf
from collections import defaultdict
import abc
from functools import partial
from official.vision.beta.projects.yolo.ops import (loss_utils,
box_ops, math_ops)
class YoloLossBase(object, metaclass=abc.ABCMeta):
"""Parameters for the YOLO loss functions used at each detection
generator. This base class implements the base functionality required to
implement a Yolo Loss function"""
def __init__(self,
classes,
mask,
anchors,
path_stride=1,
ignore_thresh=0.7,
truth_thresh=1.0,
loss_type="ciou",
iou_normalizer=1.0,
cls_normalizer=1.0,
obj_normalizer=1.0,
label_smoothing=0.0,
objectness_smooth=True,
update_on_repeat=False,
box_type="original",
scale_x_y=1.0,
max_delta=10):
"""Loss Function Initialization.
Args:
classes: `int` for the number of classes.
mask: `List[int]` for the anchor box indexes used at this output level.
anchors: `List[List[int]]` for the anchor boxes that are used in the model
at all levels. For anchor free prediction set the anchor list to be the
same as the image resolution.
path_stride: `int` for how much to scale this level to get the original
input shape.
ignore_thresh: `float` for the IOU value over which the loss is not
propagated, and a detection is assumed to have been made.
truth_thresh: `float` for the IOU value over which the loss is propagated
despite a detection being made.
loss_type: `str` for the type of iou loss to use, one of {ciou, diou,
giou, iou}.
iou_normalizer: `float` for how much to scale the loss on the IOU or the
boxes.
cls_normalizer: `float` for how much to scale the loss on the classes.
obj_normalizer: `float` for how much to scale the loss on the detection
map.
label_smoothing: `float` for how much to smooth the loss on the classes.
objectness_smooth: `float` for how much to smooth the loss on the
detection map.
update_on_repeat: `bool` for whether to replace with the newest or the
best value when an index is consumed by multiple objects.
box_type: `str` for which box decoding type to use, one of {original,
scaled, anchor_free}.
scale_x_y: `float` indicating how far outside of its own cell of size 1.0
each pixel can predict a box center. A value of 1.2 gives a 20%
extended radius around each cell within which that pixel can place a
center, so the center can range from -(value - 1) / 2 to
1 + (value - 1) / 2. This value is set in the yolo filter and reused
here; there should be one value of scale_x_y for each level from
min_level to max_level.
max_delta: `float` for the gradient clipping to apply to the box loss.
"""
self._loss_type = loss_type
self._classes = tf.constant(tf.cast(classes, dtype=tf.int32))
self._num = tf.cast(len(mask), dtype=tf.int32)
self._truth_thresh = truth_thresh
self._ignore_thresh = ignore_thresh
self._masks = mask
self._anchors = anchors
self._iou_normalizer = iou_normalizer
self._cls_normalizer = cls_normalizer
self._obj_normalizer = obj_normalizer
self._scale_x_y = scale_x_y
self._max_delta = max_delta
self._label_smoothing = tf.cast(label_smoothing, tf.float32)
self._objectness_smooth = float(objectness_smooth)
self._update_on_repeat = update_on_repeat
self._box_type = box_type
self._path_stride = path_stride
box_kwargs = dict(
stride=self._path_stride,
scale_xy=self._scale_x_y,
box_type=self._box_type,
max_delta=self._max_delta)
self._decode_boxes = partial(loss_utils.get_predicted_box, **box_kwargs)
self._build_per_path_attributes()
def box_loss(self, true_box, pred_box, darknet=False):
"""Calls the iou functions and uses it to compute the loss this op is
the same regardless of Yolo Loss version"""
if self._loss_type == "giou":
iou, liou = box_ops.compute_giou(true_box, pred_box)
elif self._loss_type == "ciou":
iou, liou = box_ops.compute_ciou(true_box, pred_box, darknet=darknet)
else:
liou = iou = box_ops.compute_iou(true_box, pred_box)
loss_box = 1 - liou
return iou, liou, loss_box
def _tiled_global_box_search(self,
pred_boxes,
pred_classes,
boxes,
classes,
true_conf,
smoothed,
scale=None):
"""Completes a search of all predictions against all the ground truths to
dynamically associate ground truths with predictions."""
# Search all predictions against ground truths to find matching boxes for
# each pixel.
_, _, iou_max, _ = self._search_pairs(
pred_boxes, pred_classes, boxes, classes, scale=scale, yxyx=True)
# Find the exact indexes to ignore and keep.
ignore_mask = tf.cast(iou_max < self._ignore_thresh, pred_boxes.dtype)
iou_mask = iou_max > self._ignore_thresh
if not smoothed:
# Ignore all pixels where a box was not supposed to be predicted but a
# high confidence box was predicted.
obj_mask = true_conf + (1 - true_conf) * ignore_mask
else:
# Replace pixels in the true confidence map with the max iou predicted
# within that cell.
obj_mask = tf.ones_like(true_conf)
iou_ = (1 - self._objectness_smooth) + self._objectness_smooth * iou_max
iou_ = tf.where(iou_max > 0, iou_, tf.zeros_like(iou_))
true_conf = tf.where(iou_mask, iou_, true_conf)
# Stop gradient so while loop is not tracked.
obj_mask = tf.stop_gradient(obj_mask)
true_conf = tf.stop_gradient(true_conf)
return true_conf, obj_mask
def __call__(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""Call function to compute the loss and return the total loss as
well as the loss for each detection mask on a given FPN level.
Args:
true_counts: `Tensor` of shape [batchsize, height, width, num_anchors]
representing how many boxes are in a given pixel [j, i] in the
output map.
inds: `Tensor` of shape [batchsize, None, 3] indicating the location
[j, i] that a given box is associated with in the FPN prediction
map.
y_true: `Tensor` of shape [batchsize, None, 8] indicating the actual box
associated with each index in the inds tensor list.
boxes: `Tensor` of shape [batchsize, None, 4] indicating the original
ground truth boxes for each image as they came from the decoder used
for bounding box search.
classes: `Tensor` of shape [batchsize, None, 1] indicating the original
ground truth classes for each image as they came from the decoder used
for bounding box search.
y_pred: `Tensor` of shape [batchsize, height, width, output_depth]
holding the model's output at a specific FPN level.
Returns:
loss: `float` for the actual loss.
box_loss: `float` loss on the boxes used for metrics.
conf_loss: `float` loss on the confidence used for metrics.
class_loss: `float` loss on the classes used for metrics.
avg_iou: `float` metric for the average iou between predictions
and ground truth.
avg_obj: `float` metric for the average confidence of the model
for predictions.
"""
(loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf, ind_mask,
grid_mask) = self.call(true_counts, inds, y_true, boxes, classes, y_pred)
# Temporary metrics
box_loss = tf.stop_gradient(0.05 * box_loss / self._iou_normalizer)
# Metric computation is done here to save time and resources.
sigmoid_conf = tf.stop_gradient(tf.sigmoid(pred_conf))
iou = tf.stop_gradient(iou)
avg_iou = loss_utils.avgiou(
loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), iou))
avg_obj = loss_utils.avgiou(tf.squeeze(sigmoid_conf, axis=-1) * grid_mask)
return (loss, box_loss, conf_loss, class_loss, mean_loss,
tf.stop_gradient(avg_iou), tf.stop_gradient(avg_obj))
@abc.abstractmethod
def _build_per_path_attributes(self):
"""Additional initialization required specifically for each unique YOLO
loss version"""
...
@abc.abstractmethod
def call(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""The actual loss logic to apply to the raw model output for optimization."""
...
def post_path_aggregation(self, loss, ground_truths, predictions):
"""This method allows for post processing of a loss value after the loss
has been aggregateda across all the FPN levels."""
return loss
@abc.abstractmethod
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
"""This controls how the loss should be aggregated across replicas."""
...
@tf.custom_gradient
def grad_sigmoid(values):
# This is an identity operation that will
# allow us to add some steps to the back propagation.
def delta(dy):
# Darknet only propagates sigmoids for the boxes
# under some conditions, so we need this to selectively
# add the sigmoid to the chain rule
t = tf.math.sigmoid(values)
return dy * t * (1 - t)
return values, delta
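# Minimal usage sketch (illustrative only, not part of the library): the
# forward pass of grad_sigmoid is the identity, while the backward pass
# multiplies the incoming gradient by the sigmoid derivative. The helper
# name below is hypothetical.
def _example_grad_sigmoid():
  x = tf.constant([0.5, -1.0])
  with tf.GradientTape() as tape:
    tape.watch(x)
    y = grad_sigmoid(x)
  # y == x, but dy/dx == sigmoid(x) * (1 - sigmoid(x)) rather than 1.
  return y, tape.gradient(y, x)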
class DarknetLoss(YoloLossBase):
"""This class implements the full logic for the standard Yolo models
encompassing Yolov3, Yolov4, and Yolo-Tiny."""
def _build_per_path_attributes(self):
"""Paramterization of pair wise search and grid generators for box
decoding and dynamic ground truth association."""
self._anchor_generator = loss_utils.GridGenerator(
masks=self._masks,
anchors=self._anchors,
scale_anchors=self._path_stride)
if self._ignore_thresh > 0.0:
self._search_pairs = loss_utils.PairWiseSearch(
iou_type="iou", any=True, min_conf=0.25)
return
def call(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""Per FPN path loss computation logic."""
if self._box_type == "scaled":
# The darknet model propagates a sigmoid once in back propagation, so we
# replicate that behavior here.
y_pred = grad_sigmoid(y_pred)
# Generate and store constants and format output.
shape = tf.shape(true_counts)
batch_size, width, height, num = shape[0], shape[1], shape[2], shape[3]
fwidth = tf.cast(width, tf.float32)
fheight = tf.cast(height, tf.float32)
grid_points, anchor_grid = self._anchor_generator(
width, height, batch_size, dtype=tf.float32)
# Cast all input components to float32 and stop gradient to save memory.
boxes = tf.stop_gradient(tf.cast(boxes, tf.float32))
classes = tf.stop_gradient(tf.cast(classes, tf.float32))
y_true = tf.stop_gradient(tf.cast(y_true, tf.float32))
true_counts = tf.stop_gradient(tf.cast(true_counts, tf.float32))
true_conf = tf.stop_gradient(tf.clip_by_value(true_counts, 0.0, 1.0))
grid_points = tf.stop_gradient(grid_points)
anchor_grid = tf.stop_gradient(anchor_grid)
# Split all the ground truths to use as separate items in loss computation.
(true_box, ind_mask, true_class, _, _) = tf.split(
y_true, [4, 1, 1, 1, 1], axis=-1)
true_conf = tf.squeeze(true_conf, axis=-1)
true_class = tf.squeeze(true_class, axis=-1)
grid_mask = true_conf
# Splits all predictions.
y_pred = tf.cast(
tf.reshape(y_pred, [batch_size, width, height, num, -1]), tf.float32)
pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)
# Decode the boxes to be used for loss compute.
_, _, pred_box = self._decode_boxes(
fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=True)
# If the ignore threshold is enabled, search all boxes and ignore all
# IOU values larger than the ignore threshold that are not in the
# noted ground truth list.
if self._ignore_thresh != 0.0:
(true_conf, obj_mask) = self._tiled_global_box_search(
pred_box,
tf.stop_gradient(tf.sigmoid(pred_class)),
boxes,
classes,
true_conf,
smoothed=self._objectness_smooth > 0)
# Build the one-hot class list used for the class loss.
true_class = tf.one_hot(
tf.cast(true_class, tf.int32),
depth=tf.shape(pred_class)[-1],
dtype=pred_class.dtype)
true_classes = tf.stop_gradient(loss_utils.apply_mask(ind_mask, true_class))
# Reorganize the one hot class list as a grid.
true_class = loss_utils.build_grid(
inds, true_classes, pred_class, ind_mask, update=False)
true_class = tf.stop_gradient(true_class)
# Use the class mask to find the number of objects located in
# each predicted grid cell/pixel.
counts = true_class
counts = tf.reduce_sum(counts, axis=-1, keepdims=True)
reps = tf.gather_nd(counts, inds, batch_dims=1)
reps = tf.squeeze(reps, axis=-1)
reps = tf.stop_gradient(tf.where(reps == 0.0, tf.ones_like(reps), reps))
# Compute the loss for only the cells in which the boxes are located.
pred_box = loss_utils.apply_mask(ind_mask,
tf.gather_nd(pred_box, inds, batch_dims=1))
iou, _, box_loss = self.box_loss(true_box, pred_box, darknet=True)
box_loss = loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), box_loss)
box_loss = math_ops.divide_no_nan(box_loss, reps)
box_loss = tf.cast(tf.reduce_sum(box_loss, axis=1), dtype=y_pred.dtype)
# Compute the sigmoid binary cross entropy for the class maps.
class_loss = tf.reduce_mean(
loss_utils.sigmoid_BCE(
tf.expand_dims(true_class, axis=-1),
tf.expand_dims(pred_class, axis=-1), self._label_smoothing),
axis=-1)
# Apply normalization to the class losses.
if self._cls_normalizer < 1.0:
# Build a mask based on the true class locations.
cls_norm_mask = true_class
# Apply the class weight at indexes where the one-hot value is one.
class_loss *= ((1 - cls_norm_mask) + cls_norm_mask * self._cls_normalizer)
# Mask the class loss and compute the sum over all the objects.
class_loss = tf.reduce_sum(class_loss, axis=-1)
class_loss = loss_utils.apply_mask(grid_mask, class_loss)
class_loss = math_ops.rm_nan_inf(class_loss, val=0.0)
class_loss = tf.cast(
tf.reduce_sum(class_loss, axis=(1, 2, 3)), dtype=y_pred.dtype)
# Compute the sigmoid binary cross entropy for the confidence maps.
bce = tf.reduce_mean(
loss_utils.sigmoid_BCE(
tf.expand_dims(true_conf, axis=-1), pred_conf, 0.0),
axis=-1)
# Mask the confidence loss and take the sum across all the grid cells.
if self._ignore_thresh != 0.0:
bce = loss_utils.apply_mask(obj_mask, bce)
conf_loss = tf.cast(tf.reduce_sum(bce, axis=(1, 2, 3)), dtype=y_pred.dtype)
# Apply the weights to each loss.
box_loss *= self._iou_normalizer
conf_loss *= self._obj_normalizer
# Add all the losses together then take the mean over the batches.
loss = box_loss + class_loss + conf_loss
loss = tf.reduce_mean(loss)
# Reduce the mean of the losses to use as a metric.
box_loss = tf.reduce_mean(box_loss)
conf_loss = tf.reduce_mean(conf_loss)
class_loss = tf.reduce_mean(class_loss)
return (loss, box_loss, conf_loss, class_loss, loss, iou, pred_conf,
ind_mask, grid_mask)
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
"""this method is not specific to each loss path, but each loss type"""
return loss / num_replicas_in_sync
class ScaledLoss(YoloLossBase):
"""This class implements the full logic for the scaled Yolo models
encompassing Yolov4-csp, Yolov4-Large, and Yolov5."""
def _build_per_path_attributes(self):
"""Paramterization of pair wise search and grid generators for box
decoding and dynamic ground truth association."""
self._anchor_generator = loss_utils.GridGenerator(
masks=self._masks,
anchors=self._anchors,
scale_anchors=self._path_stride)
if self._ignore_thresh > 0.0:
self._search_pairs = loss_utils.PairWiseSearch(
iou_type=self._loss_type, any=False, min_conf=0.25)
return
def call(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""Per FPN path loss computation logic."""
# Generate shape constants.
shape = tf.shape(true_counts)
batch_size, width, height, num = shape[0], shape[1], shape[2], shape[3]
fwidth = tf.cast(width, tf.float32)
fheight = tf.cast(height, tf.float32)
# Cast all input components to float32 and stop gradient to save memory.
y_true = tf.cast(y_true, tf.float32)
true_counts = tf.cast(true_counts, tf.float32)
true_conf = tf.clip_by_value(true_counts, 0.0, 1.0)
grid_points, anchor_grid = self._anchor_generator(
width, height, batch_size, dtype=tf.float32)
# Split the y_true list.
(true_box, ind_mask, true_class, _, _) = tf.split(
y_true, [4, 1, 1, 1, 1], axis=-1)
grid_mask = true_conf = tf.squeeze(true_conf, axis=-1)
true_class = tf.squeeze(true_class, axis=-1)
num_objs = tf.cast(tf.reduce_sum(ind_mask), dtype=y_pred.dtype)
# Split up the predictions.
y_pred = tf.cast(
tf.reshape(y_pred, [batch_size, width, height, num, -1]), tf.float32)
pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)
# Decode the boxes for loss compute.
scale, pred_box, _ = self._decode_boxes(
fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=False)
# If the ignore threshold is enabled, search all boxes and ignore all
# IOU values larger than the ignore threshold that are not in the
# noted ground truth list.
if self._ignore_thresh != 0.0:
(_, obj_mask) = self._tiled_global_box_search(
pred_box,
tf.stop_gradient(tf.sigmoid(pred_class)),
boxes,
classes,
true_conf,
smoothed=False,
scale=scale)
# Scale, shift, and select the ground truth boxes
# and predictions into the prediction domain.
offset = tf.cast(
tf.gather_nd(grid_points, inds, batch_dims=1), true_box.dtype)
offset = tf.concat([offset, tf.zeros_like(offset)], axis=-1)
true_box = loss_utils.apply_mask(ind_mask, (scale * true_box) - offset)
pred_box = loss_utils.apply_mask(ind_mask,
tf.gather_nd(pred_box, inds, batch_dims=1))
# Select the correct/used prediction classes.
true_class = tf.one_hot(
tf.cast(true_class, tf.int32),
depth=tf.shape(pred_class)[-1],
dtype=pred_class.dtype)
true_class = loss_utils.apply_mask(ind_mask, true_class)
pred_class = loss_utils.apply_mask(
ind_mask, tf.gather_nd(pred_class, inds, batch_dims=1))
# Compute the box loss.
_, iou, box_loss = self.box_loss(true_box, pred_box, darknet=False)
box_loss = loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), box_loss)
box_loss = math_ops.divide_no_nan(tf.reduce_sum(box_loss), num_objs)
# Use the box IOU to build the map for confidence loss computation.
iou = tf.maximum(tf.stop_gradient(iou), 0.0)
smoothed_iou = ((
(1 - self._objectness_smooth) * tf.cast(ind_mask, iou.dtype)) +
self._objectness_smooth * tf.expand_dims(iou, axis=-1))
smoothed_iou = loss_utils.apply_mask(ind_mask, smoothed_iou)
true_conf = loss_utils.build_grid(
inds, smoothed_iou, pred_conf, ind_mask, update=self._update_on_repeat)
true_conf = tf.squeeze(true_conf, axis=-1)
# Compute the cross entropy loss for the confidence map.
bce = tf.keras.losses.binary_crossentropy(
tf.expand_dims(true_conf, axis=-1), pred_conf, from_logits=True)
if self._ignore_thresh != 0.0:
bce = loss_utils.apply_mask(obj_mask, bce)
conf_loss = tf.reduce_mean(bce)
# Compute the cross entropy loss for the class maps.
class_loss = tf.keras.losses.binary_crossentropy(
true_class,
pred_class,
label_smoothing=self._label_smoothing,
from_logits=True)
class_loss = loss_utils.apply_mask(
tf.squeeze(ind_mask, axis=-1), class_loss)
class_loss = math_ops.divide_no_nan(tf.reduce_sum(class_loss), num_objs)
# Apply the weights to each loss.
box_loss *= self._iou_normalizer
class_loss *= self._cls_normalizer
conf_loss *= self._obj_normalizer
# Add all the losses together then take the sum over the batches.
mean_loss = box_loss + class_loss + conf_loss
loss = mean_loss * tf.cast(batch_size, mean_loss.dtype)
return (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf,
ind_mask, grid_mask)
def post_path_aggregation(self, loss, ground_truths, predictions):
scale = tf.stop_gradient(3 / len(list(predictions.keys())))
return loss * scale
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
"""this method is not specific to each loss path, but each loss type"""
return loss
LOSSES = {"darknet": DarknetLoss, "scaled": ScaledLoss}
class YoloLoss(object):
"""This class implements the aggregated loss across paths for the YOLO
model. The class implements the YOLO loss as a factory in order to allow
selection and implementation of new versions of the YOLO loss as the model
is updated in the future.
"""
def __init__(self,
keys,
classes,
anchors,
masks=None,
path_strides=None,
truth_thresholds=None,
ignore_thresholds=None,
loss_types=None,
iou_normalizers=None,
cls_normalizers=None,
obj_normalizers=None,
objectness_smooths=None,
box_types=None,
scale_xys=None,
max_deltas=None,
label_smoothing=0.0,
use_scaled_loss=False,
update_on_repeat=True):
"""Loss Function Initialization.
Args:
keys: `List[str]` indicating the name of the FPN paths that need to be
optimized.
classes: `int` for the number of classes
anchors: `List[List[int]]` for the anchor boxes that are used in the model
at all levels. For anchor free prediction set the anchor list to be the
same as the image resolution.
masks: `Dict[str, List[int]]` for the anchor box indexes used at each
output level.
path_strides: `Dict[int]` for how much to scale this level to get the
original input shape for each FPN path.
truth_thresholds: `Dict[float]` for the IOU value over which the loss is
propagated despite a detection being made for each FPN path.
ignore_thresholds: `Dict[float]` for the IOU value over which the loss is
not propagated, and a detection is assumed to have been made for each
FPN path.
loss_types: `Dict[str]` for the type of iou loss to use, one of {ciou,
diou, giou, iou}, for each FPN path.
iou_normalizers: `Dict[float]` for how much to scale the loss on the IOU
or the boxes for each FPN path.
cls_normalizers: `Dict[float]` for how much to scale the loss on the
classes for each FPN path.
obj_normalizers: `Dict[float]` for how much to scale loss on the detection
map for each FPN path.
objectness_smooths: `Dict[float]` for how much to smooth the loss on the
detection map for each FPN path.
box_types: `Dict[str]` for which box decoding type to use for each FPN
path.
scale_xys: `Dict[float]` values indicating how far outside of its own
cell of size 1.0 each pixel can predict a box center. A value of 1.2
gives a 20% extended radius around each cell within which that pixel
can place a center, so the center can range from -(value - 1) / 2 to
1 + (value - 1) / 2. This value is set in the yolo filter and reused
here; there should be one value for each level from min_level to
max_level, one per FPN path.
max_deltas: `Dict[float]` for the gradient clipping to apply to the box
loss for each FPN path.
label_smoothing: `float` for how much to smooth the loss on the classes;
the same value is applied to every FPN path.
use_scaled_loss: `bool` for whether to use the scaled loss
or the traditional loss.
update_on_repeat: `bool` for whether to replace with the newest or
the best value when an index is consumed by multiple objects.
"""
if use_scaled_loss:
loss_type = "scaled"
else:
loss_type = "darknet"
self._loss_dict = {}
for key in keys:
self._loss_dict[key] = LOSSES[loss_type](
classes=classes,
anchors=anchors,
mask=masks[key],
truth_thresh=truth_thresholds[key],
ignore_thresh=ignore_thresholds[key],
loss_type=loss_types[key],
iou_normalizer=iou_normalizers[key],
cls_normalizer=cls_normalizers[key],
obj_normalizer=obj_normalizers[key],
box_type=box_types[key],
objectness_smooth=objectness_smooths[key],
max_delta=max_deltas[key],
path_stride=path_strides[key],
scale_x_y=scale_xys[key],
update_on_repeat=update_on_repeat,
label_smoothing=label_smoothing)
def __call__(self, ground_truth, predictions, use_reduced_logs=True):
metric_dict = defaultdict(dict)
metric_dict['net']['box'] = 0
metric_dict['net']['class'] = 0
metric_dict['net']['conf'] = 0
loss_val, metric_loss = 0, 0
num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync
for key in predictions.keys():
(_loss, _loss_box, _loss_conf, _loss_class, _mean_loss, _avg_iou,
_avg_obj) = self._loss_dict[key](ground_truth['true_conf'][key],
ground_truth['inds'][key],
ground_truth['upds'][key],
ground_truth['bbox'],
ground_truth['classes'],
predictions[key])
# After computing the loss, scale the loss as needed for aggregation
# across FPN levels.
_loss = self._loss_dict[key].post_path_aggregation(
_loss, ground_truth, predictions)
# After completing the scaling of the loss on each replica, handle
# scaling the loss for merging across replicas.
_loss = self._loss_dict[key].cross_replica_aggregation(
_loss, num_replicas_in_sync)
loss_val += _loss
# Detach all the below gradients: none of them should make a
# contribution to the gradient from this point forward.
metric_loss += tf.stop_gradient(_mean_loss)
metric_dict[key]['loss'] = tf.stop_gradient(_mean_loss)
metric_dict[key]['avg_iou'] = tf.stop_gradient(_avg_iou)
metric_dict[key]["avg_obj"] = tf.stop_gradient(_avg_obj)
if not use_reduced_logs:
metric_dict[key]['conf_loss'] = tf.stop_gradient(_loss_conf)
metric_dict[key]['box_loss'] = tf.stop_gradient(_loss_box)
metric_dict[key]['class_loss'] = tf.stop_gradient(_loss_class)
metric_dict['net']['box'] += tf.stop_gradient(_loss_box)
metric_dict['net']['class'] += tf.stop_gradient(_loss_class)
metric_dict['net']['conf'] += tf.stop_gradient(_loss_conf)
return loss_val, metric_loss, metric_dict
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for yolo heads."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.projects.yolo.losses import yolo_loss
class YoloLossTest(parameterized.TestCase, tf.test.TestCase):
def test_loss_init(self):
"""Tests initialization and a forward run of the YOLO loss."""
def inpdict(input_shape, dtype=tf.float32):
inputs = {}
for key in input_shape:
inputs[key] = tf.ones(input_shape[key], dtype=dtype)
return inputs
tf.keras.backend.set_image_data_format('channels_last')
input_shape = {
'3': [1, 52, 52, 255],
'4': [1, 26, 26, 255],
'5': [1, 13, 13, 255]
}
classes = 80
bps = 3
masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
[133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
[348.0, 340.0]]
box_type = {key:"scaled" for key in masks.keys()}
keys = ['3', '4', '5']
path_strides = {key: 2**int(key) for key in keys}
loss = yolo_loss.YoloLoss(
keys,
classes,
anchors,
masks=masks,
path_strides=path_strides,
truth_thresholds={key: 1.0 for key in keys},
ignore_thresholds={key: 0.7 for key in keys},
loss_types={key: "ciou" for key in keys},
iou_normalizers={key: 0.05 for key in keys},
cls_normalizers={key: 0.5 for key in keys},
obj_normalizers={key: 1.0 for key in keys},
objectness_smooths={key: 1.0 for key in keys},
box_types={key: "scaled" for key in keys},
scale_xys={key: 2.0 for key in keys},
max_deltas={key: 30.0 for key in keys},
label_smoothing=0.0,
use_scaled_loss=True,
update_on_repeat=True)
count = inpdict({
'3': [1, 52, 52, 3, 1],
'4': [1, 26, 26, 3, 1],
'5': [1, 13, 13, 3, 1]
})
ind = inpdict({
'3': [1, 300, 3],
'4': [1, 300, 3],
'5': [1, 300, 3]
}, tf.int32)
truths = inpdict({
'3': [1, 300, 8],
'4': [1, 300, 8],
'5': [1, 300, 8]
})
boxes = tf.ones([1, 300, 4], dtype=tf.float32)
classes = tf.ones([1, 300], dtype=tf.float32)
gt = {
"true_conf": count,
"inds": ind,
"upds": truths,
"bbox": boxes,
"classes": classes
}
loss_val, metric_loss, metric_dict = loss(gt, inpdict(input_shape))
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Yolo loss utility functions."""
import tensorflow as tf
import numpy as np
from official.vision.beta.projects.yolo.ops import (box_ops, math_ops)
@tf.custom_gradient
def sigmoid_BCE(y, x_prime, label_smoothing):
"""Applies the Sigmoid Cross Entropy Loss Using the same derivative as that
found in the Darknet C library. The derivative of this method is not the same
as the standard binary cross entropy with logits function.
The BCE with logits function equation is as follows:
x = 1 / (1 + exp(-x_prime))
bce = -y * log(x) - (1 - y) * log(1 - x)
The standard BCE with logits function derivative is as follows:
dloss = -y/x + (1-y)/(1-x)
dsigmoid = x * (1 - x)
dx = dloss * dsigmoid
This derivative can be reduced simply to:
dx = (-y + x)
This simplification is used by the darknet library in order to improve
training stability. The gradient is almost the same
as tf.keras.losses.binary_crossentropy but varies slightly and
yields different performance.
Args:
y: `Tensor` holding ground truth data.
x_prime: `Tensor` holding the predictions prior to application of the
sigmoid operation.
label_smoothing: float value between 0.0 and 1.0 indicating the amount of
smoothing to apply to the data.
Returns:
bce: `Tensor` of the computed loss values.
delta: callable function indicating the custom gradient for this operation.
"""
eps = 1e-9
x = tf.math.sigmoid(x_prime)
y = tf.stop_gradient(y * (1 - label_smoothing) + 0.5 * label_smoothing)
bce = -y * tf.math.log(x + eps) - (1 - y) * tf.math.log(1 - x + eps)
def delta(dpass):
x = tf.math.sigmoid(x_prime)
dx = (-y + x) * dpass
dy = tf.zeros_like(y)
return dy, dx, 0.0
return bce, delta
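# Sanity-check sketch (illustrative, not part of the library): the custom
# gradient above should reduce to sigmoid(x_prime) - y, the simplified
# Darknet derivative described in the docstring. The helper name below is
# hypothetical.
def _example_sigmoid_bce_gradient():
  y = tf.constant([[1.0, 0.0]])
  x_prime = tf.constant([[0.3, -0.2]])
  with tf.GradientTape() as tape:
    tape.watch(x_prime)
    bce = sigmoid_BCE(y, x_prime, tf.constant(0.0))
  dx = tape.gradient(bce, x_prime)
  expected = tf.math.sigmoid(x_prime) - y  # dx == expected
  return dx, expected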
def apply_mask(mask, x, value=0):
"""This function is used for gradient masking. The YOLO loss function makes
extensive use of dynamically shaped tensors. To allow this use case on the
TPU while preserving the gradient correctly for back propagation we use this
masking function to use a tf.where operation to hard set masked location to
have a gradient and a value of zero.
Args:
mask: A `Tensor` with the same shape as x used to select values of
importance.
x: A `Tensor` with the same shape as mask that will be getting masked.
Returns:
x: A masked `Tensor` with the same shape as x.
"""
mask = tf.cast(mask, tf.bool)
masked = tf.where(mask, x, tf.zeros_like(x) + value)
return masked
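# Minimal example (illustrative, hypothetical helper name): masked locations
# are hard set to zero, so neither their value nor their gradient contributes
# to the loss, while the tensor shape stays static for the TPU.
def _example_apply_mask():
  mask = tf.constant([1.0, 0.0, 1.0])
  x = tf.constant([3.0, 5.0, 7.0])
  return apply_mask(mask, x)  # -> [3.0, 0.0, 7.0]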
def build_grid(indexes, truths, preds, ind_mask, update=False, grid=None):
"""This function is used to broadcast all the indexes to the correct
ground truth mask, used for iou detection map in the scaled loss and
the classification mask in the darknet loss.
Args:
indexes: A `Tensor` for the indexes
truths: A `Tensor` for the ground truth.
preds: A `Tensor` for the predictions.
ind_mask: A `Tensor` for the index masks.
update: A `bool` for updating the grid.
grid: A `Tensor` for the grid.
Returns:
grid: A `Tensor` representing the augmented grid.
"""
# this function is used to broadcast all the indexes to the correct
# into the correct ground truth mask, used for iou detection map
# in the scaled loss and the classification mask in the darknet loss
num_flatten = tf.shape(preds)[-1]
# is there a way to verify that we are not on the CPU?
ind_mask = tf.cast(ind_mask, indexes.dtype)
# find all the batch indexes using the cumulative sum of a ones tensor
# cumsum(ones) - 1 yields the zero indexed batches
bhep = tf.reduce_max(tf.ones_like(indexes), axis=-1, keepdims=True)
bhep = tf.math.cumsum(bhep, axis=0) - 1
# concatenate the batch indexes to the box indexes
indexes = tf.concat([bhep, indexes], axis=-1)
indexes = apply_mask(tf.cast(ind_mask, indexes.dtype), indexes)
indexes = (indexes + (ind_mask - 1))
# reshape the indexes into the correct shape for the loss,
# just flatten all indexes but the last
indexes = tf.reshape(indexes, [-1, 4])
# also flatten the ground truth value on all axis but the last
truths = tf.reshape(truths, [-1, num_flatten])
# build a zero grid in the same shape as the predictions
if grid is None:
grid = tf.zeros_like(preds)
# remove invalid values from the truths that may have
# come up from computation, invalid = nan and inf
truths = math_ops.rm_nan_inf(truths)
# scatter update the zero grid
if update:
grid = tf.tensor_scatter_nd_update(grid, indexes, truths)
else:
grid = tf.tensor_scatter_nd_max(grid, indexes, truths)
# stop gradient and return to avoid TPU errors and save compute
# resources
return grid
class GridGenerator(object):
"""Grid generator that generates anchor grids that will be used
in to decode the predicted boxes."""
def __init__(self, anchors, masks=None, scale_anchors=None):
"""Initialize Grid Generator
Args:
anchors: A `List[List[int]]` for the anchor boxes that are used in the
model at all levels.
masks: A `List[int]` for the anchor box indexes used at this output
level.
scale_anchors: An `int` for how much to scale this level to get the
original input shape.
"""
self.dtype = tf.keras.backend.floatx()
if masks is not None:
self._num = len(masks)
else:
self._num = tf.shape(anchors)[0]
if masks is not None:
anchors = [anchors[mask] for mask in masks]
self._scale_anchors = scale_anchors
self._anchors = tf.convert_to_tensor(anchors)
return
def _build_grid_points(self, lwidth, lheight, anchors, dtype):
"""Generate a grid that is used to detemine the relative centers
of the bounding boxs. """
with tf.name_scope('center_grid'):
y = tf.range(0, lheight)
x = tf.range(0, lwidth)
num = tf.shape(anchors)[0]
x_left = tf.tile(
tf.transpose(tf.expand_dims(y, axis=-1), perm=[1, 0]), [lwidth, 1])
y_left = tf.tile(tf.expand_dims(x, axis=-1), [1, lheight])
x_y = tf.stack([x_left, y_left], axis=-1)
x_y = tf.cast(x_y, dtype=dtype)
x_y = tf.expand_dims(
tf.tile(tf.expand_dims(x_y, axis=-2), [1, 1, num, 1]), axis=0)
return x_y
def _build_anchor_grid(self, anchors, dtype):
"""Get the transformed anchor boxes for each dimention. """
with tf.name_scope('anchor_grid'):
num = tf.shape(anchors)[0]
anchors = tf.cast(anchors, dtype=dtype)
anchors = tf.reshape(anchors, [1, 1, 1, num, 2])
return anchors
def _extend_batch(self, grid, batch_size):
return tf.tile(grid, [batch_size, 1, 1, 1, 1])
def __call__(self, width, height, batch_size, dtype=None):
if dtype is None:
self.dtype = tf.keras.backend.floatx()
else:
self.dtype = dtype
grid_points = self._build_grid_points(width, height, self._anchors,
self.dtype)
anchor_grid = self._build_anchor_grid(
tf.cast(self._anchors, self.dtype) /
tf.cast(self._scale_anchors, self.dtype), self.dtype)
grid_points = self._extend_batch(grid_points, batch_size)
anchor_grid = self._extend_batch(anchor_grid, batch_size)
return grid_points, anchor_grid
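# Usage sketch (illustrative anchor values, hypothetical helper name): build
# the decoding grids for a 13x13 prediction map with three anchors at a
# stride of 32.
def _example_grid_generator():
  gen = GridGenerator(
      anchors=[[116, 90], [156, 198], [373, 326]],
      masks=[0, 1, 2],
      scale_anchors=32)
  grid_points, anchor_grid = gen(13, 13, 1, dtype=tf.float32)
  # grid_points: [1, 13, 13, 3, 2] relative cell offsets.
  # anchor_grid: [1, 1, 1, 3, 2] anchor sizes divided by the stride.
  return grid_points, anchor_grid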
TILE_SIZE = 50
class PairWiseSearch(object):
"""This method applies a pairwise search between the ground truth
and the labels. The goal is to indicate the locations where the
predictions overlap with ground truth for dynamic ground
truth constructions."""
def __init__(self,
iou_type='iou',
any=True,
min_conf=0.0,
track_boxes=False,
track_classes=False):
"""Initialization of Pair Wise Search.
Args:
iou_type: A `str` for the iou type to use.
any: A `bool` for whether to match any class (no class matching).
min_conf: A `float` for the minimum confidence threshold.
track_boxes: A `bool` for dynamic box assignment.
track_classes: A `bool` for dynamic class assignment.
"""
self.iou_type = iou_type
self._any = any
self._min_conf = min_conf
self._track_boxes = track_boxes
self._track_classes = track_classes
return
def box_iou(self, true_box, pred_box):
# based on the type of loss, compute the iou loss for a box
# compute_<name> indicates the type of iou to use
if self.iou_type == 'giou':
_, iou = box_ops.compute_giou(true_box, pred_box)
elif self.iou_type == 'ciou':
_, iou = box_ops.compute_ciou(true_box, pred_box)
else:
iou = box_ops.compute_iou(true_box, pred_box)
return iou
def _search_body(self, pred_box, pred_class, boxes, classes, running_boxes,
running_classes, max_iou, idx):
# capture the batch size to be used, and gather a slice of
# boxes from the ground truth. currently TILE_SIZE = 50, to
# save memory
batch_size = tf.shape(boxes)[0]
box_slice = tf.slice(boxes, [0, idx * TILE_SIZE, 0],
[batch_size, TILE_SIZE, 4])
# match the dimensions of the slice to the model predictions
# shape: [batch_size, 1, 1, num, TILE_SIZE, 4]
box_slice = tf.expand_dims(box_slice, axis=1)
box_slice = tf.expand_dims(box_slice, axis=1)
box_slice = tf.expand_dims(box_slice, axis=1)
box_grid = tf.expand_dims(pred_box, axis=-2)
# capture the classes
class_slice = tf.slice(classes, [0, idx * TILE_SIZE],
[batch_size, TILE_SIZE])
class_slice = tf.expand_dims(class_slice, axis=1)
class_slice = tf.expand_dims(class_slice, axis=1)
class_slice = tf.expand_dims(class_slice, axis=1)
iou = self.box_iou(box_slice, box_grid)
if self._min_conf > 0.0:
if not self._any:
class_grid = tf.expand_dims(pred_class, axis=-2)
class_mask = tf.one_hot(
tf.cast(class_slice, tf.int32),
depth=tf.shape(pred_class)[-1],
dtype=pred_class.dtype)
class_mask = tf.reduce_any(tf.equal(class_mask, class_grid), axis=-1)
else:
class_mask = tf.reduce_max(pred_class, axis=-1, keepdims=True)
class_mask = tf.cast(class_mask, iou.dtype)
iou *= class_mask
max_iou_ = tf.concat([max_iou, iou], axis=-1)
max_iou = tf.reduce_max(max_iou_, axis=-1, keepdims=True)
ind = tf.expand_dims(tf.argmax(max_iou_, axis=-1), axis=-1)
if self._track_boxes:
running_boxes = tf.expand_dims(running_boxes, axis=-2)
box_slice = tf.zeros_like(running_boxes) + box_slice
box_slice = tf.concat([running_boxes, box_slice], axis=-2)
running_boxes = tf.gather_nd(box_slice, ind, batch_dims=4)
if self._track_classes:
running_classes = tf.expand_dims(running_classes, axis=-1)
class_slice = tf.zeros_like(running_classes) + class_slice
class_slice = tf.concat([running_classes, class_slice], axis=-1)
running_classes = tf.gather_nd(class_slice, ind, batch_dims=4)
return (pred_box, pred_class, boxes, classes, running_boxes,
running_classes, max_iou, idx + 1)
def __call__(self,
pred_boxes,
pred_classes,
boxes,
classes,
scale=None,
yxyx=True,
clip_thresh=0.0):
num_boxes = tf.shape(boxes)[-2]
num_tiles = (num_boxes // TILE_SIZE) - 1
if yxyx:
boxes = box_ops.yxyx_to_xcycwh(boxes)
if scale is not None:
boxes = boxes * tf.stop_gradient(scale)
if self._min_conf > 0.0:
pred_classes = tf.cast(pred_classes > self._min_conf, pred_classes.dtype)
def _loop_cond(pred_box, pred_class, boxes, classes, running_boxes,
running_classes, max_iou, idx):
# check that the remaining slice contains boxes that are not all zeros
batch_size = tf.shape(boxes)[0]
box_slice = tf.slice(boxes, [0, idx * TILE_SIZE, 0],
[batch_size, TILE_SIZE, 4])
return tf.logical_and(idx < num_tiles,
tf.math.greater(tf.reduce_sum(box_slice), 0))
running_boxes = tf.zeros_like(pred_boxes)
running_classes = tf.zeros_like(tf.reduce_sum(running_boxes, axis=-1))
max_iou = tf.zeros_like(tf.reduce_sum(running_boxes, axis=-1))
max_iou = tf.expand_dims(max_iou, axis=-1)
(pred_boxes, pred_classes, boxes, classes, running_boxes, running_classes,
max_iou, idx) = tf.while_loop(_loop_cond, self._search_body, [
pred_boxes, pred_classes, boxes, classes, running_boxes,
running_classes, max_iou,
tf.constant(0)
])
mask = tf.cast(max_iou > clip_thresh, running_boxes.dtype)
running_boxes *= mask
running_classes *= tf.squeeze(mask, axis=-1)
max_iou *= mask
max_iou = tf.squeeze(max_iou, axis=-1)
mask = tf.squeeze(mask, axis=-1)
return (tf.stop_gradient(running_boxes), tf.stop_gradient(running_classes),
tf.stop_gradient(max_iou), tf.stop_gradient(mask))
def avgiou(iou):
"""Computes the average intersection over union without counting locations
where the iou is zero.
Args:
iou: A `Tensor` representing the iou values.
Returns:
tf.stop_gradient(avg_iou): A `Tensor` representing average
intersection over union.
"""
iou_sum = tf.reduce_sum(iou, axis=tf.range(1, tf.shape(tf.shape(iou))[0]))
counts = tf.cast(
tf.math.count_nonzero(iou, axis=tf.range(1,
tf.shape(tf.shape(iou))[0])),
iou.dtype)
avg_iou = tf.reduce_mean(math_ops.divide_no_nan(iou_sum, counts))
return tf.stop_gradient(avg_iou)
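# Worked example (illustrative, hypothetical helper name): zero entries are
# excluded from the count, so [[0.5, 0.0, 0.7]] averages to
# (0.5 + 0.7) / 2 = 0.6 rather than 1.2 / 3 = 0.4.
def _example_avgiou():
  return avgiou(tf.constant([[0.5, 0.0, 0.7]]))  # -> 0.6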
def _scale_boxes(encoded_boxes, width, height, anchor_grid, grid_points,
scale_xy):
# split the boxes
pred_xy = encoded_boxes[..., 0:2]
pred_wh = encoded_boxes[..., 2:4]
# build a scaling tensor to get the offset of the box relative to the image
scaler = tf.convert_to_tensor([height, width, height, width])
scale_xy = tf.cast(scale_xy, encoded_boxes.dtype)
# apply the sigmoid
pred_xy = tf.math.sigmoid(pred_xy)
# scale the centers and find the offset of each box relative to
# their center pixel
pred_xy = pred_xy * scale_xy - 0.5 * (scale_xy - 1)
# scale the offsets and add them to the grid points or a tensor that is
# the relative location of each pixel
box_xy = grid_points + pred_xy
# scale the width and height of the predictions and correlate them
# to anchor boxes
box_wh = tf.math.exp(pred_wh) * anchor_grid
# build the final predicted box
scaled_box = tf.concat([box_xy, box_wh], axis=-1)
pred_box = scaled_box / scaler
# shift scaled boxes
scaled_box = tf.concat([pred_xy, box_wh], axis=-1)
return (scaler, scaled_box, pred_box)
@tf.custom_gradient
def _darknet_boxes(encoded_boxes, width, height, anchor_grid, grid_points,
max_delta, scale_xy):
(scaler, scaled_box, pred_box) = _scale_boxes(encoded_boxes, width, height,
anchor_grid, grid_points,
scale_xy)
def delta(dy_scaler, dy_scaled, dy):
dy_xy, dy_wh = tf.split(dy, 2, axis=-1)
dy_xy_, dy_wh_ = tf.split(dy_scaled, 2, axis=-1)
# add all the gradients that may have been applied to the
# boxes and those that have been applied to the width and height
dy_wh += dy_wh_
dy_xy += dy_xy_
# propagate the exponential applied to the width and height in
# order to ensure the gradient propagated is of the correct
# magnitude
pred_wh = encoded_boxes[..., 2:4]
dy_wh *= tf.math.exp(pred_wh)
dbox = tf.concat([dy_xy, dy_wh], axis=-1)
# apply the gradient clipping to xy and wh
dbox = math_ops.rm_nan_inf(dbox)
delta = tf.cast(max_delta, dbox.dtype)
dbox = tf.clip_by_value(dbox, -delta, delta)
return dbox, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
return (scaler, scaled_box, pred_box), delta
def _new_coord_scale_boxes(encoded_boxes, width, height, anchor_grid,
grid_points, scale_xy):
# split the boxes
pred_xy = encoded_boxes[..., 0:2]
pred_wh = encoded_boxes[..., 2:4]
# build a scaling tensor to get the offset of the box relative to the image
scaler = tf.convert_to_tensor([height, width, height, width])
scale_xy = tf.cast(scale_xy, pred_xy.dtype)
# apply the sigmoid
pred_xy = tf.math.sigmoid(pred_xy)
pred_wh = tf.math.sigmoid(pred_wh)
# scale the xy offset predictions according to the config
pred_xy = pred_xy * scale_xy - 0.5 * (scale_xy - 1)
# find the true offset from the grid points and the scaler
# where the grid points are the relative offset of each pixel
# within the image
box_xy = grid_points + pred_xy
# decode the width and height of the boxes and correlate them
# to the anchor boxes
box_wh = (2 * pred_wh)**2 * anchor_grid
# build the final boxes
scaled_box = tf.concat([box_xy, box_wh], axis=-1)
pred_box = scaled_box / scaler
# shift scaled boxes
scaled_box = tf.concat([pred_xy, box_wh], axis=-1)
return (scaler, scaled_box, pred_box)
@tf.custom_gradient
def _darknet_new_coord_boxes(encoded_boxes, width, height, anchor_grid,
grid_points, max_delta, scale_xy):
(scaler, scaled_box,
pred_box) = _new_coord_scale_boxes(encoded_boxes, width, height, anchor_grid,
grid_points, scale_xy)
def delta(dy_scaler, dy_scaled, dy):
dy_xy, dy_wh = tf.split(dy, 2, axis=-1)
dy_xy_, dy_wh_ = tf.split(dy_scaled, 2, axis=-1)
# add all the gradients that may have been applied to the
# boxes and those that have been applied to the width and height
dy_wh += dy_wh_
dy_xy += dy_xy_
dbox = tf.concat([dy_xy, dy_wh], axis=-1)
# apply the gradient clipping to xy and wh
dbox = math_ops.rm_nan_inf(dbox)
delta = tf.cast(max_delta, dbox.dtype)
dbox = tf.clip_by_value(dbox, -delta, delta)
return dbox, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
return (scaler, scaled_box, pred_box), delta
def _anchor_free_scale_boxes(encoded_boxes, width, height, stride, grid_points,
scale_xy):
# split the boxes
pred_xy = encoded_boxes[..., 0:2]
pred_wh = encoded_boxes[..., 2:4]
# build a scaling tensor to get the offset of the box relative to the image
scaler = tf.convert_to_tensor([height, width, height, width])
scale_xy = tf.cast(scale_xy, encoded_boxes.dtype)
# scale the centers and find the offset of each box relative to
# their center pixel
pred_xy = pred_xy * scale_xy - 0.5 * (scale_xy - 1)
# scale the offsets and add them to the grid points or a tensor that is
# the relative location of each pixel
box_xy = (grid_points + pred_xy) * stride
# scale the width and height of the predictions and correlate them
# to anchor boxes
box_wh = tf.math.exp(pred_wh) * stride
# build the final predicted box
scaled_box = tf.concat([box_xy, box_wh], axis=-1)
pred_box = scaled_box / scaler
return (scaler, scaled_box, pred_box)
def get_predicted_box(width,
height,
encoded_boxes,
anchor_grid,
grid_points,
scale_xy,
stride,
darknet=False,
box_type="original",
max_delta=np.inf):
"""Decodes the predicted boxes from the model format to a usable
[x, y, w, h] format for use in the loss function as well as for use
within the detection generator.
Args:
width: A `float` scalar indicating the width of the prediction layer.
height: A `float` scalar indicating the height of the prediction layer.
encoded_boxes: A `Tensor` of shape [..., height, width, 4] holding encoded
boxes.
anchor_grid: A `Tensor` of shape [..., 1, 1, 2] holding the anchor boxes
organized for box decoding, box width and height.
grid_points: A `Tensor` of shape [..., height, width, 2] holding the anchor
boxes for decoding the box centers.
scale_xy: A `float` scalar used to indicate the range for each center
outside of its given [..., i, j, 4] index, where i and j are indexing
pixels along the width and height of the predicted output map.
stride: An `int` defining the amount of down stride relative to the input
image.
darknet: A `bool` used to select between custom gradient and default
autograd.
box_type: A `str` indicating the type of box encoding that is being used.
max_delta: A `float` scalar used for gradient clipping in back propagation.
Returns:
scaler: A `Tensor` of shape [4] returned to allow the scaling of the ground
truth boxes to be of the same magnitude as the decoded predicted boxes.
scaled_box: A `Tensor` of shape [..., height, width, 4] with the predicted
boxes.
pred_box: A `Tensor` of shape [..., height, width, 4] with the predicted
boxes divided by the scaler parameter used to put all boxes in the [0, 1]
range.
"""
if box_type == 'anchor_free':
(scaler, scaled_box,
pred_box) = _anchor_free_scale_boxes(encoded_boxes, width, height, stride,
grid_points, scale_xy)
elif darknet:
# if we are using the darknet loss we should not propagate the
# decoding of the box
if box_type == 'scaled':
(scaler, scaled_box,
pred_box) = _darknet_new_coord_boxes(encoded_boxes, width, height,
anchor_grid, grid_points, max_delta,
scale_xy)
else:
(scaler, scaled_box,
pred_box) = _darknet_boxes(encoded_boxes, width, height, anchor_grid,
grid_points, max_delta, scale_xy)
else:
# if we are using the scaled loss we should propagate the decoding of
# the boxes
if box_type == 'scaled':
(scaler, scaled_box,
pred_box) = _new_coord_scale_boxes(encoded_boxes, width, height,
anchor_grid, grid_points, scale_xy)
else:
(scaler, scaled_box, pred_box) = _scale_boxes(encoded_boxes, width,
height, anchor_grid,
grid_points, scale_xy)
return (scaler, scaled_box, pred_box)
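# End-to-end decoding sketch (illustrative shapes and values, hypothetical
# helper name): decode a raw prediction map using the grids produced by
# GridGenerator above, with the default "original" box encoding.
def _example_get_predicted_box():
  gen = GridGenerator(
      anchors=[[116, 90], [156, 198], [373, 326]],
      masks=[0, 1, 2],
      scale_anchors=32)
  grid_points, anchor_grid = gen(13, 13, 1, dtype=tf.float32)
  encoded_boxes = tf.zeros([1, 13, 13, 3, 4], dtype=tf.float32)
  scaler, scaled_box, pred_box = get_predicted_box(
      13.0, 13.0, encoded_boxes, anchor_grid, grid_points,
      scale_xy=1.0, stride=32, darknet=False, box_type="original")
  # pred_box holds [x, y, w, h] boxes normalized to the [0, 1] image range.
  return scaler, scaled_box, pred_box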