Commit c6d7d57d authored by A. Unique TensorFlower

Merge pull request #10251 from PurdueDualityLab:loss_fn_pr

PiperOrigin-RevId: 396512110
parents 31fb7a65 7f90664e
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
This diff is collapsed.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for yolo heads."""
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.projects.yolo.losses import yolo_loss
class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(True),
(False),
)
def test_loss_init(self, scaled):
"""Test creation of YOLO family models."""
def inpdict(input_shape, dtype=tf.float32):
inputs = {}
for key in input_shape:
inputs[key] = tf.ones(input_shape[key], dtype=dtype)
return inputs
tf.keras.backend.set_image_data_format('channels_last')
input_shape = {
'3': [1, 52, 52, 255],
'4': [1, 26, 26, 255],
'5': [1, 13, 13, 255]
}
classes = 80
masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
[133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
[348.0, 340.0]]
keys = ['3', '4', '5']
path_strides = {key: 2**int(key) for key in keys}
loss = yolo_loss.YoloLoss(
keys,
classes,
anchors,
masks=masks,
path_strides=path_strides,
truth_thresholds={key: 1.0 for key in keys},
ignore_thresholds={key: 0.7 for key in keys},
loss_types={key: 'ciou' for key in keys},
iou_normalizers={key: 0.05 for key in keys},
cls_normalizers={key: 0.5 for key in keys},
obj_normalizers={key: 1.0 for key in keys},
objectness_smooths={key: 1.0 for key in keys},
box_types={key: 'scaled' for key in keys},
scale_xys={key: 2.0 for key in keys},
max_deltas={key: 30.0 for key in keys},
label_smoothing=0.0,
use_scaled_loss=scaled,
update_on_repeat=True)
count = inpdict({
'3': [1, 52, 52, 3, 1],
'4': [1, 26, 26, 3, 1],
'5': [1, 13, 13, 3, 1]
})
ind = inpdict({
'3': [1, 300, 3],
'4': [1, 300, 3],
'5': [1, 300, 3]
}, tf.int32)
truths = inpdict({'3': [1, 300, 8], '4': [1, 300, 8], '5': [1, 300, 8]})
boxes = tf.ones([1, 300, 4], dtype=tf.float32)
classes = tf.ones([1, 300], dtype=tf.float32)
gt = {
'true_conf': count,
'inds': ind,
'upds': truths,
'bbox': boxes,
'classes': classes
}
_, _, _ = loss(gt, inpdict(input_shape))
if __name__ == '__main__':
tf.test.main()
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Contains definitions of Darknet Backbone Networks.
 The models are inspired by ResNet and CSPNet.
@@ -390,7 +389,7 @@ class Darknet(tf.keras.Model):
                norm_momentum=0.99,
                norm_epsilon=0.001,
                dilate=False,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                kernel_regularizer=None,
                bias_regularizer=None,
                **kwargs):
@@ -507,10 +506,12 @@ class Darknet(tf.keras.Model):
       self._default_dict['name'] = f'{name}_csp_down'
       if self._dilate:
         self._default_dict['dilation_rate'] = config.dilation_rate
+        degrid = int(tf.math.log(float(config.dilation_rate)) / tf.math.log(2.))
       else:
         self._default_dict['dilation_rate'] = 1
+        degrid = 0
 
       # swap/add dilation
       x, x_route = nn_blocks.CSPRoute(
           filters=config.filters,
           filter_scale=csp_filter_scale,
@@ -518,7 +519,7 @@ class Darknet(tf.keras.Model):
           **self._default_dict)(
               inputs)
 
-      dilated_reps = config.repetitions - self._default_dict['dilation_rate'] // 2
+      dilated_reps = config.repetitions - degrid
       for i in range(dilated_reps):
         self._default_dict['name'] = f'{name}_{i}'
         x = nn_blocks.DarkResidual(
@@ -528,8 +529,8 @@ class Darknet(tf.keras.Model):
                 x)
 
       for i in range(dilated_reps, config.repetitions):
-        self._default_dict[
-            'dilation_rate'] = self._default_dict['dilation_rate'] // 2
+        self._default_dict['dilation_rate'] = max(
+            1, self._default_dict['dilation_rate'] // 2)
         self._default_dict[
             'name'] = f"{name}_{i}_degridded_{self._default_dict['dilation_rate']}"
         x = nn_blocks.DarkResidual(
@@ -592,8 +593,8 @@ class Darknet(tf.keras.Model):
           filters=config.filters, downsample=True, **self._default_dict)(
              inputs)
 
-      dilated_reps = config.repetitions - (
-          self._default_dict['dilation_rate'] // 2) - 1
+      dilated_reps = config.repetitions - self._default_dict[
+          'dilation_rate'] // 2 - 1
       for i in range(dilated_reps):
         self._default_dict['name'] = f'{name}_{i}'
         x = nn_blocks.DarkResidual(
@@ -661,12 +662,13 @@ class Darknet(tf.keras.Model):
 @factory.register_backbone_builder('darknet')
 def build_darknet(
     input_specs: tf.keras.layers.InputSpec,
-    backbone_config: hyperparams.Config,
+    backbone_cfg: hyperparams.Config,
     norm_activation_config: hyperparams.Config,
     l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
   """Builds darknet."""
-  backbone_cfg = backbone_config.get()
+  backbone_cfg = backbone_cfg.get()
   model = Darknet(
       model_id=backbone_cfg.model_id,
       min_level=backbone_cfg.min_level,
......
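A note on the degridding change in the hunk above: the old code shrank the dilated portion of each stack by `dilation_rate // 2` blocks, which grows linearly with the dilation rate, while the new code computes `degrid = log2(dilation_rate)` so one block is spent per halving of the rate on the way back down to 1. A minimal sketch of the two counts (the helper names below are illustrative, not from the commit):

import math

def dilated_reps_old(repetitions, dilation_rate):
  # old behaviour: linear in the dilation rate
  return repetitions - dilation_rate // 2

def dilated_reps_new(repetitions, dilation_rate):
  # new behaviour: one degridding block per power of two
  degrid = int(math.log2(dilation_rate))
  return repetitions - degrid

# For repetitions=8 and dilation_rate=8 the old code kept 8 - 4 = 4 dilated
# blocks; the new code keeps 8 - 3 = 5 and degrids over the remaining 3,
# halving the rate 8 -> 4 -> 2 -> 1 (with max(1, ...) guarding the floor).
print(dilated_reps_old(8, 8), dilated_reps_new(8, 8))  # 4 5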
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Feature Pyramid Network and Path Aggregation variants used in YOLO."""
 import tensorflow as tf
@@ -39,7 +38,7 @@ class YoloFPN(tf.keras.layers.Layer):
                use_sync_bn=False,
                norm_momentum=0.99,
                norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                kernel_regularizer=None,
                bias_regularizer=None,
                **kwargs):
@@ -184,7 +183,7 @@ class YoloPAN(tf.keras.layers.Layer):
                use_sync_bn=False,
                norm_momentum=0.99,
                norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                kernel_regularizer=None,
                bias_regularizer=None,
                fpn_input=True,
@@ -206,7 +205,7 @@ class YoloPAN(tf.keras.layers.Layer):
         by zero.
       kernel_initializer: kernel_initializer for convolutional layers.
       kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
-      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
       fpn_input: `bool`, for whether the input into this function is an FPN or
         a backbone.
       fpn_filter_scale: `int`, scaling factor for the FPN filters.
@@ -374,7 +373,7 @@ class YoloDecoder(tf.keras.Model):
                use_sync_bn=False,
                norm_momentum=0.99,
                norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                kernel_regularizer=None,
                bias_regularizer=None,
                **kwargs):
@@ -389,8 +388,8 @@ class YoloDecoder(tf.keras.Model):
       use_fpn: `bool`, use the FPN found in the YoloV4 model.
       use_spatial_attention: `bool`, use the spatial attention module.
       csp_stack: `bool`, CSPize the FPN.
-      fpn_depth: `int`, number of layers to use in each FPN path
-        if you choose to use an FPN.
+      fpn_depth: `int`, number of layers to use in each FPN path if you choose
+        to use an FPN.
       fpn_filter_scale: `int`, scaling factor for the FPN filters.
       path_process_len: `int`, number of layers to use in each Decoder path.
       max_level_process_len: `int`, number of layers to use in the largest
......
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Yolo heads."""
 import tensorflow as tf
@@ -30,10 +29,11 @@ class YoloHead(tf.keras.layers.Layer):
                output_extras=0,
                norm_momentum=0.99,
                norm_epsilon=0.001,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                kernel_regularizer=None,
                bias_regularizer=None,
                activation=None,
+               smart_bias=False,
                **kwargs):
     """Yolo Prediction Head initialization function.
@@ -52,6 +52,7 @@ class YoloHead(tf.keras.layers.Layer):
       kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
       bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
       activation: `str`, the activation function to use, typically leaky or mish.
+      smart_bias: `bool`, whether or not to use smart bias.
       **kwargs: keyword arguments to be passed.
     """
@@ -68,6 +69,7 @@ class YoloHead(tf.keras.layers.Layer):
     self._output_extras = output_extras
     self._output_conv = (classes + output_extras + 5) * boxes_per_level
+    self._smart_bias = smart_bias
 
     self._base_config = dict(
         activation=activation,
@@ -85,10 +87,29 @@ class YoloHead(tf.keras.layers.Layer):
         use_bn=False,
         **self._base_config)
 
+  def bias_init(self, scale, inshape, isize=640, no_per_conf=8):
+
+    def bias(shape, dtype):
+      init = tf.keras.initializers.Zeros()
+      base = init(shape, dtype=dtype)
+      if self._smart_bias:
+        base = tf.reshape(base, [self._boxes_per_level, -1])
+        box, conf, classes = tf.split(base, [4, 1, -1], axis=-1)
+        conf += tf.math.log(no_per_conf / ((isize / scale)**2))
+        classes += tf.math.log(0.6 / (self._classes - 0.99))
+        base = tf.concat([box, conf, classes], axis=-1)
+        base = tf.reshape(base, [-1])
+      return base
+
+    return bias
+
   def build(self, input_shape):
     self._head = dict()
     for key in self._key_list:
-      self._head[key] = nn_blocks.ConvBN(**self._conv_config)
+      scale = 2**int(key)
+      self._head[key] = nn_blocks.ConvBN(
+          bias_initializer=self.bias_init(scale, input_shape[key][-1]),
+          **self._conv_config)
 
   def call(self, inputs):
     outputs = dict()
@@ -107,6 +128,10 @@ class YoloHead(tf.keras.layers.Layer):
           'Model has to be built before number of boxes can be determined.')
     return (self._max_level - self._min_level + 1) * self._boxes_per_level
 
+  @property
+  def num_heads(self):
+    return self._max_level - self._min_level + 1
+
   def get_config(self):
     config = dict(
         min_level=self._min_level,
......
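The new `bias_init` above appears to follow the prior-based "smart bias" initialization popularized by the focal-loss paper and the YOLOv5 training recipe: the objectness bias is set so the initial detection probability roughly matches the expected number of objects per grid cell at that level, and the class bias so each class starts near a small, roughly uniform prior. A rough numeric sketch of the priors it encodes, using the function's defaults (isize=640, no_per_conf=8) and an 80-class head at stride 8; the helper below is illustrative, not part of the commit:

import math

def smart_bias_priors(scale, classes, isize=640, no_per_conf=8):
  # Objectness: log of (expected positives per level / grid cells at this
  # level), so sigmoid(bias) starts near no_per_conf / (isize / scale)**2.
  obj_bias = math.log(no_per_conf / ((isize / scale) ** 2))
  # Classes: sigmoid(bias) starts near 0.6 / (classes - 0.99) per class.
  cls_bias = math.log(0.6 / (classes - 0.99))
  return obj_bias, cls_bias

obj_b, cls_b = smart_bias_priors(scale=8, classes=80)
sigmoid = lambda x: 1.0 / (1.0 + math.exp(-x))
print(sigmoid(obj_b))  # ~0.00125: about 8 positives over an 80x80 grid
print(sigmoid(cls_b))  # ~0.0075: a weak, roughly uniform class prior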
@@ -15,7 +15,10 @@
 """Contains common building blocks for yolo layer (detection layer)."""
 import tensorflow as tf
+from official.vision.beta.modeling.layers import detection_generator
+from official.vision.beta.projects.yolo.losses import yolo_loss
 from official.vision.beta.projects.yolo.ops import box_ops
+from official.vision.beta.projects.yolo.ops import loss_utils
 
 @tf.keras.utils.register_keras_serializable(package='yolo')
@@ -36,11 +39,11 @@ class YoloLayer(tf.keras.Model):
                cls_normalizer=1.0,
                obj_normalizer=1.0,
                use_scaled_loss=False,
-               darknet=None,
+               update_on_repeat=False,
                pre_nms_points=5000,
                label_smoothing=0.0,
                max_boxes=200,
-               new_cords=False,
+               box_type='original',
                path_scale=None,
                scale_xy=None,
                nms_type='greedy',
@@ -70,14 +73,25 @@ class YoloLayer(tf.keras.Model):
       obj_normalizer: `float` for how much to scale loss on the detection map.
       use_scaled_loss: `bool` for whether to use the scaled loss
         or the traditional loss.
-      darknet: `bool` for whether to use the DarkNet or PyTorch loss function
-        implementation.
+      update_on_repeat: `bool` indicating how you would like to handle repeated
+        indexes in a given [j, i] index. Setting this to True will give more
+        consistent mAP; setting it to False will improve recall by 1-2% but
+        will sacrifice some mAP.
       pre_nms_points: `int` number of top candidate detections per class before
         NMS.
       label_smoothing: `float` for how much to smooth the loss on the classes.
       max_boxes: `int` for the maximum number of boxes retained over all
         classes.
-      new_cords: `bool` for using the ScaledYOLOv4 coordinates.
+      box_type: `str`, there are 3 different box types that will affect
+        training differently {original, scaled and anchor_free}. The original
+        method decodes the boxes by applying an exponential to the model width
+        and height maps, then scaling the maps by the anchor boxes. This
+        method is used in Yolo-v4, Yolo-v3, and all their counterparts. The
+        scaled method squares the width and height and scales both by a fixed
+        factor of 4. This method is used in the Scaled Yolo models, as well as
+        Yolov4-CSP. Finally, anchor_free is like the original method but will
+        not apply an activation function to the boxes; this is used for some
+        of the newer anchor free versions of YOLO.
       path_scale: `dict` for the size of the input tensors. Defaults to
         precalculated values from the `mask`.
       scale_xy: dictionary of `float` values indicating how far each pixel can see
@@ -91,18 +105,6 @@ class YoloLayer(tf.keras.Model):
       objectness_smooth: `float` for how much to smooth the loss on the
         detection map.
       **kwargs: Additional keyword arguments.
-
-    Return:
-      loss: `float` for the actual loss.
-      box_loss: `float` loss on the boxes used for metrics.
-      conf_loss: `float` loss on the confidence used for metrics.
-      class_loss: `float` loss on the classes used for metrics.
-      avg_iou: `float` metric for the average iou between predictions
-        and ground truth.
-      avg_obj: `float` metric for the average confidence of the model
-        for predictions.
-      recall50: `float` metric for how accurate the model is.
-      precision50: `float` metric for how precise the model is.
     """
     super().__init__(**kwargs)
     self._masks = masks
@@ -121,29 +123,18 @@ class YoloLayer(tf.keras.Model):
     self._loss_type = loss_type
     self._use_scaled_loss = use_scaled_loss
-    self._darknet = darknet
+    self._update_on_repeat = update_on_repeat
 
     self._pre_nms_points = pre_nms_points
     self._label_smoothing = label_smoothing
     self._keys = list(masks.keys())
     self._len_keys = len(self._keys)
-    self._new_cords = new_cords
+    self._box_type = box_type
     self._path_scale = path_scale or {
         key: 2**int(key) for key, _ in masks.items()
     }
-    self._nms_types = {
-        'greedy': 1,
-        'iou': 2,
-        'giou': 3,
-        'ciou': 4,
-        'diou': 5,
-        'class_independent': 6,
-        'weighted_diou': 7
-    }
-    self._nms_type = self._nms_types[nms_type]
+    self._nms_type = nms_type
 
     self._scale_xy = scale_xy or {key: 1.0 for key, _ in masks.items()}
     self._generator = {}
@@ -156,27 +147,33 @@ class YoloLayer(tf.keras.Model):
     return
 
   def get_generators(self, anchors, path_scale, path_key):
-    return None
-
-  def rm_nan_inf(self, x, val=0.0):
-    x = tf.where(tf.math.is_nan(x), tf.cast(val, dtype=x.dtype), x)
-    x = tf.where(tf.math.is_inf(x), tf.cast(val, dtype=x.dtype), x)
-    return x
+    anchor_generator = loss_utils.GridGenerator(
+        anchors, scale_anchors=path_scale)
+    return anchor_generator
 
   def parse_prediction_path(self, key, inputs):
+    shape_ = tf.shape(inputs)
     shape = inputs.get_shape().as_list()
-    height, width = shape[1], shape[2]
+    batchsize, height, width = shape_[0], shape[1], shape[2]
+
+    if height is None or width is None:
+      height, width = shape_[1], shape_[2]
 
+    generator = self._generator[key]
     len_mask = self._len_mask[key]
+    scale_xy = self._scale_xy[key]
 
     # reshape the yolo output to (batchsize,
     #                             width,
     #                             height,
     #                             number_anchors,
     #                             remaining_points)
     data = tf.reshape(inputs, [-1, height, width, len_mask, self._classes + 5])
 
+    # use the grid generator to get the formatted anchor boxes and grid points
+    # in shape [1, height, width, 2]
+    centers, anchors = generator(height, width, batchsize, dtype=data.dtype)
+
     # split the yolo detections into boxes, object score map, classes
     boxes, obns_scores, class_scores = tf.split(
         data, [4, 1, self._classes], axis=-1)
@@ -184,25 +181,32 @@ class YoloLayer(tf.keras.Model):
     # determine the number of classes
     classes = class_scores.get_shape().as_list()[-1]
 
-    # configurable to use the new coordinates in scaled Yolo v4 or not
+    _, _, boxes = loss_utils.get_predicted_box(
+        tf.cast(height, data.dtype),
+        tf.cast(width, data.dtype),
+        boxes,
+        anchors,
+        centers,
+        scale_xy,
+        stride=self._path_scale[key],
+        darknet=False,
+        box_type=self._box_type[key])
 
     # convert boxes from yolo(x, y, w, h) to tensorflow(ymin, xmin, ymax, xmax)
     boxes = box_ops.xcycwh_to_yxyx(boxes)
 
     # activate the detection map
     obns_scores = tf.math.sigmoid(obns_scores)
-
-    # threshold the detection map
-    obns_mask = tf.cast(obns_scores > self._thresh, obns_scores.dtype)
 
     # convert detection map to class detection probabilities
-    class_scores = tf.math.sigmoid(class_scores) * obns_mask * obns_scores
+    class_scores = tf.math.sigmoid(class_scores) * obns_scores
+    class_scores *= tf.cast(class_scores > self._thresh, class_scores.dtype)
 
-    fill = height * width * len_mask
     # flatten predictions to [batchsize, N, -1] for non max suppression
+    fill = height * width * len_mask
     boxes = tf.reshape(boxes, [-1, fill, 4])
     class_scores = tf.reshape(class_scores, [-1, fill, classes])
     obns_scores = tf.reshape(obns_scores, [-1, fill])
 
     return obns_scores, boxes, class_scores
 
   def call(self, inputs):
@@ -224,26 +228,49 @@ class YoloLayer(tf.keras.Model):
     # collate all predictions
     boxes = tf.concat(boxes, axis=1)
-    object_scores = tf.keras.backend.concatenate(object_scores, axis=1)
-    class_scores = tf.keras.backend.concatenate(class_scores, axis=1)
-
-    # greedy NMS
-    boxes = tf.cast(boxes, dtype=tf.float32)
-    class_scores = tf.cast(class_scores, dtype=tf.float32)
-    nms_items = tf.image.combined_non_max_suppression(
-        tf.expand_dims(boxes, axis=-2),
-        class_scores,
-        self._pre_nms_points,
-        self._max_boxes,
-        iou_threshold=self._nms_thresh,
-        score_threshold=self._thresh)
-
-    # cast the boxes and predictions back to original datatype
-    boxes = tf.cast(nms_items.nmsed_boxes, object_scores.dtype)
-    class_scores = tf.cast(nms_items.nmsed_classes, object_scores.dtype)
-    object_scores = tf.cast(nms_items.nmsed_scores, object_scores.dtype)
-
-    # compute the number of valid detections
-    num_detections = tf.math.reduce_sum(tf.math.ceil(object_scores), axis=-1)
+    object_scores = tf.concat(object_scores, axis=1)
+    class_scores = tf.concat(class_scores, axis=1)
+
+    # get masks to threshold all the predictions
+    object_mask = tf.cast(object_scores > self._thresh, object_scores.dtype)
+    class_mask = tf.cast(class_scores > self._thresh, class_scores.dtype)
+
+    # apply the threshold masks to all the predictions
+    object_scores *= object_mask
+    class_scores *= (tf.expand_dims(object_mask, axis=-1) * class_mask)
+
+    # apply nms
+    if self._nms_type == 'greedy':
+      # greedy NMS
+      boxes = tf.cast(boxes, dtype=tf.float32)
+      class_scores = tf.cast(class_scores, dtype=tf.float32)
+      boxes, object_scores_, class_scores, num_detections = (
+          tf.image.combined_non_max_suppression(
+              tf.expand_dims(boxes, axis=-2),
+              class_scores,
+              self._pre_nms_points,
+              self._max_boxes,
+              iou_threshold=self._nms_thresh,
+              score_threshold=self._thresh))
+
+      # cast the boxes and predictions back to original datatype
+      boxes = tf.cast(boxes, object_scores.dtype)
+      class_scores = tf.cast(class_scores, object_scores.dtype)
+      object_scores = tf.cast(object_scores_, object_scores.dtype)
+    else:
+      # TPU NMS
+      boxes = tf.cast(boxes, dtype=tf.float32)
+      class_scores = tf.cast(class_scores, dtype=tf.float32)
+      (boxes, confidence, classes,
+       num_detections) = detection_generator._generate_detections_v2(  # pylint:disable=protected-access
+           tf.expand_dims(boxes, axis=-2),
+           class_scores,
+           pre_nms_top_k=self._pre_nms_points,
+           max_num_detections=self._max_boxes,
+           nms_iou_threshold=self._nms_thresh,
+           pre_nms_score_threshold=self._thresh)
+      boxes = tf.cast(boxes, object_scores.dtype)
+      class_scores = tf.cast(classes, object_scores.dtype)
+      object_scores = tf.cast(confidence, object_scores.dtype)
 
     # format and return
     return {
@@ -258,9 +285,28 @@ class YoloLayer(tf.keras.Model):
     """Generates a dictionary of losses to apply to each path.
 
     Done in the detection generator because all parameters are the same
    across both loss and detection generator.
     """
-    return None
+    loss = yolo_loss.YoloLoss(
+        keys=self._keys,
+        classes=self._classes,
+        anchors=self._anchors,
+        masks=self._masks,
+        path_strides=self._path_scale,
+        truth_thresholds=self._truth_thresh,
+        ignore_thresholds=self._ignore_thresh,
+        loss_types=self._loss_type,
+        iou_normalizers=self._iou_normalizer,
+        cls_normalizers=self._cls_normalizer,
+        obj_normalizers=self._obj_normalizer,
+        objectness_smooths=self._objectness_smooth,
+        box_types=self._box_type,
+        max_deltas=self._max_delta,
+        scale_xys=self._scale_xy,
+        use_scaled_loss=self._use_scaled_loss,
+        update_on_repeat=self._update_on_repeat,
+        label_smoothing=self._label_smoothing)
+    return loss
 
   def get_config(self):
     return {
......
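For reference, the three `box_type` decodings described in the docstring above roughly correspond to the following width/height math (a hedged sketch only; the actual decoding lives in `loss_utils.get_predicted_box`, whose diff is not shown on this page):

import tensorflow as tf

def decode_wh(raw_wh, anchors, box_type):
  # 'original' (Yolo-v3/v4): exponential of the raw map, scaled by anchors.
  if box_type == 'original':
    return tf.math.exp(raw_wh) * anchors
  # 'scaled' (Scaled-Yolo / Yolov4-CSP): squared sigmoid, capped at a fixed
  # factor of 4 times the anchor.
  elif box_type == 'scaled':
    return (2.0 * tf.math.sigmoid(raw_wh)) ** 2 * anchors
  # 'anchor_free': like 'original' but with no activation on the raw map.
  elif box_type == 'anchor_free':
    return raw_wh * anchors
  raise ValueError(f'unknown box_type: {box_type}')

raw = tf.zeros([1, 2])
anchor = tf.constant([[12.0, 19.0]])
print(decode_wh(raw, anchor, 'original'))  # exp(0) * anchor = the anchor
print(decode_wh(raw, anchor, 'scaled'))    # (2 * 0.5)^2 * anchor = the anchor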
@@ -39,7 +39,10 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
     anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
                [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
                [348.0, 340.0]]
-    layer = dg.YoloLayer(masks, anchors, classes, max_boxes=10)
+    box_type = {key: 'scaled' for key in masks.keys()}
+    layer = dg.YoloLayer(
+        masks, anchors, classes, box_type=box_type, max_boxes=10)
 
     inputs = {}
     for key in input_shape:
......
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Contains common building blocks for yolo neural networks."""
-from typing import Callable, List
 import tensorflow as tf
 from official.modeling import tf_utils
 from official.vision.beta.ops import spatial_transform_ops
@@ -48,7 +46,7 @@ class ConvBN(tf.keras.layers.Layer):
                strides=(1, 1),
                padding='same',
                dilation_rate=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -97,7 +95,14 @@ class ConvBN(tf.keras.layers.Layer):
     self._strides = strides
     self._padding = padding
     self._dilation_rate = dilation_rate
-    self._kernel_initializer = kernel_initializer
+
+    if kernel_initializer == 'VarianceScaling':
+      # to match pytorch initialization method
+      self._kernel_initializer = tf.keras.initializers.VarianceScaling(
+          scale=1 / 3, mode='fan_in', distribution='uniform')
+    else:
+      self._kernel_initializer = kernel_initializer
+
     self._bias_initializer = bias_initializer
     self._kernel_regularizer = kernel_regularizer
@@ -194,7 +199,7 @@ class DarkResidual(tf.keras.layers.Layer):
                filters=1,
                filter_scale=2,
                dilation_rate=1,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                kernel_regularizer=None,
                bias_regularizer=None,
@@ -366,7 +371,7 @@ class CSPTiny(tf.keras.layers.Layer):
   def __init__(self,
                filters=1,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
               bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -532,7 +537,7 @@ class CSPRoute(tf.keras.layers.Layer):
                filters,
                filter_scale=2,
                activation='mish',
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -661,7 +666,7 @@ class CSPConnect(tf.keras.layers.Layer):
                drop_first=False,
                activation='mish',
                kernel_size=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -761,122 +766,6 @@ class CSPConnect(tf.keras.layers.Layer):
     return x
 
-class CSPStack(tf.keras.layers.Layer):
-  """CSP Stack layer.
-
-  CSP full stack, combines the route and the connect in case you don't want to
-  just quickly wrap an existing callable or list of layers to
-  make it a cross stage partial. Added for ease of use. You should be able
-  to wrap any layer stack with a CSP independent of whether it belongs
-  to the Darknet family. If filter_scale = 2, then the blocks in the stack
-  passed into the CSP stack should also have filters = filters/filter_scale.
-
-  Cross Stage Partial networks (CSPNets) were proposed in:
-  [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu,
-      Ping-Yang Chen, Jun-Wei Hsieh
-      CSPNet: A New Backbone that can Enhance Learning Capability of CNN.
-      arXiv:1911.11929
-  """
-
-  def __init__(self,
-               filters,
-               model_to_wrap=None,
-               filter_scale=2,
-               activation='mish',
-               kernel_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               bias_regularizer=None,
-               kernel_regularizer=None,
-               downsample=True,
-               use_bn=True,
-               use_sync_bn=False,
-               norm_momentum=0.99,
-               norm_epsilon=0.001,
-               **kwargs):
-    """CSPStack layer initializer.
-
-    Args:
-      filters: integer for output depth, or the number of features to learn.
-      model_to_wrap: callable Model or a list of callable objects that will
-        process the output of CSPRoute, and be input into CSPConnect.
-        The list will be called sequentially.
-      filter_scale: integer dictating (filters//2) or the number of filters in
-        the partial feature stack.
-      activation: string for activation function to use in layer.
-      kernel_initializer: string to indicate which function to use to
-        initialize weights.
-      bias_initializer: string to indicate which function to use to initialize
-        bias.
-      bias_regularizer: string to indicate which function to use to regularize
-        bias.
-      kernel_regularizer: string to indicate which function to use to
-        regularize weights.
-      downsample: down_sample the input.
-      use_bn: boolean for whether to use batch normalization.
-      use_sync_bn: boolean for whether to sync batch normalization statistics
-        of all batch norm layers to the model's global statistics
-        (across all input batches).
-      norm_momentum: float for moment to use for batch normalization.
-      norm_epsilon: float for batch normalization epsilon.
-      **kwargs: Keyword Arguments.
-
-    Raises:
-      TypeError: model_to_wrap is not a layer or a list of layers
-    """
-    super().__init__(**kwargs)
-    # layer params
-    self._filters = filters
-    self._filter_scale = filter_scale
-    self._activation = activation
-    self._downsample = downsample
-
-    # convolution params
-    self._kernel_initializer = kernel_initializer
-    self._bias_initializer = bias_initializer
-    self._kernel_regularizer = kernel_regularizer
-    self._bias_regularizer = bias_regularizer
-    self._use_bn = use_bn
-    self._use_sync_bn = use_sync_bn
-    self._norm_momentum = norm_momentum
-    self._norm_epsilon = norm_epsilon
-
-    if model_to_wrap is None:
-      self._model_to_wrap = []
-    elif isinstance(model_to_wrap, Callable):
-      self._model_to_wrap = [model_to_wrap]
-    elif isinstance(model_to_wrap, List):
-      self._model_to_wrap = model_to_wrap
-    else:
-      raise TypeError(
-          'the input to the CSPStack must be a list of layers that we can' +
-          'iterate through, or \n a callable')
-
-  def build(self, input_shape):
-    dark_conv_args = {
-        'filters': self._filters,
-        'filter_scale': self._filter_scale,
-        'activation': self._activation,
-        'kernel_initializer': self._kernel_initializer,
-        'bias_initializer': self._bias_initializer,
-        'bias_regularizer': self._bias_regularizer,
-        'use_bn': self._use_bn,
-        'use_sync_bn': self._use_sync_bn,
-        'norm_momentum': self._norm_momentum,
-        'norm_epsilon': self._norm_epsilon,
-        'kernel_regularizer': self._kernel_regularizer,
-    }
-    self._route = CSPRoute(downsample=self._downsample, **dark_conv_args)
-    self._connect = CSPConnect(**dark_conv_args)
-
-  def call(self, inputs, training=None):
-    x, x_route = self._route(inputs)
-    for layer in self._model_to_wrap:
-      x = layer(x)
-    x = self._connect([x, x_route])
-    return x
-
-
 @tf.keras.utils.register_keras_serializable(package='yolo')
 class PathAggregationBlock(tf.keras.layers.Layer):
   """Path Aggregation block."""
@@ -884,7 +773,7 @@ class PathAggregationBlock(tf.keras.layers.Layer):
   def __init__(self,
                filters=1,
                drop_final=True,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -1120,7 +1009,7 @@ class SAM(tf.keras.layers.Layer):
                strides=(1, 1),
                padding='same',
                dilation_rate=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -1192,7 +1081,7 @@ class CAM(tf.keras.layers.Layer):
   def __init__(self,
                reduction_ratio=1.0,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -1285,7 +1174,7 @@ class CBAM(tf.keras.layers.Layer):
                strides=(1, 1),
                padding='same',
                dilation_rate=(1, 1),
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
@@ -1354,8 +1243,7 @@ class DarkRouteProcess(tf.keras.layers.Layer):
                           insert_spp=False)(x)
   """
 
-  def __init__(
-      self,
+  def __init__(self,
                filters=2,
                repetitions=2,
                insert_spp=False,
@@ -1363,7 +1251,7 @@ class DarkRouteProcess(tf.keras.layers.Layer):
                insert_cbam=False,
                csp_stack=0,
                csp_scale=2,
-               kernel_initializer='glorot_uniform',
+               kernel_initializer='VarianceScaling',
                bias_initializer='zeros',
                bias_regularizer=None,
                kernel_regularizer=None,
......
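A quick check on the `VarianceScaling(scale=1/3, mode='fan_in', distribution='uniform')` default that the hunk above introduces in `ConvBN`: Keras samples from U(-limit, limit) with limit = sqrt(3 * scale / fan_in), so with scale = 1/3 the bound reduces to sqrt(1 / fan_in), which is the same bound PyTorch's default kaiming_uniform_(a=sqrt(5)) yields for Conv2d kernels. A minimal sketch verifying the bound (kernel shape chosen for illustration):

import math
import tensorflow as tf

# A 3x3 conv kernel over 64 input channels, 128 filters; fan_in = 3*3*64.
init = tf.keras.initializers.VarianceScaling(
    scale=1 / 3, mode='fan_in', distribution='uniform')
weights = init(shape=(3, 3, 64, 128))

fan_in = 3 * 3 * 64
limit = math.sqrt(3 * (1 / 3) / fan_in)  # = sqrt(1 / fan_in) = 1/24 here
assert float(tf.reduce_max(tf.abs(weights))) <= limit
print(limit)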
@@ -106,86 +106,6 @@ class CSPRouteTest(tf.test.TestCase, parameterized.TestCase):
     self.assertNotIn(None, grad)
 
-class CSPStackTest(tf.test.TestCase, parameterized.TestCase):
-
-  def build_layer(self, layer_type, filters, filter_scale, count, stack_type,
-                  downsample):
-    if stack_type is not None:
-      layers = []
-      if layer_type == 'residual':
-        for _ in range(count):
-          layers.append(
-              nn_blocks.DarkResidual(
-                  filters=filters // filter_scale, filter_scale=filter_scale))
-      else:
-        for _ in range(count):
-          layers.append(nn_blocks.ConvBN(filters=filters))
-
-      if stack_type == 'model':
-        layers = tf.keras.Sequential(layers=layers)
-    else:
-      layers = None
-
-    stack = nn_blocks.CSPStack(
-        filters=filters,
-        filter_scale=filter_scale,
-        downsample=downsample,
-        model_to_wrap=layers)
-    return stack
-
-  @parameterized.named_parameters(
-      ('no_stack', 224, 224, 64, 2, 'residual', None, 0, True),
-      ('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True),
-      ('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False),
-      ('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False))
-  def test_pass_through(self, width, height, filters, mod, layer_type,
-                        stack_type, count, downsample):
-    x = tf.keras.Input(shape=(width, height, filters))
-    test_layer = self.build_layer(layer_type, filters, mod, count, stack_type,
-                                  downsample)
-    outx = test_layer(x)
-    print(outx)
-    print(outx.shape.as_list())
-    if downsample:
-      self.assertAllEqual(outx.shape.as_list(),
-                          [None, width // 2, height // 2, filters])
-    else:
-      self.assertAllEqual(outx.shape.as_list(), [None, width, height, filters])
-
-  @parameterized.named_parameters(
-      ('no_stack', 224, 224, 64, 2, 'residual', None, 0, True),
-      ('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True),
-      ('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False),
-      ('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False))
-  def test_gradient_pass_though(self, width, height, filters, mod, layer_type,
-                                stack_type, count, downsample):
-    loss = tf.keras.losses.MeanSquaredError()
-    optimizer = tf.keras.optimizers.SGD()
-    init = tf.random_normal_initializer()
-    x = tf.Variable(
-        initial_value=init(shape=(1, width, height, filters), dtype=tf.float32))
-    if not downsample:
-      y = tf.Variable(
-          initial_value=init(
-              shape=(1, width, height, filters), dtype=tf.float32))
-    else:
-      y = tf.Variable(
-          initial_value=init(
-              shape=(1, width // 2, height // 2, filters), dtype=tf.float32))
-
-    test_layer = self.build_layer(layer_type, filters, mod, count, stack_type,
-                                  downsample)
-
-    with tf.GradientTape() as tape:
-      x_hat = test_layer(x)
-      grad_loss = loss(x_hat, y)
-    grad = tape.gradient(grad_loss, test_layer.trainable_variables)
-    optimizer.apply_gradients(zip(grad, test_layer.trainable_variables))
-
-    self.assertNotIn(None, grad)
-
-
 class ConvBNTest(tf.test.TestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(
......
@@ -17,7 +17,7 @@
 import tensorflow as tf
 
-# Static base Yolo Models that do not require configuration
+# static base Yolo Models that do not require configuration
 # similar to a backbone model id.
 # this is done to greatly simplify the model config
@@ -85,26 +85,27 @@ class Yolo(tf.keras.Model):
     """Detection initialization function.
 
     Args:
-      backbone: `tf.keras.Model`, a backbone network.
-      decoder: `tf.keras.Model`, a decoder network.
-      head: `YoloHead`, the YOLO head.
-      detection_generator: `tf.keras.Model`, the detection generator.
+      backbone: `tf.keras.Model` a backbone network.
+      decoder: `tf.keras.Model` a decoder network.
+      head: `RetinaNetHead`, the RetinaNet head.
+      detection_generator: the detection generator.
       **kwargs: keyword arguments to be passed.
     """
-    super().__init__(**kwargs)
+    super(Yolo, self).__init__(**kwargs)
     self._config_dict = {
         "backbone": backbone,
         "decoder": decoder,
         "head": head,
-        "detection_generator": detection_generator
+        "filter": detection_generator
     }
 
     # model components
     self._backbone = backbone
     self._decoder = decoder
     self._head = head
-    self._detection_generator = detection_generator
+    self._filter = detection_generator
+    return
 
   def call(self, inputs, training=False):
     maps = self._backbone(inputs)
@@ -114,7 +115,7 @@ class Yolo(tf.keras.Model):
       return {"raw_output": raw_predictions}
     else:
       # Post-processing.
-      predictions = self._detection_generator(raw_predictions)
+      predictions = self._filter(raw_predictions)
       predictions.update({"raw_output": raw_predictions})
       return predictions
@@ -131,8 +132,8 @@ class Yolo(tf.keras.Model):
     return self._head
 
   @property
-  def detection_generator(self):
-    return self._detection_generator
+  def filter(self):
+    return self._filter
 
   def get_config(self):
     return self._config_dict
@@ -140,3 +141,29 @@ class Yolo(tf.keras.Model):
   @classmethod
   def from_config(cls, config):
     return cls(**config)
+
+  def get_weight_groups(self, train_vars):
+    """Sorts the list of trainable variables into groups for optimization.
+
+    Args:
+      train_vars: a list of tf.Variables that need to get sorted into their
+        respective groups.
+
+    Returns:
+      weights: a list of tf.Variables for the weights.
+      bias: a list of tf.Variables for the bias.
+      other: a list of tf.Variables for the other operations.
+    """
+    bias = []
+    weights = []
+    other = []
+    for var in train_vars:
+      if "bias" in var.name:
+        bias.append(var)
+      elif "beta" in var.name:
+        bias.append(var)
+      elif "kernel" in var.name or "weight" in var.name:
+        weights.append(var)
+      else:
+        other.append(var)
+    return weights, bias, other
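The new `get_weight_groups` splits trainable variables by name so an optimizer can treat them differently; a common Darknet/PyTorch convention is to apply weight decay only to convolution kernels, not to biases or batch-norm parameters. A self-contained sketch of the same grouping logic under that assumption (the stand-in variables and the 5e-4 decay rate are illustrative, not from the commit):

import tensorflow as tf

# Hypothetical stand-in variables; any model's trainable_variables would do.
# The names mirror what Conv2D / BatchNormalization layers produce.
train_vars = [
    tf.Variable(tf.ones([3, 3, 3, 8]), name='conv/kernel'),
    tf.Variable(tf.zeros([8]), name='conv/bias'),
    tf.Variable(tf.ones([8]), name='bn/gamma'),
    tf.Variable(tf.zeros([8]), name='bn/beta'),
]

weights, bias, other = [], [], []
for var in train_vars:
  if 'bias' in var.name or 'beta' in var.name:
    bias.append(var)      # no weight decay
  elif 'kernel' in var.name or 'weight' in var.name:
    weights.append(var)   # weight decay applies here
  else:
    other.append(var)     # e.g. batch-norm gamma

# Apply L2 decay only to the kernel group, Darknet-style.
l2 = 5e-4 * tf.add_n([tf.nn.l2_loss(w) for w in weights])
print(len(weights), len(bias), len(other), float(l2))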
@@ -38,51 +38,26 @@ def yxyx_to_xcycwh(box: tf.Tensor):
   return box
 
-@tf.custom_gradient
-def _xcycwh_to_yxyx(box: tf.Tensor, scale):
-  """Private function to allow custom gradients with defaults."""
-  with tf.name_scope('xcycwh_to_yxyx'):
-    xy, wh = tf.split(box, 2, axis=-1)
-    xy_min = xy - wh / 2
-    xy_max = xy + wh / 2
-    x_min, y_min = tf.split(xy_min, 2, axis=-1)
-    x_max, y_max = tf.split(xy_max, 2, axis=-1)
-    box = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
-
-  def delta(dbox):
-    # y_min = top, x_min = left, y_max = bottom, x_max = right
-    dt, dl, db, dr = tf.split(dbox, 4, axis=-1)
-    dx = dl + dr
-    dy = dt + db
-    dw = (dr - dl) / scale
-    dh = (db - dt) / scale
-    dbox = tf.concat([dx, dy, dw, dh], axis=-1)
-    return dbox, 0.0
-
-  return box, delta
-
-
-def xcycwh_to_yxyx(box: tf.Tensor, darknet=False):
+def xcycwh_to_yxyx(box: tf.Tensor):
   """Converts boxes from x_center, y_center, width, height to yxyx format.
 
   Args:
     box: any `Tensor` whose last dimension is 4 representing the coordinates of
       boxes in x_center, y_center, width, height.
-    darknet: `bool`, if True a scale of 1.0 is used.
 
   Returns:
     box: a `Tensor` whose shape is the same as `box` in new format.
   """
-  if darknet:
-    scale = 1.0
-  else:
-    scale = 2.0
-  box = _xcycwh_to_yxyx(box, scale)
+  with tf.name_scope('xcycwh_to_yxyx'):
+    xy, wh = tf.split(box, 2, axis=-1)
+    xy_min = xy - wh / 2
+    xy_max = xy + wh / 2
+    x_min, y_min = tf.split(xy_min, 2, axis=-1)
+    x_max, y_max = tf.split(xy_max, 2, axis=-1)
+    box = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
   return box
 
-# IOU
 def intersect_and_union(box1, box2, yxyx=False):
   """Calculates the intersection and union between box1 and box2.
@@ -98,8 +73,9 @@ def intersect_and_union(box1, box2, yxyx=False):
     intersection: a `Tensor` which represents the intersection.
     union: a `Tensor` which represents the union.
   """
   if not yxyx:
+    box1_area = tf.reduce_prod(tf.split(box1, 2, axis=-1)[-1], axis=-1)
+    box2_area = tf.reduce_prod(tf.split(box2, 2, axis=-1)[-1], axis=-1)
     box1 = xcycwh_to_yxyx(box1)
     box2 = xcycwh_to_yxyx(box2)
@@ -110,13 +86,14 @@ def intersect_and_union(box1, box2, yxyx=False):
   intersect_wh = tf.math.maximum(intersect_maxes - intersect_mins, 0.0)
   intersection = tf.reduce_prod(intersect_wh, axis=-1)
 
-  box1_area = tf.reduce_prod(b1ma - b1mi, axis=-1)
-  box2_area = tf.reduce_prod(b2ma - b2mi, axis=-1)
+  if yxyx:
+    box1_area = tf.reduce_prod(b1ma - b1mi, axis=-1)
+    box2_area = tf.reduce_prod(b2ma - b2mi, axis=-1)
   union = box1_area + box2_area - intersection
 
   return intersection, union
 
-def smallest_encompassing_box(box1, box2, yxyx=False):
+def smallest_encompassing_box(box1, box2, yxyx=False, clip=False):
   """Calculates the smallest box that encompasses box1 and box2.
 
   Args:
@@ -126,6 +103,7 @@ def smallest_encompassing_box(box1, box2, yxyx=False):
       boxes.
     yxyx: a `bool` indicating whether the input box is of the format x_center
      y_center, width, height or y_min, x_min, y_max, x_max.
+    clip: a `bool`, whether or not to clip boxes.
 
   Returns:
     box_c: a `Tensor` whose last dimension is 4 representing the coordinates of
@@ -141,15 +119,15 @@ def smallest_encompassing_box(box1, box2, yxyx=False, clip=False):
   bcmi = tf.math.minimum(b1mi, b2mi)
   bcma = tf.math.maximum(b1ma, b2ma)
 
-  bca = tf.reduce_prod(bcma - bcmi, keepdims=True, axis=-1)
-
   box_c = tf.concat([bcmi, bcma], axis=-1)
   if not yxyx:
     box_c = yxyx_to_xcycwh(box_c)
 
-  box_c = tf.where(bca == 0.0, tf.zeros_like(box_c), box_c)
-  return box_c
+  if clip:
+    bca = tf.reduce_prod(bcma - bcmi, keepdims=True, axis=-1)
+    box_c = tf.where(bca <= 0.0, tf.zeros_like(box_c), box_c)
+
+  return bcmi, bcma, box_c
 
 def compute_iou(box1, box2, yxyx=False):
@@ -166,15 +144,13 @@ def compute_iou(box1, box2, yxyx=False):
   Returns:
     iou: a `Tensor` which represents the intersection over union.
   """
-  # get box corners
   with tf.name_scope('iou'):
     intersection, union = intersect_and_union(box1, box2, yxyx=yxyx)
     iou = math_ops.divide_no_nan(intersection, union)
-    iou = math_ops.rm_nan_inf(iou, val=0.0)
   return iou
 
-def compute_giou(box1, box2, yxyx=False, darknet=False):
+def compute_giou(box1, box2, yxyx=False):
   """Calculates the General intersection over union between box1 and box2.
 
   Args:
@@ -184,38 +160,30 @@ def compute_giou(box1, box2, yxyx=False):
      boxes.
    yxyx: a `bool` indicating whether the input box is of the format x_center
      y_center, width, height or y_min, x_min, y_max, x_max.
-    darknet: a `bool` indicating whether the calling function is the YOLO
-      darknet loss.
 
   Returns:
     giou: a `Tensor` which represents the General intersection over union.
   """
   with tf.name_scope('giou'):
-    # get IOU
     if not yxyx:
-      box1 = xcycwh_to_yxyx(box1, darknet=darknet)
-      box2 = xcycwh_to_yxyx(box2, darknet=darknet)
-      yxyx = True
+      yxyx1 = xcycwh_to_yxyx(box1)
+      yxyx2 = xcycwh_to_yxyx(box2)
+    else:
+      yxyx1, yxyx2 = box1, box2
 
-    intersection, union = intersect_and_union(box1, box2, yxyx=yxyx)
+    cmi, cma, _ = smallest_encompassing_box(yxyx1, yxyx2, yxyx=True)
+    intersection, union = intersect_and_union(yxyx1, yxyx2, yxyx=True)
     iou = math_ops.divide_no_nan(intersection, union)
-    iou = math_ops.rm_nan_inf(iou, val=0.0)
 
-    # find the smallest box to encompass both box1 and box2
-    boxc = smallest_encompassing_box(box1, box2, yxyx=yxyx)
-    if yxyx:
-      boxc = yxyx_to_xcycwh(boxc)
-    _, cwch = tf.split(boxc, 2, axis=-1)
-    c = tf.math.reduce_prod(cwch, axis=-1)
+    bcwh = cma - cmi
+    c = tf.math.reduce_prod(bcwh, axis=-1)
 
-    # compute giou
     regularization = math_ops.divide_no_nan((c - union), c)
     giou = iou - regularization
-    giou = tf.clip_by_value(giou, clip_value_min=-1.0, clip_value_max=1.0)
   return iou, giou
 
-def compute_diou(box1, box2, beta=1.0, yxyx=False, darknet=False):
+def compute_diou(box1, box2, beta=1.0, yxyx=False):
   """Calculates the distance intersection over union between box1 and box2.
 
   Args:
@@ -227,8 +195,6 @@ def compute_diou(box1, box2, beta=1.0, yxyx=False):
       regularization term.
     yxyx: a `bool` indicating whether the input box is of the format x_center
      y_center, width, height or y_min, x_min, y_max, x_max.
-    darknet: a `bool` indicating whether the calling function is the YOLO
-      darknet loss.
 
   Returns:
     diou: a `Tensor` which represents the distance intersection over union.
@@ -236,30 +202,27 @@ def compute_diou(box1, box2, beta=1.0, yxyx=False):
   with tf.name_scope('diou'):
     # compute center distance
     if not yxyx:
-      box1 = xcycwh_to_yxyx(box1, darknet=darknet)
-      box2 = xcycwh_to_yxyx(box2, darknet=darknet)
-      yxyx = True
+      xycc1, xycc2 = box1, box2
+      yxyx1 = xcycwh_to_yxyx(box1)
+      yxyx2 = xcycwh_to_yxyx(box2)
+    else:
+      yxyx1, yxyx2 = box1, box2
+      xycc1 = yxyx_to_xcycwh(box1)
+      xycc2 = yxyx_to_xcycwh(box2)
 
-    intersection, union = intersect_and_union(box1, box2, yxyx=yxyx)
-    boxc = smallest_encompassing_box(box1, box2, yxyx=yxyx)
+    cmi, cma, _ = smallest_encompassing_box(yxyx1, yxyx2, yxyx=True)
+    intersection, union = intersect_and_union(yxyx1, yxyx2, yxyx=True)
     iou = math_ops.divide_no_nan(intersection, union)
-    iou = math_ops.rm_nan_inf(iou, val=0.0)
 
-    if yxyx:
-      boxc = yxyx_to_xcycwh(boxc)
-      box1 = yxyx_to_xcycwh(box1)
-      box2 = yxyx_to_xcycwh(box2)
-
-    b1xy, _ = tf.split(box1, 2, axis=-1)
-    b2xy, _ = tf.split(box2, 2, axis=-1)
-    _, bcwh = tf.split(boxc, 2, axis=-1)
+    b1xy, _ = tf.split(xycc1, 2, axis=-1)
+    b2xy, _ = tf.split(xycc2, 2, axis=-1)
+    bcwh = cma - cmi
 
     center_dist = tf.reduce_sum((b1xy - b2xy)**2, axis=-1)
     c_diag = tf.reduce_sum(bcwh**2, axis=-1)
 
     regularization = math_ops.divide_no_nan(center_dist, c_diag)
     diou = iou - regularization**beta
-    diou = tf.clip_by_value(diou, clip_value_min=-1.0, clip_value_max=1.0)
   return iou, diou
 
@@ -280,33 +243,48 @@ def compute_ciou(box1, box2, yxyx=False, darknet=False):
     ciou: a `Tensor` which represents the complete intersection over union.
   """
   with tf.name_scope('ciou'):
-    # compute DIOU and IOU
-    iou, diou = compute_diou(box1, box2, yxyx=yxyx, darknet=darknet)
+    if not yxyx:
+      xycc1, xycc2 = box1, box2
+      yxyx1 = xcycwh_to_yxyx(box1)
+      yxyx2 = xcycwh_to_yxyx(box2)
+    else:
+      yxyx1, yxyx2 = box1, box2
+      xycc1 = yxyx_to_xcycwh(box1)
+      xycc2 = yxyx_to_xcycwh(box2)
 
-    if yxyx:
-      box1 = yxyx_to_xcycwh(box1)
-      box2 = yxyx_to_xcycwh(box2)
+    # Build the smallest encompassing box.
+    cmi, cma, _ = smallest_encompassing_box(yxyx1, yxyx2, yxyx=True)
+    intersection, union = intersect_and_union(yxyx1, yxyx2, yxyx=True)
+    iou = math_ops.divide_no_nan(intersection, union)
 
-    _, _, b1w, b1h = tf.split(box1, 4, axis=-1)
-    _, _, b2w, b2h = tf.split(box1, 4, axis=-1)
+    b1xy, b1w, b1h = tf.split(xycc1, [2, 1, 1], axis=-1)
+    b2xy, b2w, b2h = tf.split(xycc2, [2, 1, 1], axis=-1)
+    bchw = cma - cmi
 
-    # compute aspect ratio consistency
-    terma = tf.cast(math_ops.divide_no_nan(b1w, b1h), tf.float32)
-    termb = tf.cast(math_ops.divide_no_nan(b2w, b2h), tf.float32)
-    arcterm = tf.square(tf.math.atan(terma) - tf.math.atan(termb))
-    v = tf.squeeze(4 * arcterm / (math.pi**2), axis=-1)
-    v = tf.cast(v, b1w.dtype)
+    # Center regularization
+    center_dist = tf.reduce_sum((b1xy - b2xy)**2, axis=-1)
+    c_diag = tf.reduce_sum(bchw**2, axis=-1)
+    regularization = math_ops.divide_no_nan(center_dist, c_diag)
 
-    a = tf.stop_gradient(math_ops.divide_no_nan(v, ((1 - iou) + v)))
-    ciou = diou - (v * a)
-    ciou = tf.clip_by_value(ciou, clip_value_min=-1.0, clip_value_max=1.0)
+    # Compute aspect ratio consistency
+    terma = math_ops.divide_no_nan(b1w, b1h)  # gt
+    termb = math_ops.divide_no_nan(b2w, b2h)  # pred
+    arcterm = tf.squeeze(
+        tf.math.pow(tf.math.atan(termb) - tf.math.atan(terma), 2), axis=-1)
+    v = (4 / math.pi**2) * arcterm
+
+    # Compute the aspect ratio weight, should be treated as a constant
+    a = tf.stop_gradient(math_ops.divide_no_nan(v, 1 - iou + v))
+
+    if darknet:
+      grad_scale = tf.stop_gradient(tf.square(b2w) + tf.square(b2h))
+      v *= tf.squeeze(grad_scale, axis=-1)
+
+    ciou = iou - regularization - (v * a)
   return iou, ciou
 
-def aggregated_comparitive_iou(boxes1,
-                               boxes2=None,
-                               iou_type=0,
-                               beta=0.6):
+def aggregated_comparitive_iou(boxes1, boxes2=None, iou_type=0, beta=0.6):
   """Calculates the IOU between two sets of boxes.
 
   Similar to bbox_overlap but far more versatile.
@@ -333,11 +311,11 @@ def aggregated_comparitive_iou(boxes1, boxes2=None, iou_type=0, beta=0.6):
   else:
     boxes2 = tf.transpose(boxes1, perm=(0, 2, 1, 3))
 
-  if iou_type == 0:  # diou
+  if iou_type == 0 or iou_type == 'diou':  # diou
     _, iou = compute_diou(boxes1, boxes2, beta=beta, yxyx=True)
-  elif iou_type == 1:  # giou
+  elif iou_type == 1 or iou_type == 'giou':  # giou
     _, iou = compute_giou(boxes1, boxes2, yxyx=True)
-  elif iou_type == 2:  # ciou
+  elif iou_type == 2 or iou_type == 'ciou':  # ciou
     _, iou = compute_ciou(boxes1, boxes2, yxyx=True)
   else:
     iou = compute_iou(boxes1, boxes2, yxyx=True)
......
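The reworked `compute_ciou` above matches the standard complete-IoU of Zheng et al. (arXiv:1911.08287): CIoU = IoU - rho^2(b1, b2) / c^2 - alpha * v, where rho is the distance between the box centers, c is the diagonal of the smallest enclosing box, v = (4 / pi^2) * (atan(w1/h1) - atan(w2/h2))^2 is the aspect-ratio consistency term, and alpha = v / (1 - IoU + v) is treated as a constant via stop_gradient. A small usage sketch on toy boxes in (x_center, y_center, width, height) format (the values are illustrative only):

import tensorflow as tf
from official.vision.beta.projects.yolo.ops import box_ops

box1 = tf.constant([[0.5, 0.5, 0.4, 0.4]])  # ground truth
box2 = tf.constant([[0.6, 0.6, 0.4, 0.2]])  # prediction
iou, ciou = box_ops.compute_ciou(box1, box2, yxyx=False)
# ciou <= iou: the center-distance and aspect-ratio penalties only subtract.
print(float(iou[0]), float(ciou[0]))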
This diff is collapsed.
@@ -58,25 +58,4 @@ def divide_no_nan(a, b):
   Returns:
     a `Tensor` representing a divided by b, with all nan values removed.
   """
-  zero = tf.cast(0.0, b.dtype)
-  return tf.where(b == zero, zero, a / b)
-
-
-def mul_no_nan(x, y):
-  """Nan safe multiply operation.
-
-  Built to allow model compilation in tflite and
-  to allow one tensor to mask another. Wherever x is zero the
-  multiplication is not computed and the value is replaced with a zero. This is
-  required because 0 * nan = nan. This can make computation unstable in some
-  cases where the intended behavior is for zero to mean ignore.
-
-  Args:
-    x: any `Tensor` of any type.
-    y: any `Tensor` of any type with the same shape as tensor x.
-
-  Returns:
-    a `Tensor` representing x times y, where x is used to safely mask the
-    tensor y.
-  """
-  return tf.where(x == 0, tf.cast(0, x.dtype), x * y)
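The simplified `divide_no_nan` above trades the exact `tf.where` masking for a small epsilon in the denominator, which (judging by the removed docstring's mention of TFLite) compiles more easily and also sidesteps the classic 0 * inf = NaN pitfall a single `tf.where` can hit in the backward pass. A sketch contrasting the two variants; the function names here are illustrative:

import tensorflow as tf

def divide_no_nan_where(a, b):
  # Old behaviour: an exact zero wherever b == 0, via a select. In the
  # backward pass, a/b is still evaluated at b == 0, so the gradient can
  # become 0 * inf = NaN.
  zero = tf.cast(0.0, b.dtype)
  return tf.where(b == zero, zero, a / b)

def divide_no_nan_eps(a, b):
  # New behaviour: a small epsilon keeps the op branch-free and its
  # gradients finite everywhere, at the cost of a tiny bias and a large
  # (but finite) output where b == 0.
  return a / (b + 1e-9)

a = tf.constant([1.0, 2.0, 3.0])
b = tf.constant([0.0, 4.0, 0.0])
print(divide_no_nan_where(a, b).numpy())  # [0.  0.5 0. ]
print(divide_no_nan_eps(a, b).numpy())    # [1e9  0.5  3e9]: large, but finite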