Unverified Commit ca552843 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-segmentation

parents 7e2f7a35 6b90e134
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tasks package definition."""
from official.vision.beta.projects.video_ssl.tasks import linear_eval
from official.vision.beta.projects.video_ssl.tasks import pretrain
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Video ssl linear evaluation task definition."""
from typing import Any, Optional, List, Tuple
from absl import logging
import tensorflow as tf
# pylint: disable=unused-import
from official.core import task_factory
from official.vision.beta.projects.video_ssl.configs import video_ssl as exp_cfg
from official.vision.beta.projects.video_ssl.modeling import video_ssl_model
from official.vision.beta.tasks import video_classification
@task_factory.register_task_cls(exp_cfg.VideoSSLEvalTask)
class VideoSSLEvalTask(video_classification.VideoClassificationTask):
"""A task for video ssl linear evaluation."""
def initialize(self, model: tf.keras.Model):
"""Loading pretrained checkpoint."""
if not self.task_config.init_checkpoint:
return
ckpt_dir_or_file = self.task_config.init_checkpoint
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
# Restoring checkpoint.
if self.task_config.init_checkpoint_modules == 'backbone':
ckpt = tf.train.Checkpoint(backbone=model.backbone)
ckpt.read(ckpt_dir_or_file)
else:
raise NotImplementedError
logging.info('Finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
def train_step(self,
inputs: Tuple[Any, Any],
model: tf.keras.Model,
optimizer: tf.keras.optimizers.Optimizer,
metrics: Optional[List[Any]] = None):
"""Does forward and backward.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
optimizer: the optimizer for this training step.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
model.backbone.trainable = False
logging.info('Setting the backbone to non-trainable.')
return super(video_classification.VideoClassificationTask,
self).train_step(inputs, model, optimizer, metrics)
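# Hypothetical usage sketch (the config construction is illustrative, not
# from this file): with `init_checkpoint` set to a pretraining run, the task
# restores backbone-only weights and then optimizes just the linear head,
# since `train_step` above freezes the backbone before delegating upwards.
#
#   task = VideoSSLEvalTask(params=eval_task_config)
#   model = task.build_model()
#   task.initialize(model)  # Restores `model.backbone` from the checkpoint.
#   logs = task.train_step(next(iterator), model, optimizer, metrics)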
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Video ssl pretrain task definition."""
from absl import logging
import tensorflow as tf
# pylint: disable=unused-import
from official.core import input_reader
from official.core import task_factory
from official.vision.beta.modeling import factory_3d
from official.vision.beta.projects.video_ssl.configs import video_ssl as exp_cfg
from official.vision.beta.projects.video_ssl.dataloaders import video_ssl_input
from official.vision.beta.projects.video_ssl.losses import losses
from official.vision.beta.projects.video_ssl.modeling import video_ssl_model
from official.vision.beta.tasks import video_classification
@task_factory.register_task_cls(exp_cfg.VideoSSLPretrainTask)
class VideoSSLPretrainTask(video_classification.VideoClassificationTask):
"""A task for video ssl pretraining."""
def build_model(self):
"""Builds video ssl pretraining model."""
common_input_shape = [
d1 if d1 == d2 else None
for d1, d2 in zip(self.task_config.train_data.feature_shape,
self.task_config.validation_data.feature_shape)
]
input_specs = tf.keras.layers.InputSpec(shape=[None] + common_input_shape)
logging.info('Build model input %r', common_input_shape)
model = factory_3d.build_model(
self.task_config.model.model_type,
input_specs=input_specs,
model_config=self.task_config.model,
num_classes=self.task_config.train_data.num_classes)
return model
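# Worked example (shapes are illustrative): with a train feature_shape of
# [16, 224, 224, 3] and a validation feature_shape of [32, 224, 224, 3],
# only the mismatched temporal dimension is relaxed, so common_input_shape
# becomes [None, 224, 224, 3] and the input spec is [None, None, 224, 224, 3]
# once the batch dimension is prepended.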
def _get_decoder_fn(self, params):
decoder = video_ssl_input.Decoder()
return decoder.decode
def build_inputs(self, params: exp_cfg.DataConfig, input_context=None):
"""Builds classification input."""
parser = video_ssl_input.Parser(input_params=params)
postprocess_fn = video_ssl_input.PostBatchProcessor(params)
reader = input_reader.InputReader(
params,
dataset_fn=self._get_dataset_fn(params),
decoder_fn=self._get_decoder_fn(params),
parser_fn=parser.parse_fn(params.is_training),
postprocess_fn=postprocess_fn)
dataset = reader.read(input_context=input_context)
return dataset
def build_losses(self, model_outputs, num_replicas, model):
"""Sparse categorical cross entropy loss.
Args:
model_outputs: Output logits of the model.
num_replicas: distributed replica number.
model: keras model for calculating weight decay.
Returns:
The total loss tensor.
"""
all_losses = {}
contrastive_metrics = {}
losses_config = self.task_config.losses
total_loss = None
contrastive_loss_dict = losses.contrastive_loss(
model_outputs, num_replicas, losses_config.normalize_hidden,
losses_config.temperature, model,
self.task_config.losses.l2_weight_decay)
total_loss = contrastive_loss_dict['total_loss']
all_losses.update({
'total_loss': total_loss
})
all_losses[self.loss] = total_loss
contrastive_metrics.update({
'contrast_acc': contrastive_loss_dict['contrast_acc'],
'contrast_entropy': contrastive_loss_dict['contrast_entropy'],
'reg_loss': contrastive_loss_dict['reg_loss']
})
return all_losses, contrastive_metrics
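# For reference, the structure returned above looks like (keys are the ones
# assigned in this method; `self.loss` is the loss key used by the base task):
#   all_losses = {'total_loss': <scalar>, self.loss: <scalar>}
#   contrastive_metrics = {'contrast_acc': <scalar>,
#                          'contrast_entropy': <scalar>,
#                          'reg_loss': <scalar>}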
def build_metrics(self, training=True):
"""Gets streaming metrics for training/validation."""
metrics = [
tf.keras.metrics.Mean(name='contrast_acc'),
tf.keras.metrics.Mean(name='contrast_entropy'),
tf.keras.metrics.Mean(name='reg_loss')
]
return metrics
def process_metrics(self, metrics, contrastive_metrics):
"""Process and update metrics."""
contrastive_metric_values = contrastive_metrics.values()
for metric, contrastive_metric_value in zip(metrics,
contrastive_metric_values):
metric.update_state(contrastive_metric_value)
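# Note: the zip above pairs the Mean metrics from `build_metrics` with the
# values from `build_losses` purely by position, which works because Python
# dicts preserve insertion order and both methods list contrast_acc,
# contrast_entropy, and reg_loss in the same order.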
def train_step(self, inputs, model, optimizer, metrics=None):
"""Does forward and backward.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
optimizer: the optimizer for this training step.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
features, _ = inputs
num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
with tf.GradientTape() as tape:
if self.task_config.train_data.output_audio:
outputs = model(features, training=True)
else:
outputs = model(features['image'], training=True)
# Casting the output layer to float32 is necessary when mixed_precision is
# mixed_float16 or mixed_bfloat16 to ensure the output is cast to float32.
outputs = tf.nest.map_structure(
lambda x: tf.cast(x, tf.float32), outputs)
all_losses, contrastive_metrics = self.build_losses(
model_outputs=outputs, num_replicas=num_replicas,
model=model)
loss = all_losses[self.loss]
scaled_loss = loss
# For mixed_precision policy, when LossScaleOptimizer is used, loss is
# scaled for numerical stability.
if isinstance(
optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
scaled_loss = optimizer.get_scaled_loss(scaled_loss)
tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
# Scales back gradient before apply_gradients when LossScaleOptimizer is
# used.
if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
grads = optimizer.get_unscaled_gradients(grads)
optimizer.apply_gradients(list(zip(grads, tvars)))
logs = all_losses
if metrics:
self.process_metrics(metrics, contrastive_metrics)
logs.update({m.name: m.result() for m in metrics})
return logs
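# Illustrative note: the LossScaleOptimizer branches above take effect when
# the trainer wraps the base optimizer under a 'mixed_float16' policy, e.g.
#   base = tf.keras.optimizers.SGD(learning_rate=0.1)
#   optimizer = tf.keras.mixed_precision.LossScaleOptimizer(base)
# The loss is multiplied by the scale before differentiation and the
# gradients are divided by it afterwards, avoiding float16 underflow.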
def validation_step(self, inputs, model, metrics=None):
"""Validatation step.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
raise NotImplementedError
def inference_step(self, features, model):
"""Performs the forward step."""
raise NotImplementedError
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
import functools
import os
import random
import orbit
import tensorflow as tf
# pylint: disable=unused-import
from official.core import exp_factory
from official.core import task_factory
from official.modeling import optimization
from official.vision import beta
from official.vision.beta.dataloaders import tfexample_utils
from official.vision.beta.projects.video_ssl.tasks import pretrain
class VideoClassificationTaskTest(tf.test.TestCase):
def setUp(self):
super(VideoClassificationTaskTest, self).setUp()
data_dir = os.path.join(self.get_temp_dir(), 'data')
tf.io.gfile.makedirs(data_dir)
self._data_path = os.path.join(data_dir, 'data.tfrecord')
# pylint: disable=g-complex-comprehension
examples = [
tfexample_utils.make_video_test_example(
image_shape=(36, 36, 3),
audio_shape=(20, 128),
label=random.randint(0, 100)) for _ in range(2)
]
# pylint: enable=g-complex-comprehension
tfexample_utils.dump_to_tfrecord(self._data_path, tf_examples=examples)
def test_task(self):
config = exp_factory.get_exp_config('video_ssl_pretrain_kinetics600')
config.task.train_data.global_batch_size = 2
config.task.train_data.input_path = self._data_path
task = pretrain.VideoSSLPretrainTask(
config.task)
model = task.build_model()
metrics = task.build_metrics()
strategy = tf.distribute.get_strategy()
dataset = orbit.utils.make_distributed_dataset(
strategy,
functools.partial(task.build_inputs),
config.task.train_data)
iterator = iter(dataset)
opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
logs = task.train_step(next(iterator), model, optimizer, metrics=metrics)
self.assertIn('total_loss', logs)
self.assertIn('reg_loss', logs)
self.assertIn('contrast_acc', logs)
self.assertIn('contrast_entropy', logs)
def test_task_factory(self):
config = exp_factory.get_exp_config('video_ssl_pretrain_kinetics600')
task = task_factory.get_task(config.task)
self.assertIs(type(task), pretrain.VideoSSLPretrainTask)
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Training driver."""
from absl import app
from absl import flags
import gin
# pylint: disable=unused-import
from official.common import registry_imports
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.vision.beta.projects.video_ssl.modeling import video_ssl_model
from official.vision.beta.projects.video_ssl.tasks import linear_eval
from official.vision.beta.projects.video_ssl.tasks import pretrain
# pylint: disable=unused-import
FLAGS = flags.FLAGS
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
params = train_utils.parse_configuration(FLAGS)
model_dir = FLAGS.model_dir
if 'train' in FLAGS.mode:
# Pure eval modes do not output yaml files. Otherwise continuous eval job
# may race against the train job for writing the same file.
train_utils.serialize_config(params, model_dir)
if 'train_and_eval' in FLAGS.mode:
assert (params.task.train_data.feature_shape ==
params.task.validation_data.feature_shape), (
f'train {params.task.train_data.feature_shape} != validate '
f'{params.task.validation_data.feature_shape}')
# Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
# can speed up models significantly by utilizing float16 on GPUs and
# bfloat16 on TPUs. loss_scale takes effect only when dtype is float16.
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
with distribution_strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_utils.save_gin_config(FLAGS.mode, model_dir)
if __name__ == '__main__':
tfm_flags.define_flags()
app.run(main)
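# Example invocation (the experiment name matches the registered video_ssl
# configs; paths are illustrative):
#
#   python3 train.py \
#     --experiment=video_ssl_pretrain_kinetics600 \
#     --mode=train \
#     --model_dir=/tmp/video_ssl \
#     --config_file=/path/to/overrides.yaml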
......@@ -79,8 +79,8 @@ class SemanticSegmentation3DTask(base_task.Task):
# Restoring checkpoint.
if 'all' in self.task_config.init_checkpoint_modules:
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.read(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
else:
ckpt_items = {}
if 'backbone' in self.task_config.init_checkpoint_modules:
......@@ -89,7 +89,7 @@ class SemanticSegmentation3DTask(base_task.Task):
ckpt_items.update(decoder=model.decoder)
ckpt = tf.train.Checkpoint(**ckpt_items)
status = ckpt.read(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('Finished loading pretrained checkpoint from %s',
......
......@@ -17,30 +17,31 @@ repository.
## Description
YOLO v1, the original implementation, was released in 2015, providing a
groundbreaking algorithm that would quickly process images and locate objects
in a single pass through the detector. The original implementation used a
backbone derived from state-of-the-art object classifiers of the time, like
[GoogLeNet](https://arxiv.org/abs/1409.4842) and
[VGG](https://arxiv.org/abs/1409.1556). More attention was given to the novel
YOLO Detection head that allowed for Object Detection with a single pass of an
image. Though limited, the network could predict up to 90 bounding boxes per
image, and was tested for about 80 classes per box. Also, the model could only
make predictions at one scale. These attributes caused YOLO v1 to be more
limited and less versatile, so as the years passed, the developers continued to
update and develop this model.
YOLO v3 and v4 serve as the most up-to-date and capable versions of the YOLO
network group. These models use a custom backbone called Darknet53 that uses
knowledge gained from the ResNet paper to improve its predictions. The new
backbone also allows for objects to be detected at multiple scales. As for the
new detection head, the model now predicts the bounding boxes using a set of
anchor box priors (Anchor Boxes) as suggestions. Multiscale predictions in
combination with Anchor boxes allow the network to make up to 1000 object
predictions on a single image. Finally, the new loss function forces the
network to make better predictions by using Intersection Over Union (IOU) to
inform the model's confidence rather than relying on the mean squared error
for the entire output.
## Authors
......@@ -59,9 +60,9 @@ the entire output.
## Our Goal
Our goal with this model conversion is to provide implementations of the
Backbone and YOLO Head. We have built the model in such a way that the YOLO
head could be connected to a new, more powerful backbone if a person chose to.
## Models in the library
......@@ -79,3 +80,5 @@ head could be connected to a new, more powerful backbone if a person chose to.
[![Python 3.8](https://img.shields.io/badge/Python-3.8-3776AB)](https://www.python.org/downloads/release/python-380/)
DISCLAIMER: this YOLO implementation is still under development. No support
will be provided during the development phase.
......@@ -30,6 +30,8 @@ class Darknet(hyperparams.Config):
width_scale: float = 1.0
depth_scale: float = 1.0
dilate: bool = False
min_level: int = 3
max_level: int = 5
@dataclasses.dataclass
......
......@@ -15,9 +15,8 @@
# Lint as: python3
"""Image classification with darknet configs."""
import dataclasses
from typing import List, Optional
from official.core import config_definitions as cfg
from official.core import exp_factory
......@@ -35,7 +34,7 @@ class ImageClassificationModel(hyperparams.Config):
type='darknet', darknet=backbones.Darknet())
dropout_rate: float = 0.0
norm_activation: common.NormActivation = common.NormActivation()
# Adds a Batch Normalization layer pre-GlobalAveragePooling in classification.
add_head_batch_norm: bool = False
......
......@@ -67,7 +67,7 @@ class Parser(parser.Parser):
max_level: `int` for the maximum level of the output feature pyramid.
masks: a `Tensor`, `List` or `numpy.ndarray` for anchor masks.
max_process_size: an `int` for maximum image width and height.
min_process_size: an `int` for minimum image width and height.
max_num_instances: an `int` for the maximum number of instances in an
image.
random_flip: a `bool` if True, augment training with random horizontal
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Yolo Loss function."""
import abc
import collections
import functools
import tensorflow as tf
from official.vision.beta.projects.yolo.ops import box_ops
from official.vision.beta.projects.yolo.ops import loss_utils
from official.vision.beta.projects.yolo.ops import math_ops
class YoloLossBase(object, metaclass=abc.ABCMeta):
"""Parameters for the YOLO loss functions used at each detection generator.
This base class provides the base functionality required to implement a
YOLO loss function.
"""
def __init__(self,
classes,
mask,
anchors,
path_stride=1,
ignore_thresh=0.7,
truth_thresh=1.0,
loss_type='ciou',
iou_normalizer=1.0,
cls_normalizer=1.0,
obj_normalizer=1.0,
label_smoothing=0.0,
objectness_smooth=True,
update_on_repeat=False,
box_type='original',
scale_x_y=1.0,
max_delta=10):
"""Loss Function Initialization.
Args:
classes: `int` for the number of classes.
mask: `List[int]` for the anchor indices used at this specific output
level.
anchors: `List[List[int]]` for the anchor boxes that are used in the model
at all levels. For anchor free prediction set the anchor list to be the
same as the image resolution.
path_stride: `int` for how much to scale this level to get the original
input shape.
ignore_thresh: `float` for the IOU value over which the loss is not
propagated, and a detection is assumed to have been made.
truth_thresh: `float` for the IOU value over which the loss is propagated
despite a detection being made.
loss_type: `str` for the type of IOU loss to use, one of {ciou, diou,
giou, iou}.
iou_normalizer: `float` for how much to scale the loss on the IOU or the
boxes.
cls_normalizer: `float` for how much to scale the loss on the classes.
obj_normalizer: `float` for how much to scale loss on the detection map.
label_smoothing: `float` for how much to smooth the loss on the classes.
objectness_smooth: `float` for how much to smooth the loss on the
detection map.
update_on_repeat: `bool` for whether to replace with the newest or the
best value when an index is consumed by multiple objects.
box_type: `str` for which scaling type to use.
scale_x_y: `float` value indicating how far each pixel can see outside of
its containment of 1.0. A value of 1.2 indicates there is a 20% extended
radius around each pixel within which this specific pixel can predict a
box center. The center can range from 0 - value/2 to 1 + value/2. This
value is set in the yolo filter and reused here. There should be one
value of scale_xy for each level from min_level to max_level.
max_delta: gradient clipping to apply to the box loss.
"""
self._loss_type = loss_type
self._classes = tf.constant(tf.cast(classes, dtype=tf.int32))
self._num = tf.cast(len(mask), dtype=tf.int32)
self._truth_thresh = truth_thresh
self._ignore_thresh = ignore_thresh
self._masks = mask
self._anchors = anchors
self._iou_normalizer = iou_normalizer
self._cls_normalizer = cls_normalizer
self._obj_normalizer = obj_normalizer
self._scale_x_y = scale_x_y
self._max_delta = max_delta
self._label_smoothing = tf.cast(label_smoothing, tf.float32)
self._objectness_smooth = float(objectness_smooth)
self._update_on_repeat = update_on_repeat
self._box_type = box_type
self._path_stride = path_stride
box_kwargs = dict(
stride=self._path_stride,
scale_xy=self._scale_x_y,
box_type=self._box_type,
max_delta=self._max_delta)
self._decode_boxes = functools.partial(
loss_utils.get_predicted_box, **box_kwargs)
self._search_pairs = lambda pred_boxes, pred_classes, boxes, classes, scale, yxyx: (None, None, None, None) # pylint:disable=line-too-long
self._build_per_path_attributes()
def box_loss(self, true_box, pred_box, darknet=False):
"""Call iou function and use it to compute the loss for the box maps."""
if self._loss_type == 'giou':
iou, liou = box_ops.compute_giou(true_box, pred_box)
elif self._loss_type == 'ciou':
iou, liou = box_ops.compute_ciou(true_box, pred_box, darknet=darknet)
else:
liou = iou = box_ops.compute_iou(true_box, pred_box)
loss_box = 1 - liou
return iou, liou, loss_box
def _tiled_global_box_search(self,
pred_boxes,
pred_classes,
boxes,
classes,
true_conf,
smoothed,
scale=None):
"""Search of all groundtruths to associate groundtruths to predictions."""
# Search all predictions against ground truths to find matching boxes for
# each pixel.
_, _, iou_max, _ = self._search_pairs(
pred_boxes, pred_classes, boxes, classes, scale=scale, yxyx=True)
if iou_max is None:
return true_conf, tf.ones_like(true_conf)
# Find the exact indexes to ignore and keep.
ignore_mask = tf.cast(iou_max < self._ignore_thresh, pred_boxes.dtype)
iou_mask = iou_max > self._ignore_thresh
if not smoothed:
# Ignore all pixels where a box was not supposed to be predicted but a
# high confidence box was predicted.
obj_mask = true_conf + (1 - true_conf) * ignore_mask
else:
# Replace pixels in the true confidence map with the max IOU predicted
# within that cell.
obj_mask = tf.ones_like(true_conf)
iou_ = (1 - self._objectness_smooth) + self._objectness_smooth * iou_max
iou_ = tf.where(iou_max > 0, iou_, tf.zeros_like(iou_))
true_conf = tf.where(iou_mask, iou_, true_conf)
# Stop gradient so while loop is not tracked.
obj_mask = tf.stop_gradient(obj_mask)
true_conf = tf.stop_gradient(true_conf)
return true_conf, obj_mask
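# Worked example of the smoothing branch above (numbers are illustrative):
# with objectness_smooth = 0.9 and a matched iou_max of 0.8, the smoothed
# target is (1 - 0.9) + 0.9 * 0.8 = 0.82, so well-matched cells pull the
# confidence target toward the measured IOU rather than a hard 1.0.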
def __call__(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""Call function to compute the loss and a set of metrics per FPN level.
Args:
true_counts: `Tensor` of shape [batchsize, height, width, num_anchors]
representing how many boxes are in a given pixel [j, i] in the output
map.
inds: `Tensor` of shape [batchsize, None, 3] indicating the location [j,
i] that a given box is associated with in the FPN prediction map.
y_true: `Tensor` of shape [batchsize, None, 8] indicating the actual box
associated with each index in the inds tensor list.
boxes: `Tensor` of shape [batchsize, None, 4] indicating the original
ground truth boxes for each image as they came from the decoder used for
bounding box search.
classes: `Tensor` of shape [batchsize, None, 1] indicating the original
ground truth classes for each image as they came from the decoder used
for bounding box search.
y_pred: `Tensor` of shape [batchsize, height, width, output_depth] holding
the model's output at a specific FPN level.
Returns:
loss: `float` for the actual loss.
box_loss: `float` loss on the boxes used for metrics.
conf_loss: `float` loss on the confidence used for metrics.
class_loss: `float` loss on the classes used for metrics.
avg_iou: `float` metric for the average iou between predictions and ground
truth.
avg_obj: `float` metric for the average confidence of the model for
predictions.
"""
(loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf, ind_mask,
grid_mask) = self._compute_loss(true_counts, inds, y_true, boxes, classes,
y_pred)
# Temporary metrics
box_loss = tf.stop_gradient(0.05 * box_loss / self._iou_normalizer)
# Metric computation is done here to save time and resources.
sigmoid_conf = tf.stop_gradient(tf.sigmoid(pred_conf))
iou = tf.stop_gradient(iou)
avg_iou = loss_utils.average_iou(
loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), iou))
avg_obj = loss_utils.average_iou(
tf.squeeze(sigmoid_conf, axis=-1) * grid_mask)
return (loss, box_loss, conf_loss, class_loss, mean_loss,
tf.stop_gradient(avg_iou), tf.stop_gradient(avg_obj))
@abc.abstractmethod
def _build_per_path_attributes(self):
"""Additional initialization required for each YOLO loss version."""
...
@abc.abstractmethod
def _compute_loss(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""The actual logic to apply to the raw model for optimization."""
...
def post_path_aggregation(self, loss, ground_truths, predictions): # pylint:disable=unused-argument
"""This method allows for post processing of a loss value.
After the loss has been aggregated across all the FPN levels, some post
processing may need to occur to properly scale the loss. The default
behavior is to pass the loss through with no alterations.
Args:
loss: `tf.float` scalar for the actual loss.
ground_truths: `Dict` holding all the ground truth tensors.
predictions: `Dict` holding all the predicted values.
Returns:
loss: `tf.float` scalar for the scaled loss.
"""
return loss
@abc.abstractmethod
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
"""This controls how the loss should be aggregated across replicas."""
...
@tf.custom_gradient
def grad_sigmoid(values):
"""This function scales the gradient as if a signmoid was applied.
This is used in the Darknet Loss when the choosen box type is the scaled
coordinate type. This function is used to match the propagated gradient to
match that of the Darkent Yolov4 model. This is an Identity operation that
allows us to add some extra steps to the back propagation.
Args:
values: A tensor of any shape.
Returns:
values: The unaltered input tensor.
delta: A custom gradient function that adds the sigmoid step to the
backpropagation.
"""
def delta(dy):
t = tf.math.sigmoid(values)
return dy * t * (1 - t)
return values, delta
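# A minimal sanity check (illustrative, not part of the library): the forward
# pass of grad_sigmoid is the identity, while the backward pass scales
# incoming gradients by sigmoid(x) * (1 - sigmoid(x)), e.g. 0.25 at x = 0.
#
#   x = tf.constant([0.0, 2.0])
#   with tf.GradientTape() as tape:
#     tape.watch(x)
#     y = grad_sigmoid(x)
#   tape.gradient(y, x)  # ~[0.25, 0.105] even though y == x.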
class DarknetLoss(YoloLossBase):
"""This class implements the full logic for the standard Yolo models."""
def _build_per_path_attributes(self):
"""Paramterization of pair wise search and grid generators.
Objects created here are used for box decoding and dynamic ground truth
association.
"""
self._anchor_generator = loss_utils.GridGenerator(
masks=self._masks,
anchors=self._anchors,
scale_anchors=self._path_stride)
if self._ignore_thresh > 0.0:
self._search_pairs = loss_utils.PairWiseSearch(
iou_type='iou', any_match=True, min_conf=0.25)
return
def _compute_loss(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""Per FPN path loss logic used for Yolov3, Yolov4, and Yolo-Tiny."""
if self._box_type == 'scaled':
# The Darknet model propagates a sigmoid once in backprop, so we replicate
# that behavior.
y_pred = grad_sigmoid(y_pred)
# Generate and store constants and format output.
shape = tf.shape(true_counts)
batch_size, width, height, num = shape[0], shape[1], shape[2], shape[3]
fwidth = tf.cast(width, tf.float32)
fheight = tf.cast(height, tf.float32)
grid_points, anchor_grid = self._anchor_generator(
width, height, batch_size, dtype=tf.float32)
# Cast all input components to float32 and stop gradient to save memory.
boxes = tf.stop_gradient(tf.cast(boxes, tf.float32))
classes = tf.stop_gradient(tf.cast(classes, tf.float32))
y_true = tf.stop_gradient(tf.cast(y_true, tf.float32))
true_counts = tf.stop_gradient(tf.cast(true_counts, tf.float32))
true_conf = tf.stop_gradient(tf.clip_by_value(true_counts, 0.0, 1.0))
grid_points = tf.stop_gradient(grid_points)
anchor_grid = tf.stop_gradient(anchor_grid)
# Split all the ground truths to use as separate items in loss computation.
(true_box, ind_mask, true_class, _, _) = tf.split(
y_true, [4, 1, 1, 1, 1], axis=-1)
true_conf = tf.squeeze(true_conf, axis=-1)
true_class = tf.squeeze(true_class, axis=-1)
grid_mask = true_conf
# Splits all predictions.
y_pred = tf.cast(
tf.reshape(y_pred, [batch_size, width, height, num, -1]), tf.float32)
pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)
# Decode the boxes to be used for loss compute.
_, _, pred_box = self._decode_boxes(
fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=True)
# If the ignore threshold is enabled, search all boxes and ignore all
# predictions with IOU values larger than the ignore threshold that are
# not in the noted ground truth list.
if self._ignore_thresh != 0.0:
(true_conf, obj_mask) = self._tiled_global_box_search(
pred_box,
tf.stop_gradient(tf.sigmoid(pred_class)),
boxes,
classes,
true_conf,
smoothed=self._objectness_smooth > 0)
# Build the one hot class list that is used for the class loss.
true_class = tf.one_hot(
tf.cast(true_class, tf.int32),
depth=tf.shape(pred_class)[-1],
dtype=pred_class.dtype)
true_classes = tf.stop_gradient(loss_utils.apply_mask(ind_mask, true_class))
# Reorganize the one hot class list as a grid.
true_class = loss_utils.build_grid(
inds, true_classes, pred_class, ind_mask, update=False)
true_class = tf.stop_gradient(true_class)
# Use the class mask to find the number of objects located in
# each predicted grid cell/pixel.
counts = true_class
counts = tf.reduce_sum(counts, axis=-1, keepdims=True)
reps = tf.gather_nd(counts, inds, batch_dims=1)
reps = tf.squeeze(reps, axis=-1)
reps = tf.stop_gradient(tf.where(reps == 0.0, tf.ones_like(reps), reps))
# Compute the loss for only the cells in which the boxes are located.
pred_box = loss_utils.apply_mask(ind_mask,
tf.gather_nd(pred_box, inds, batch_dims=1))
iou, _, box_loss = self.box_loss(true_box, pred_box, darknet=True)
box_loss = loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), box_loss)
box_loss = math_ops.divide_no_nan(box_loss, reps)
box_loss = tf.cast(tf.reduce_sum(box_loss, axis=1), dtype=y_pred.dtype)
# Compute the sigmoid binary cross entropy for the class maps.
class_loss = tf.reduce_mean(
loss_utils.sigmoid_bce(
tf.expand_dims(true_class, axis=-1),
tf.expand_dims(pred_class, axis=-1), self._label_smoothing),
axis=-1)
# Apply normalization to the class losses.
if self._cls_normalizer < 1.0:
# Build a mask based on the true class locations.
cls_norm_mask = true_class
# Apply the class weight to class indexes where one_hot is one.
class_loss *= ((1 - cls_norm_mask) + cls_norm_mask * self._cls_normalizer)
# Mask the class loss and compute the sum over all the objects.
class_loss = tf.reduce_sum(class_loss, axis=-1)
class_loss = loss_utils.apply_mask(grid_mask, class_loss)
class_loss = math_ops.rm_nan_inf(class_loss, val=0.0)
class_loss = tf.cast(
tf.reduce_sum(class_loss, axis=(1, 2, 3)), dtype=y_pred.dtype)
# Compute the sigmoid binary cross entropy for the confidence maps.
bce = tf.reduce_mean(
loss_utils.sigmoid_bce(
tf.expand_dims(true_conf, axis=-1), pred_conf, 0.0),
axis=-1)
# Mask the confidence loss and take the sum across all the grid cells.
if self._ignore_thresh != 0.0:
bce = loss_utils.apply_mask(obj_mask, bce)
conf_loss = tf.cast(tf.reduce_sum(bce, axis=(1, 2, 3)), dtype=y_pred.dtype)
# Apply the weights to each loss.
box_loss *= self._iou_normalizer
conf_loss *= self._obj_normalizer
# Add all the losses together then take the mean over the batches.
loss = box_loss + class_loss + conf_loss
loss = tf.reduce_mean(loss)
# Reduce the mean of the losses to use as a metric.
box_loss = tf.reduce_mean(box_loss)
conf_loss = tf.reduce_mean(conf_loss)
class_loss = tf.reduce_mean(class_loss)
return (loss, box_loss, conf_loss, class_loss, loss, iou, pred_conf,
ind_mask, grid_mask)
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
"""This method is not specific to each loss path, but each loss type."""
return loss / num_replicas_in_sync
class ScaledLoss(YoloLossBase):
"""This class implements the full logic for the scaled Yolo models."""
def _build_per_path_attributes(self):
"""Paramterization of pair wise search and grid generators.
Objects created here are used for box decoding and dynamic ground truth
association.
"""
self._anchor_generator = loss_utils.GridGenerator(
masks=self._masks,
anchors=self._anchors,
scale_anchors=self._path_stride)
if self._ignore_thresh > 0.0:
self._search_pairs = loss_utils.PairWiseSearch(
iou_type=self._loss_type, any_match=False, min_conf=0.25)
return
def _compute_loss(self, true_counts, inds, y_true, boxes, classes, y_pred):
"""Per FPN path loss logic for Yolov4-csp, Yolov4-Large, and Yolov5."""
# Generate shape constants.
shape = tf.shape(true_counts)
batch_size, width, height, num = shape[0], shape[1], shape[2], shape[3]
fwidth = tf.cast(width, tf.float32)
fheight = tf.cast(height, tf.float32)
# Cast all input components to float32 and stop gradient to save memory.
y_true = tf.cast(y_true, tf.float32)
true_counts = tf.cast(true_counts, tf.float32)
true_conf = tf.clip_by_value(true_counts, 0.0, 1.0)
grid_points, anchor_grid = self._anchor_generator(
width, height, batch_size, dtype=tf.float32)
# Split the y_true list.
(true_box, ind_mask, true_class, _, _) = tf.split(
y_true, [4, 1, 1, 1, 1], axis=-1)
grid_mask = true_conf = tf.squeeze(true_conf, axis=-1)
true_class = tf.squeeze(true_class, axis=-1)
num_objs = tf.cast(tf.reduce_sum(ind_mask), dtype=y_pred.dtype)
# Split up the predictions.
y_pred = tf.cast(
tf.reshape(y_pred, [batch_size, width, height, num, -1]), tf.float32)
pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)
# Decode the boxes for loss compute.
scale, pred_box, _ = self._decode_boxes(
fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=False)
# If the ignore threshold is enabled, search all boxes and ignore all
# predictions with IOU values larger than the ignore threshold that are
# not in the noted ground truth list.
if self._ignore_thresh != 0.0:
(_, obj_mask) = self._tiled_global_box_search(
pred_box,
tf.stop_gradient(tf.sigmoid(pred_class)),
boxes,
classes,
true_conf,
smoothed=False,
scale=scale)
# Scale, shift, and select the ground truth boxes and predictions into
# the prediction domain.
offset = tf.cast(
tf.gather_nd(grid_points, inds, batch_dims=1), true_box.dtype)
offset = tf.concat([offset, tf.zeros_like(offset)], axis=-1)
true_box = loss_utils.apply_mask(ind_mask, (scale * true_box) - offset)
pred_box = loss_utils.apply_mask(ind_mask,
tf.gather_nd(pred_box, inds, batch_dims=1))
# Select the correct/used prediction classes.
true_class = tf.one_hot(
tf.cast(true_class, tf.int32),
depth=tf.shape(pred_class)[-1],
dtype=pred_class.dtype)
true_class = loss_utils.apply_mask(ind_mask, true_class)
pred_class = loss_utils.apply_mask(
ind_mask, tf.gather_nd(pred_class, inds, batch_dims=1))
# Compute the box loss.
_, iou, box_loss = self.box_loss(true_box, pred_box, darknet=False)
box_loss = loss_utils.apply_mask(tf.squeeze(ind_mask, axis=-1), box_loss)
box_loss = math_ops.divide_no_nan(tf.reduce_sum(box_loss), num_objs)
# Use the box IOU to build the map for confidence loss computation.
iou = tf.maximum(tf.stop_gradient(iou), 0.0)
smoothed_iou = ((
(1 - self._objectness_smooth) * tf.cast(ind_mask, iou.dtype)) +
self._objectness_smooth * tf.expand_dims(iou, axis=-1))
smoothed_iou = loss_utils.apply_mask(ind_mask, smoothed_iou)
true_conf = loss_utils.build_grid(
inds, smoothed_iou, pred_conf, ind_mask, update=self._update_on_repeat)
true_conf = tf.squeeze(true_conf, axis=-1)
# Compute the cross entropy loss for the confidence map.
bce = tf.keras.losses.binary_crossentropy(
tf.expand_dims(true_conf, axis=-1), pred_conf, from_logits=True)
if self._ignore_thresh != 0.0:
bce = loss_utils.apply_mask(obj_mask, bce)
conf_loss = tf.reduce_mean(bce)
# Compute the cross entropy loss for the class maps.
class_loss = tf.keras.losses.binary_crossentropy(
true_class,
pred_class,
label_smoothing=self._label_smoothing,
from_logits=True)
class_loss = loss_utils.apply_mask(
tf.squeeze(ind_mask, axis=-1), class_loss)
class_loss = math_ops.divide_no_nan(tf.reduce_sum(class_loss), num_objs)
# Apply the weights to each loss.
box_loss *= self._iou_normalizer
class_loss *= self._cls_normalizer
conf_loss *= self._obj_normalizer
# Add all the losses together then take the sum over the batches.
mean_loss = box_loss + class_loss + conf_loss
loss = mean_loss * tf.cast(batch_size, mean_loss.dtype)
return (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf,
ind_mask, grid_mask)
def post_path_aggregation(self, loss, ground_truths, predictions):
"""This method allows for post processing of a loss value.
By default the model will have 3 FPN levels {3, 4, 5}; on larger models
that have 4 or 5 FPN levels the loss needs to be scaled such that the
total update has the same effective magnitude as for the model with 3 FPN
levels. This helps to prevent gradient explosions.
Args:
loss: `tf.float` scalar for the actual loss.
ground_truths: `Dict` holding all the ground truth tensors.
predictions: `Dict` holding all the predicted values.
Returns:
loss: `tf.float` scalar for the scaled loss.
"""
scale = tf.stop_gradient(3 / len(list(predictions.keys())))
return loss * scale
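# For example: with the default 3 FPN levels {3, 4, 5} the scale is
# 3 / 3 = 1.0 (a no-op), while a larger model with 5 levels would get
# 3 / 5 = 0.6, shrinking the summed per-level losses back to the effective
# magnitude of the 3-level model.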
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
"""In the scaled loss, take the sum of the loss across replicas."""
return loss
class YoloLoss:
"""This class implements the aggregated loss across YOLO model FPN levels."""
def __init__(self,
keys,
classes,
anchors,
masks=None,
path_strides=None,
truth_thresholds=None,
ignore_thresholds=None,
loss_types=None,
iou_normalizers=None,
cls_normalizers=None,
obj_normalizers=None,
objectness_smooths=None,
box_types=None,
scale_xys=None,
max_deltas=None,
label_smoothing=0.0,
use_scaled_loss=False,
update_on_repeat=True):
"""Loss Function Initialization.
Args:
keys: `List[str]` indicating the name of the FPN paths that need to be
optimized.
classes: `int` for the number of classes.
anchors: `List[List[int]]` for the anchor boxes that are used in the model
at all levels. For anchor free prediction set the anchor list to be the
same as the image resolution.
masks: `Dict[str, List[int]]` for the anchor indices used at each FPN
output level.
path_strides: `Dict[int]` for how much to scale this level to get the
original input shape for each FPN path.
truth_thresholds: `Dict[float]` for the IOU value over which the loss is
propagated despite a detection being made for each FPN path.
ignore_thresholds: `Dict[float]` for the IOU value over which the loss is
not propagated, and a detection is assumed to have been made for each
FPN path.
loss_types: `Dict[str]` for the type of IOU loss to use, one of {ciou,
diou, giou, iou}, for each FPN path.
iou_normalizers: `Dict[float]` for how much to scale the loss on the IOU
or the boxes for each FPN path.
cls_normalizers: `Dict[float]` for how much to scale the loss on the
classes for each FPN path.
obj_normalizers: `Dict[float]` for how much to scale loss on the detection
map for each FPN path.
objectness_smooths: `Dict[float]` for how much to smooth the loss on the
detection map for each FPN path.
box_types: `Dict[bool]` for which scaling type to use for each FPN path.
scale_xys: `Dict[float]` values indicating how far each pixel can see
outside of its containment of 1.0. A value of 1.2 indicates there is a
20% extended radius around each pixel within which this specific pixel
can predict a box center. The center can range from 0 - value/2 to
1 + value/2. This value is set in the yolo filter and reused here. There
should be one value of scale_xy for each level from min_level to
max_level, one for each FPN path.
max_deltas: `Dict[float]` for gradient clipping to apply to the box loss
for each FPN path.
label_smoothing: `float` for how much to smooth the loss on the classes;
shared across all FPN paths.
use_scaled_loss: `bool` for whether to use the scaled loss or the
traditional loss.
update_on_repeat: `bool` for whether to replace with the newest or the
best value when an index is consumed by multiple objects.
"""
losses = {'darknet': DarknetLoss, 'scaled': ScaledLoss}
if use_scaled_loss:
loss_type = 'scaled'
else:
loss_type = 'darknet'
self._loss_dict = {}
for key in keys:
self._loss_dict[key] = losses[loss_type](
classes=classes,
anchors=anchors,
mask=masks[key],
truth_thresh=truth_thresholds[key],
ignore_thresh=ignore_thresholds[key],
loss_type=loss_types[key],
iou_normalizer=iou_normalizers[key],
cls_normalizer=cls_normalizers[key],
obj_normalizer=obj_normalizers[key],
box_type=box_types[key],
objectness_smooth=objectness_smooths[key],
max_delta=max_deltas[key],
path_stride=path_strides[key],
scale_x_y=scale_xys[key],
update_on_repeat=update_on_repeat,
label_smoothing=label_smoothing)
def __call__(self, ground_truth, predictions, use_reduced_logs=True):
metric_dict = collections.defaultdict(dict)
metric_dict['net']['box'] = 0
metric_dict['net']['class'] = 0
metric_dict['net']['conf'] = 0
loss_val, metric_loss = 0, 0
num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync
for key in predictions.keys():
(loss, loss_box, loss_conf, loss_class, mean_loss, avg_iou,
avg_obj) = self._loss_dict[key](ground_truth['true_conf'][key],
ground_truth['inds'][key],
ground_truth['upds'][key],
ground_truth['bbox'],
ground_truth['classes'],
predictions[key])
# After computing the loss, scale it as needed for aggregation across
# FPN levels.
loss = self._loss_dict[key].post_path_aggregation(
loss, ground_truth, predictions)
# After scaling the loss on each replica, handle scaling the loss for
# merging the loss across replicas.
loss = self._loss_dict[key].cross_replica_aggregation(
loss, num_replicas_in_sync)
loss_val += loss
# Detach all the values below: none of them should make a contribution
# to the gradient from this point forward.
metric_loss += tf.stop_gradient(mean_loss)
metric_dict[key]['loss'] = tf.stop_gradient(mean_loss)
metric_dict[key]['avg_iou'] = tf.stop_gradient(avg_iou)
metric_dict[key]['avg_obj'] = tf.stop_gradient(avg_obj)
if not use_reduced_logs:
metric_dict[key]['conf_loss'] = tf.stop_gradient(loss_conf)
metric_dict[key]['box_loss'] = tf.stop_gradient(loss_box)
metric_dict[key]['class_loss'] = tf.stop_gradient(loss_class)
metric_dict['net']['box'] += tf.stop_gradient(loss_box)
metric_dict['net']['class'] += tf.stop_gradient(loss_class)
metric_dict['net']['conf'] += tf.stop_gradient(loss_conf)
return loss_val, metric_loss, metric_dict
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for yolo heads."""
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.projects.yolo.losses import yolo_loss
class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(True),
(False),
)
def test_loss_init(self, scaled):
"""Test creation of YOLO family models."""
def inpdict(input_shape, dtype=tf.float32):
inputs = {}
for key in input_shape:
inputs[key] = tf.ones(input_shape[key], dtype=dtype)
return inputs
tf.keras.backend.set_image_data_format('channels_last')
input_shape = {
'3': [1, 52, 52, 255],
'4': [1, 26, 26, 255],
'5': [1, 13, 13, 255]
}
classes = 80
masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
[133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
[348.0, 340.0]]
keys = ['3', '4', '5']
path_strides = {key: 2**int(key) for key in keys}
loss = yolo_loss.YoloLoss(
keys,
classes,
anchors,
masks=masks,
path_strides=path_strides,
truth_thresholds={key: 1.0 for key in keys},
ignore_thresholds={key: 0.7 for key in keys},
loss_types={key: 'ciou' for key in keys},
iou_normalizers={key: 0.05 for key in keys},
cls_normalizers={key: 0.5 for key in keys},
obj_normalizers={key: 1.0 for key in keys},
objectness_smooths={key: 1.0 for key in keys},
box_types={key: 'scaled' for key in keys},
scale_xys={key: 2.0 for key in keys},
max_deltas={key: 30.0 for key in keys},
label_smoothing=0.0,
use_scaled_loss=scaled,
update_on_repeat=True)
count = inpdict({
'3': [1, 52, 52, 3, 1],
'4': [1, 26, 26, 3, 1],
'5': [1, 13, 13, 3, 1]
})
ind = inpdict({
'3': [1, 300, 3],
'4': [1, 300, 3],
'5': [1, 300, 3]
}, tf.int32)
truths = inpdict({'3': [1, 300, 8], '4': [1, 300, 8], '5': [1, 300, 8]})
boxes = tf.ones([1, 300, 4], dtype=tf.float32)
classes = tf.ones([1, 300], dtype=tf.float32)
gt = {
'true_conf': count,
'inds': ind,
'upds': truths,
'bbox': boxes,
'classes': classes
}
_, _, _ = loss(gt, inpdict(input_shape))
if __name__ == '__main__':
tf.test.main()
......@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Contains definitions of Darknet Backbone Networks.
The models are inspired by ResNet and CSPNet.
Residual networks (ResNets) were proposed in:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
......@@ -390,7 +389,7 @@ class Darknet(tf.keras.Model):
norm_momentum=0.99,
norm_epsilon=0.001,
dilate=False,
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
**kwargs):
......@@ -507,10 +506,12 @@ class Darknet(tf.keras.Model):
self._default_dict['name'] = f'{name}_csp_down'
if self._dilate:
self._default_dict['dilation_rate'] = config.dilation_rate
degrid = int(tf.math.log(float(config.dilation_rate)) / tf.math.log(2.))
else:
self._default_dict['dilation_rate'] = 1
degrid = 0
# swap/add dilation
x, x_route = nn_blocks.CSPRoute(
filters=config.filters,
filter_scale=csp_filter_scale,
......@@ -518,7 +519,7 @@ class Darknet(tf.keras.Model):
**self._default_dict)(
inputs)
dilated_reps = config.repetitions - degrid
for i in range(dilated_reps):
self._default_dict['name'] = f'{name}_{i}'
x = nn_blocks.DarkResidual(
......@@ -528,8 +529,8 @@ class Darknet(tf.keras.Model):
x)
for i in range(dilated_reps, config.repetitions):
self._default_dict['dilation_rate'] = max(
1, self._default_dict['dilation_rate'] // 2)
self._default_dict[
'name'] = f"{name}_{i}_degridded_{self._default_dict['dilation_rate']}"
x = nn_blocks.DarkResidual(
......@@ -592,8 +593,8 @@ class Darknet(tf.keras.Model):
filters=config.filters, downsample=True, **self._default_dict)(
inputs)
dilated_reps = config.repetitions - self._default_dict[
'dilation_rate'] // 2 - 1
for i in range(dilated_reps):
self._default_dict['name'] = f'{name}_{i}'
x = nn_blocks.DarkResidual(
......@@ -661,12 +662,13 @@ class Darknet(tf.keras.Model):
@factory.register_backbone_builder('darknet')
def build_darknet(
input_specs: tf.keras.layers.InputSpec,
backbone_cfg: hyperparams.Config,
norm_activation_config: hyperparams.Config,
l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
"""Builds darknet."""
backbone_cfg = backbone_cfg.get()
model = Darknet(
model_id=backbone_cfg.model_id,
min_level=backbone_cfg.min_level,
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Feature Pyramid Network and Path Aggregation variants used in YOLO."""
import tensorflow as tf
......@@ -39,7 +38,7 @@ class YoloFPN(tf.keras.layers.Layer):
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
**kwargs):
......@@ -184,7 +183,7 @@ class YoloPAN(tf.keras.layers.Layer):
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
fpn_input=True,
......@@ -206,7 +205,7 @@ class YoloPAN(tf.keras.layers.Layer):
by zero.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
fpn_input: `bool`, for whether the input into this function is an FPN or
a backbone.
fpn_filter_scale: `int`, scaling factor for the FPN filters.
......@@ -374,7 +373,7 @@ class YoloDecoder(tf.keras.Model):
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
**kwargs):
......@@ -389,8 +388,8 @@ class YoloDecoder(tf.keras.Model):
use_fpn: `bool`, use the FPN found in the YoloV4 model.
use_spatial_attention: `bool`, use the spatial attention module.
csp_stack: `bool`, CSPize the FPN.
fpn_depth: `int`, number of layers to use in each FPN path if you choose
to use an FPN.
fpn_filter_scale: `int`, scaling factor for the FPN filters.
path_process_len: `int`, number of layers to use in each Decoder path.
max_level_process_len: `int`, number of layers to use in the largest
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Yolo heads."""
import tensorflow as tf
......@@ -30,10 +29,11 @@ class YoloHead(tf.keras.layers.Layer):
output_extras=0,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation=None,
smart_bias=False,
**kwargs):
"""Yolo Prediction Head initialization function.
......@@ -52,6 +52,7 @@ class YoloHead(tf.keras.layers.Layer):
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
activation: `str`, the activation function to use, typically leaky or
mish.
smart_bias: `bool`, whether or not to use smart bias.
**kwargs: keyword arguments to be passed.
"""
......@@ -68,6 +69,7 @@ class YoloHead(tf.keras.layers.Layer):
self._output_extras = output_extras
self._output_conv = (classes + output_extras + 5) * boxes_per_level
self._smart_bias = smart_bias
self._base_config = dict(
activation=activation,
......@@ -85,10 +87,29 @@ class YoloHead(tf.keras.layers.Layer):
use_bn=False,
**self._base_config)
def bias_init(self, scale, inshape, isize=640, no_per_conf=8):
def bias(shape, dtype):
init = tf.keras.initializers.Zeros()
base = init(shape, dtype=dtype)
if self._smart_bias:
base = tf.reshape(base, [self._boxes_per_level, -1])
box, conf, classes = tf.split(base, [4, 1, -1], axis=-1)
conf += tf.math.log(no_per_conf / ((isize / scale)**2))
classes += tf.math.log(0.6 / (self._classes - 0.99))
base = tf.concat([box, conf, classes], axis=-1)
base = tf.reshape(base, [-1])
return base
return bias
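# Worked example of the smart bias above (using the defaults isize = 640 and
# no_per_conf = 8): for the head at stride scale = 8, the confidence bias is
# log(8 / (640 / 8)**2) = log(8 / 6400) ~= -6.68, and with 80 classes the
# class bias is log(0.6 / (80 - 0.99)) ~= -4.88, so the freshly built head
# starts out predicting low objectness and near-uniform rare classes.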
def build(self, input_shape):
self._head = dict()
for key in self._key_list:
scale = 2**int(key)
self._head[key] = nn_blocks.ConvBN(
bias_initializer=self.bias_init(scale, input_shape[key][-1]),
**self._conv_config)
def call(self, inputs):
outputs = dict()
......@@ -107,6 +128,10 @@ class YoloHead(tf.keras.layers.Layer):
'Model has to be built before number of boxes can be determined.')
return (self._max_level - self._min_level + 1) * self._boxes_per_level
@property
def num_heads(self):
return self._max_level - self._min_level + 1
def get_config(self):
config = dict(
min_level=self._min_level,
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for yolo layer (detection layer)."""
import tensorflow as tf
from official.vision.beta.modeling.layers import detection_generator
from official.vision.beta.projects.yolo.losses import yolo_loss
from official.vision.beta.projects.yolo.ops import box_ops
from official.vision.beta.projects.yolo.ops import loss_utils
@tf.keras.utils.register_keras_serializable(package='yolo')
class YoloLayer(tf.keras.Model):
"""Yolo layer (detection generator)."""
def __init__(self,
masks,
anchors,
classes,
iou_thresh=0.0,
ignore_thresh=0.7,
truth_thresh=1.0,
nms_thresh=0.6,
max_delta=10.0,
loss_type='ciou',
iou_normalizer=1.0,
cls_normalizer=1.0,
obj_normalizer=1.0,
use_scaled_loss=False,
update_on_repeat=False,
pre_nms_points=5000,
label_smoothing=0.0,
max_boxes=200,
box_type='original',
path_scale=None,
scale_xy=None,
nms_type='greedy',
objectness_smooth=False,
**kwargs):
"""Parameters for the loss functions used at each detection head output.
Args:
masks: `Dict[str, List[int]]` mapping each model output level to the
indexes of the anchor boxes used at that level.
anchors: `List[List[int]]` for the anchor boxes that are used in the
model.
classes: `int` for the number of classes.
iou_thresh: `float`; multiple anchors are used per object when
IoU(Obj, Anchor) > iou_thresh.
ignore_thresh: `float` for the IOU value over which the loss is not
propagated, and a detection is assumed to have been made.
truth_thresh: `float` for the IOU value over which the loss is propagated
despite a detection being made.
nms_thresh: `float` for the minimum IOU value for an overlap.
max_delta: gradient clipping to apply to the box loss.
loss_type: `str` for the type of IOU loss to use, one of {ciou, diou,
giou, iou}.
iou_normalizer: `float` for how much to scale the loss on the IOU or the
boxes.
cls_normalizer: `float` for how much to scale the loss on the classes.
obj_normalizer: `float` for how much to scale loss on the detection map.
use_scaled_loss: `bool` for whether to use the scaled loss
or the traditional loss.
update_on_repeat: `bool` indicating how to handle repeated indexes in a
given [j, i] index. Setting this to True will give more consistent mAP;
setting it to False will improve recall by 1-2% but will sacrifice some
mAP.
pre_nms_points: `int` number of top candidate detections per class before
NMS.
label_smoothing: `float` for how much to smooth the loss on the classes.
max_boxes: `int` for the maximum number of boxes retained over all
classes.
box_type: `str`, one of 3 different box types that affect training
differently {original, scaled and anchor_free}. The original method
decodes the boxes by applying an exponential to the model width and
height maps, then scaling the maps by the anchor boxes. This method is
used in Yolo-v4, Yolo-v3, and all their counterparts. The scaled method
squares the width and height and scales both by a fixed factor of 4.
This method is used in the Scaled Yolo models, as well as Yolov4-CSP.
Finally, anchor_free is like the original method but applies no
activation function to the boxes; it is used for some of the newer
anchor free versions of YOLO.
path_scale: `dict` for the size of the input tensors. Defaults to
precalculated values from the `masks`.
scale_xy: dictionary of `float` values indicating how far each pixel can
see outside of its containment of 1.0. A value of 1.2 indicates there is
a 20% extended radius around each pixel within which this specific pixel
can predict a center; the center can range from 0 - value/2 to
1 + value/2. This value is set in the yolo filter and reused here. There
should be one value of scale_xy for each level from min_level to
max_level.
nms_type: `str` for which non max suppression to use.
objectness_smooth: `float` for how much to smooth the loss on the
detection map.
**kwargs: Additional keyword arguments.
"""
super().__init__(**kwargs)
self._masks = masks
self._anchors = anchors
self._thresh = iou_thresh
self._ignore_thresh = ignore_thresh
self._truth_thresh = truth_thresh
self._iou_normalizer = iou_normalizer
self._cls_normalizer = cls_normalizer
self._obj_normalizer = obj_normalizer
self._objectness_smooth = objectness_smooth
self._nms_thresh = nms_thresh
self._max_boxes = max_boxes
self._max_delta = max_delta
self._classes = classes
self._loss_type = loss_type
self._use_scaled_loss = use_scaled_loss
self._update_on_repeat = update_on_repeat
self._pre_nms_points = pre_nms_points
self._label_smoothing = label_smoothing
self._keys = list(masks.keys())
self._len_keys = len(self._keys)
self._box_type = box_type
self._path_scale = path_scale or {
key: 2**int(key) for key, _ in masks.items()
}
self._nms_type = nms_type
self._scale_xy = scale_xy or {key: 1.0 for key, _ in masks.items()}
self._generator = {}
self._len_mask = {}
for key in self._keys:
anchors = [self._anchors[mask] for mask in self._masks[key]]
self._generator[key] = self.get_generators(anchors, self._path_scale[key], # pylint: disable=assignment-from-none
key)
self._len_mask[key] = len(self._masks[key])
def get_generators(self, anchors, path_scale, path_key):
anchor_generator = loss_utils.GridGenerator(
anchors, scale_anchors=path_scale)
return anchor_generator
def parse_prediction_path(self, key, inputs):
shape_ = tf.shape(inputs)
shape = inputs.get_shape().as_list()
batchsize, height, width = shape_[0], shape[1], shape[2]
if height is None or width is None:
height, width = shape_[1], shape_[2]
generator = self._generator[key]
len_mask = self._len_mask[key]
scale_xy = self._scale_xy[key]
# reshape the yolo output to (batchsize, height, width, number_anchors,
# remaining_points)
data = tf.reshape(inputs, [-1, height, width, len_mask, self._classes + 5])
# use the grid generator to get the formatted anchor boxes and grid points
# in shape [1, height, width, 2]
centers, anchors = generator(height, width, batchsize, dtype=data.dtype)
# split the yolo detections into boxes, object score map, classes
boxes, obns_scores, class_scores = tf.split(
data, [4, 1, self._classes], axis=-1)
# determine the number of classes
classes = class_scores.get_shape().as_list()[-1]
# configurable to use the new coordinates in scaled Yolo v4 or not
_, _, boxes = loss_utils.get_predicted_box(
tf.cast(height, data.dtype),
tf.cast(width, data.dtype),
boxes,
anchors,
centers,
scale_xy,
stride=self._path_scale[key],
darknet=False,
box_type=self._box_type[key])
# convert boxes from yolo (x, y, w, h) to tensorflow (ymin, xmin, ymax, xmax)
boxes = box_ops.xcycwh_to_yxyx(boxes)
# activate the objectness detection map
obns_scores = tf.math.sigmoid(obns_scores)
# convert detection map to class detection probabilities
class_scores = tf.math.sigmoid(class_scores) * obns_scores
# flatten predictions to [batchsize, N, -1] for non max suppression
fill = height * width * len_mask
boxes = tf.reshape(boxes, [-1, fill, 4])
class_scores = tf.reshape(class_scores, [-1, fill, classes])
obns_scores = tf.reshape(obns_scores, [-1, fill])
return obns_scores, boxes, class_scores
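# Editor's sketch of the shapes above (consistent with the unit test below):
# for level '3' with len_mask=3 anchors and classes=80, inputs[key] is
# [batch, 52, 52, 3 * (80 + 5)] = [batch, 52, 52, 255]. After decoding,
# fill = 52 * 52 * 3 = 8112, so boxes is [batch, 8112, 4], class_scores is
# [batch, 8112, 80] and obns_scores is [batch, 8112].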
def call(self, inputs):
boxes = []
class_scores = []
object_scores = []
levels = list(inputs.keys())
min_level = int(min(levels))
max_level = int(max(levels))
# aggregate boxes over each scale
for i in range(min_level, max_level + 1):
key = str(i)
object_scores_, boxes_, class_scores_ = self.parse_prediction_path(
key, inputs[key])
boxes.append(boxes_)
class_scores.append(class_scores_)
object_scores.append(object_scores_)
# collate all predictions
boxes = tf.concat(boxes, axis=1)
object_scores = tf.concat(object_scores, axis=1)
class_scores = tf.concat(class_scores, axis=1)
# get masks to threshold all the predictions
object_mask = tf.cast(object_scores > self._thresh, object_scores.dtype)
class_mask = tf.cast(class_scores > self._thresh, class_scores.dtype)
# apply the threshold masks to all the predictions
object_scores *= object_mask
class_scores *= (tf.expand_dims(object_mask, axis=-1) * class_mask)
# apply nms
if self._nms_type == 'greedy':
# greedy NMS
boxes = tf.cast(boxes, dtype=tf.float32)
class_scores = tf.cast(class_scores, dtype=tf.float32)
boxes, object_scores_, class_scores, num_detections = (
tf.image.combined_non_max_suppression(
tf.expand_dims(boxes, axis=-2),
class_scores,
self._pre_nms_points,
self._max_boxes,
iou_threshold=self._nms_thresh,
score_threshold=self._thresh))
# cast the boxes and predictions back to the original datatype
boxes = tf.cast(boxes, object_scores.dtype)
class_scores = tf.cast(class_scores, object_scores.dtype)
object_scores = tf.cast(object_scores_, object_scores.dtype)
else:
# TPU NMS
boxes = tf.cast(boxes, dtype=tf.float32)
class_scores = tf.cast(class_scores, dtype=tf.float32)
(boxes, confidence, classes,
num_detections) = detection_generator._generate_detections_v2( # pylint:disable=protected-access
tf.expand_dims(boxes, axis=-2),
class_scores,
pre_nms_top_k=self._pre_nms_points,
max_num_detections=self._max_boxes,
nms_iou_threshold=self._nms_thresh,
pre_nms_score_threshold=self._thresh)
boxes = tf.cast(boxes, object_scores.dtype)
class_scores = tf.cast(classes, object_scores.dtype)
object_scores = tf.cast(confidence, object_scores.dtype)
# format and return
return {
'bbox': boxes,
'classes': class_scores,
'confidence': object_scores,
'num_detections': num_detections,
}
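# Editor's note (shapes as exercised by the unit test below): 'bbox' is
# [batch, max_boxes, 4], 'classes' and 'confidence' are [batch, max_boxes],
# and 'num_detections' is [batch].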
@property
def losses(self):
"""Generates a dictionary of losses to apply to each path.
Done in the detection generator because all parameters are the same
across both loss and detection generator
"""
loss = yolo_loss.YoloLoss(
keys=self._keys,
classes=self._classes,
anchors=self._anchors,
masks=self._masks,
path_strides=self._path_scale,
truth_thresholds=self._truth_thresh,
ignore_thresholds=self._ignore_thresh,
loss_types=self._loss_type,
iou_normalizers=self._iou_normalizer,
cls_normalizers=self._cls_normalizer,
obj_normalizers=self._obj_normalizer,
objectness_smooths=self._objectness_smooth,
box_types=self._box_type,
max_deltas=self._max_delta,
scale_xys=self._scale_xy,
use_scaled_loss=self._use_scaled_loss,
update_on_repeat=self._update_on_repeat,
label_smoothing=self._label_smoothing)
return loss
def get_config(self):
return {
'masks': dict(self._masks),
'anchors': [list(a) for a in self._anchors],
'thresh': self._thresh,
'max_boxes': self._max_boxes,
}
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for yolo detection generator."""
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.projects.yolo.modeling.layers import detection_generator as dg
class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(True),
(False),
)
def test_network_creation(self, nms):
"""Test creation of ResNet family models."""
tf.keras.backend.set_image_data_format('channels_last')
input_shape = {
'3': [1, 52, 52, 255],
'4': [1, 26, 26, 255],
'5': [1, 13, 13, 255]
}
classes = 80
masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
[133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
[348.0, 340.0]]
box_type = {key: 'scaled' for key in masks.keys()}
layer = dg.YoloLayer(
masks, anchors, classes, box_type=box_type, max_boxes=10)
inputs = {}
for key in input_shape:
inputs[key] = tf.ones(input_shape[key], dtype=tf.float32)
endpoints = layer(inputs)
boxes = endpoints['bbox']
classes = endpoints['classes']
self.assertAllEqual(boxes.shape.as_list(), [1, 10, 4])
self.assertAllEqual(classes.shape.as_list(), [1, 10])
if __name__ == '__main__':
tf.test.main()
......@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Contains common building blocks for yolo neural networks."""
from typing import Callable, List
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.beta.ops import spatial_transform_ops
......@@ -49,7 +46,7 @@ class ConvBN(tf.keras.layers.Layer):
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -98,7 +95,14 @@ class ConvBN(tf.keras.layers.Layer):
self._strides = strides
self._padding = padding
self._dilation_rate = dilation_rate
self._kernel_initializer = kernel_initializer
if kernel_initializer == 'VarianceScaling':
# To match the PyTorch default initialization method.
self._kernel_initializer = tf.keras.initializers.VarianceScaling(
scale=1 / 3, mode='fan_in', distribution='uniform')
else:
self._kernel_initializer = kernel_initializer
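# Editor's note (derivation, assuming tf.keras VarianceScaling semantics):
# with distribution='uniform', samples come from Uniform(-limit, limit) with
# limit = sqrt(3 * scale / fan_in) = sqrt(3 * (1/3) / fan_in)
# = sqrt(1 / fan_in), which matches the bound used by PyTorch's default
# Conv2d initialization, kaiming_uniform_(a=sqrt(5)).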
self._bias_initializer = bias_initializer
self._kernel_regularizer = kernel_regularizer
......@@ -195,7 +199,7 @@ class DarkResidual(tf.keras.layers.Layer):
filters=1,
filter_scale=2,
dilation_rate=1,
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
kernel_regularizer=None,
bias_regularizer=None,
......@@ -367,7 +371,7 @@ class CSPTiny(tf.keras.layers.Layer):
def __init__(self,
filters=1,
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -533,7 +537,7 @@ class CSPRoute(tf.keras.layers.Layer):
filters,
filter_scale=2,
activation='mish',
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -549,7 +553,7 @@ class CSPRoute(tf.keras.layers.Layer):
Args:
filters: integer for output depth, or the number of features to learn
filter_scale: integer dicating (filters//2) or the number of filters in
filter_scale: integer dictating (filters//2) or the number of filters in
the partial feature stack.
activation: string for activation function to use in layer.
kernel_initializer: string to indicate which function to use to
......@@ -662,7 +666,7 @@ class CSPConnect(tf.keras.layers.Layer):
drop_first=False,
activation='mish',
kernel_size=(1, 1),
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -676,8 +680,8 @@ class CSPConnect(tf.keras.layers.Layer):
"""Initializer for CSPConnect block.
Args:
filters: integer for output depth, or the number of features to learn
filter_scale: integer dicating (filters//2) or the number of filters in
filters: integer for output depth, or the number of features to learn.
filter_scale: integer dictating (filters//2) or the number of filters in
the partial feature stack.
drop_final: `bool`, whether to drop final conv layer.
drop_first: `bool`, whether to drop first conv layer.
......@@ -762,122 +766,6 @@ class CSPConnect(tf.keras.layers.Layer):
return x
class CSPStack(tf.keras.layers.Layer):
"""CSP Stack layer.
CSP full stack; combines the route and the connect so you can quickly wrap
an existing callable or list of layers and make it a cross stage partial.
Added for ease of use. You should be able to wrap any layer stack with a
CSP independent of whether it belongs to the Darknet family. If
filter_scale = 2, then the blocks in the stack passed into the CSP stack
should also have filters = filters / filter_scale. See the usage sketch
after `call` below.
Cross Stage Partial networks (CSPNets) were proposed in:
[1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu,
Ping-Yang Chen, Jun-Wei Hsieh
CSPNet: A New Backbone that can Enhance Learning Capability of CNN.
arXiv:1911.11929
"""
def __init__(self,
filters,
model_to_wrap=None,
filter_scale=2,
activation='mish',
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
downsample=True,
use_bn=True,
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""CSPStack layer initializer.
Args:
filters: integer for output depth, or the number of features to learn.
model_to_wrap: callable Model or a list of callable objects that will
process the output of CSPRoute, and be input into CSPConnect.
list will be called sequentially.
filter_scale: integer dictating (filters//2) or the number of filters in
the partial feature stack.
activation: string for activation function to use in layer.
kernel_initializer: string to indicate which function to use to initialize
weights.
bias_initializer: string to indicate which function to use to initialize
bias.
bias_regularizer: string to indicate which function to use to regularize
the bias.
kernel_regularizer: string to indicate which function to use to
regularize the weights.
downsample: down_sample the input.
use_bn: boolean for whether to use batch normalization.
use_sync_bn: boolean for whether to sync the batch normalization
statistics of all batch norm layers to the model's global statistics
(across all input batches).
norm_momentum: float for the momentum to use for batch normalization.
norm_epsilon: float for batch normalization epsilon.
**kwargs: Keyword Arguments.
Raises:
TypeError: if model_to_wrap is neither a callable nor a list of layers.
"""
super().__init__(**kwargs)
# layer params
self._filters = filters
self._filter_scale = filter_scale
self._activation = activation
self._downsample = downsample
# convolution params
self._kernel_initializer = kernel_initializer
self._bias_initializer = bias_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._use_bn = use_bn
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
if model_to_wrap is None:
self._model_to_wrap = []
elif isinstance(model_to_wrap, Callable):
self._model_to_wrap = [model_to_wrap]
elif isinstance(model_to_wrap, List):
self._model_to_wrap = model_to_wrap
else:
raise TypeError(
'the input to the CSPStack must be a list of layers that we can '
'iterate through, or a callable')
def build(self, input_shape):
dark_conv_args = {
'filters': self._filters,
'filter_scale': self._filter_scale,
'activation': self._activation,
'kernel_initializer': self._kernel_initializer,
'bias_initializer': self._bias_initializer,
'bias_regularizer': self._bias_regularizer,
'use_bn': self._use_bn,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon,
'kernel_regularizer': self._kernel_regularizer,
}
self._route = CSPRoute(downsample=self._downsample, **dark_conv_args)
self._connect = CSPConnect(**dark_conv_args)
def call(self, inputs, training=None):
x, x_route = self._route(inputs)
for layer in self._model_to_wrap:
x = layer(x)
x = self._connect([x, x_route])
return x
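# Editor's usage sketch (mirrors CSPStackTest below; DarkResidual and
# CSPStack are defined in this file):
#
#   blocks = [DarkResidual(filters=32, filter_scale=2) for _ in range(2)]
#   csp = CSPStack(filters=64, filter_scale=2, model_to_wrap=blocks)
#   y = csp(tf.ones([1, 224, 224, 64]))  # downsample=True -> [1, 112, 112, 64]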
@tf.keras.utils.register_keras_serializable(package='yolo')
class PathAggregationBlock(tf.keras.layers.Layer):
"""Path Aggregation block."""
......@@ -885,7 +773,7 @@ class PathAggregationBlock(tf.keras.layers.Layer):
def __init__(self,
filters=1,
drop_final=True,
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -1121,7 +1009,7 @@ class SAM(tf.keras.layers.Layer):
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -1193,7 +1081,7 @@ class CAM(tf.keras.layers.Layer):
def __init__(self,
reduction_ratio=1.0,
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -1286,7 +1174,7 @@ class CBAM(tf.keras.layers.Layer):
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
kernel_initializer='glorot_uniform',
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
......@@ -1355,27 +1243,26 @@ class DarkRouteProcess(tf.keras.layers.Layer):
insert_spp = False)(x)
"""
def __init__(
self,
filters=2,
repetitions=2,
insert_spp=False,
insert_sam=False,
insert_cbam=False,
csp_stack=0,
csp_scale=2,
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
block_invert=False,
activation='leaky',
leaky_alpha=0.1,
spp_keys=None,
**kwargs):
def __init__(self,
filters=2,
repetitions=2,
insert_spp=False,
insert_sam=False,
insert_cbam=False,
csp_stack=0,
csp_scale=2,
kernel_initializer='VarianceScaling',
bias_initializer='zeros',
bias_regularizer=None,
kernel_regularizer=None,
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
block_invert=False,
activation='leaky',
leaky_alpha=0.1,
spp_keys=None,
**kwargs):
"""DarkRouteProcess initializer.
Args:
......
......@@ -106,86 +106,6 @@ class CSPRouteTest(tf.test.TestCase, parameterized.TestCase):
self.assertNotIn(None, grad)
class CSPStackTest(tf.test.TestCase, parameterized.TestCase):
def build_layer(self, layer_type, filters, filter_scale, count, stack_type,
downsample):
if stack_type is not None:
layers = []
if layer_type == 'residual':
for _ in range(count):
layers.append(
nn_blocks.DarkResidual(
filters=filters // filter_scale, filter_scale=filter_scale))
else:
for _ in range(count):
layers.append(nn_blocks.ConvBN(filters=filters))
if stack_type == 'model':
layers = tf.keras.Sequential(layers=layers)
else:
layers = None
stack = nn_blocks.CSPStack(
filters=filters,
filter_scale=filter_scale,
downsample=downsample,
model_to_wrap=layers)
return stack
@parameterized.named_parameters(
('no_stack', 224, 224, 64, 2, 'residual', None, 0, True),
('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True),
('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False),
('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False))
def test_pass_through(self, width, height, filters, mod, layer_type,
stack_type, count, downsample):
x = tf.keras.Input(shape=(width, height, filters))
test_layer = self.build_layer(layer_type, filters, mod, count, stack_type,
downsample)
outx = test_layer(x)
if downsample:
self.assertAllEqual(outx.shape.as_list(),
[None, width // 2, height // 2, filters])
else:
self.assertAllEqual(outx.shape.as_list(), [None, width, height, filters])
@parameterized.named_parameters(
('no_stack', 224, 224, 64, 2, 'residual', None, 0, True),
('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True),
('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False),
('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False))
def test_gradient_pass_though(self, width, height, filters, mod, layer_type,
stack_type, count, downsample):
loss = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.SGD()
init = tf.random_normal_initializer()
x = tf.Variable(
initial_value=init(shape=(1, width, height, filters), dtype=tf.float32))
if not downsample:
y = tf.Variable(
initial_value=init(
shape=(1, width, height, filters), dtype=tf.float32))
else:
y = tf.Variable(
initial_value=init(
shape=(1, width // 2, height // 2, filters), dtype=tf.float32))
test_layer = self.build_layer(layer_type, filters, mod, count, stack_type,
downsample)
with tf.GradientTape() as tape:
x_hat = test_layer(x)
grad_loss = loss(x_hat, y)
grad = tape.gradient(grad_loss, test_layer.trainable_variables)
optimizer.apply_gradients(zip(grad, test_layer.trainable_variables))
self.assertNotIn(None, grad)
class ConvBNTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(
......