Commit 6a55ecde authored by A. Unique TensorFlower's avatar A. Unique TensorFlower
Browse files

Merge pull request #10286 from PurdueDualityLab:task_pr

PiperOrigin-RevId: 402338060
parents 2d353306 379d64c5
......@@ -73,10 +73,14 @@ connected to a new, more powerful backbone if a person chose to.
| Yolo-v3 spp |
| Yolo-v4 |
| Yolo-v4 tiny |
| Yolo-v4 csp |
| Yolo-v4 large |
## Model Zoo
## Requirements
[![TensorFlow 2.2](https://img.shields.io/badge/TensorFlow-2.2-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
## Requirements
[![TensorFlow 2.6](https://img.shields.io/badge/TensorFlow-2.6-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0)
[![Python 3.8](https://img.shields.io/badge/Python-3.8-3776AB)](https://www.python.org/downloads/release/python-380/)
......
......@@ -15,7 +15,22 @@
"""All necessary imports for registration."""
# pylint: disable=unused-import
# pylint: disable=g-bad-import-order
from official.common import registry_imports
# import configs
from official.vision.beta.projects.yolo.configs import darknet_classification
from official.vision.beta.projects.yolo.configs import yolo as yolo_config
# import modeling components
from official.vision.beta.projects.yolo.modeling.backbones import darknet
from official.vision.beta.projects.yolo.modeling.decoders import yolo_decoder
# import tasks
from official.vision.beta.projects.yolo.tasks import image_classification
from official.vision.beta.projects.yolo.tasks import yolo as yolo_task
# import optimization packages
from official.vision.beta.projects.yolo.optimization import optimizer_factory
from official.vision.beta.projects.yolo.optimization.configs import optimizer_config
from official.vision.beta.projects.yolo.optimization.configs import optimization_config
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoders configurations."""
import dataclasses
from typing import Optional
from official.modeling import hyperparams
from official.vision.beta.configs import decoders
@dataclasses.dataclass
class YoloDecoder(hyperparams.Config):
  """Builds Yolo decoder.

  If the name is specified, or version is specified we ignore input parameters
  and use version and name defaults.
  """
  # Model version key (e.g. 'v3', 'v4'); None means the explicit structure
  # parameters below are used directly.
  version: Optional[str] = None
  # Variant within the version (e.g. 'regular', 'tiny', 'csp', 'large').
  type: Optional[str] = None
  use_fpn: Optional[bool] = None
  use_spatial_attention: bool = False
  use_separable_conv: bool = False
  # NOTE(review): set to integer stack depths (5, 7) by the model presets —
  # annotation is likely meant to be Optional[int]; confirm before changing,
  # since Config may validate overrides against it.
  csp_stack: Optional[bool] = None
  fpn_depth: Optional[int] = None
  fpn_filter_scale: Optional[int] = None
  path_process_len: Optional[int] = None
  max_level_process_len: Optional[int] = None
  # Whether to embed a spatial-pyramid-pooling block.
  embed_spp: Optional[bool] = None
  # NOTE(review): 'same' is padding terminology, not an activation name —
  # presumably a sentinel meaning "inherit the model-wide activation"; confirm.
  activation: Optional[str] = 'same'
@dataclasses.dataclass
class Decoder(decoders.Decoder):
  """OneOf-style decoder config that selects the YOLO decoder by default."""
  type: Optional[str] = 'yolo_decoder'
  # Class-level Config instance as default follows the model-garden config
  # convention used throughout this file.
  yolo_decoder: YoloDecoder = YoloDecoder()
# --experiment_type=scaled_yolo
# mAP 47.6
# Nesting reconstructed to match the YoloTask/Yolo config schema.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
task:
  model:
    input_size: [640, 640, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'altered_cspdarknet53'
        max_level: 5
        min_level: 3
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.60
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.3
      # Objectness loss weighted per FPN level.
      object_normalizer:
        '5': 0.28
        '4': 0.70
        '3': 2.80
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 3
      boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
              box: [36, 75], box: [76, 55], box: [72, 146],
              box: [142, 110], box: [192, 243], box: [459, 401]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.0
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.25
        aug_scale_min: 0.1
        aug_scale_max: 1.9
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.1
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
# --experiment_type=yolo_darknet
# mAP 43.0
# Nesting reconstructed to match the YoloTask/Yolo config schema.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  smart_bias_lr: 0.0
  model:
    darknet_based_model: true
    input_size: [512, 512, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'cspdarknet53'
        max_level: 5
        min_level: 3
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: regular
        activation: leaky
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': original
      # Per-level grid-sensitivity scaling.
      scale_xy:
        '5': 1.05
        '4': 1.1
        '3': 1.2
      max_boxes: 200
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.60
    loss:
      use_scaled_loss: false
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.7
      iou_normalizer:
        'all': 0.07
      cls_normalizer:
        'all': 1.0
      object_normalizer:
        'all': 1.0
      objectness_smooth:
        'all': 0.0
      max_delta:
        'all': 5.0
    norm_activation:
      activation: mish
      norm_epsilon: 0.0001
      norm_momentum: 0.99
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 3
      boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
              box: [36, 75], box: [76, 55], box: [72, 146],
              box: [142, 110], box: [192, 243], box: [459, 401]]
  train_data:
    global_batch_size: 64
    dtype: float32
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    is_training: true
    drop_remainder: true
    seed: 1000
    parser:
      # Mosaic-level scale/jitter values differ from the parser-level ones
      # below; they belong to the Mosaic sub-config.
      mosaic:
        mosaic_frequency: 0.75
        mixup_frequency: 0.0
        mosaic_crop_mode: 'crop'
        mosaic_center: 0.2
        aug_scale_min: 0.2
        aug_scale_max: 1.6
        jitter: 0.3
      max_num_instances: 200
      letter_box: false
      random_flip: true
      aug_rand_saturation: 1.5
      aug_rand_brightness: 1.5
      aug_rand_hue: 0.1
      aug_scale_min: 0.1
      aug_scale_max: 1.9
      aug_rand_translate: 0.0
      jitter: 0.3
      area_thresh: 0.1
      random_pad: true
      use_tie_breaker: true
      anchor_thresh: 0.4
  validation_data:
    global_batch_size: 8
    dtype: float32
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
    is_training: false
    drop_remainder: true
    parser:
      max_num_instances: 200
      letter_box: false
      use_tie_breaker: true
      anchor_thresh: 0.4
  weight_decay: 0.000
  init_checkpoint: 'gs://tf_model_garden/vision/yolo/ckpt-15000'
  init_checkpoint_modules: 'backbone'
  annotation_file: null
trainer:
  train_steps: 555000
  validation_steps: 625
  steps_per_loop: 1850
  summary_interval: 1850
  validation_interval: 9250
  checkpoint_interval: 1850
  optimizer_config:
    ema:
      average_decay: 0.9998
      trainable_weights_only: false
      dynamic_decay: true
    learning_rate:
      type: stepwise
      stepwise:
        boundaries: [400000]
        name: PiecewiseConstantDecay
        values: [0.00131, 0.000131]
    optimizer:
      type: sgd_torch
      sgd_torch:
        momentum: 0.949
        momentum_start: 0.949
        nesterov: true
        warmup_steps: 1000
        weight_decay: 0.0005
        name: SGD
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1000 # learning rate rises from 0 to 0.0013 over 1000 steps
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""YOLO configuration definition."""
import dataclasses
import os
from typing import Any, List, Optional, Union
import numpy as np
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.vision.beta.configs import common
from official.vision.beta.projects.yolo import optimization
from official.vision.beta.projects.yolo.configs import backbones
from official.vision.beta.projects.yolo.configs import decoders
# pytype: disable=annotation-type-mismatch

# Inclusive range of FPN output levels a YOLO model may emit.
MIN_LEVEL = 1
MAX_LEVEL = 7
# Default deterministic seed used by the task configs.
GLOBAL_SEED = 1000
def _build_dict(min_level, max_level, value):
vals = {str(key): value for key in range(min_level, max_level + 1)}
vals['all'] = None
return lambda: vals
def _build_path_scales(min_level, max_level):
return lambda: {str(key): 2**key for key in range(min_level, max_level + 1)}
@dataclasses.dataclass
class FPNConfig(hyperparams.Config):
  """FPN config holding one value per level plus an optional 'all' override."""
  all: Optional[Any] = None

  def get(self):
    """Allow for a key for each level or a single key for all the levels."""
    values = self.as_dict()
    override = values.get('all')
    if override is None:
      return values
    # A non-None 'all' wins over every per-level entry.
    return {key: (val if key == 'all' else override)
            for key, val in values.items()}
# pylint: disable=missing-class-docstring
@dataclasses.dataclass
class TfExampleDecoder(hyperparams.Config):
  """Config for the plain TF Example decoder."""
  regenerate_source_id: bool = False
  # Remap the sparse 91-id COCO label space onto the contiguous 80-class set.
  coco91_to_80: bool = True
@dataclasses.dataclass
class TfExampleDecoderLabelMap(hyperparams.Config):
  """Config for a TF Example decoder driven by an external label map file."""
  regenerate_source_id: bool = False
  # Path to the label map; empty string means unset.
  label_map: str = ''
@dataclasses.dataclass
class DataDecoder(hyperparams.OneOfConfig):
  """OneOf selector between the simple decoder and the label-map decoder."""
  type: Optional[str] = 'simple_decoder'
  simple_decoder: TfExampleDecoder = TfExampleDecoder()
  label_map_decoder: TfExampleDecoderLabelMap = TfExampleDecoderLabelMap()
@dataclasses.dataclass
class Mosaic(hyperparams.Config):
  """Config for mosaic (4-image) and mixup augmentation."""
  # Frequency with which mosaic / mixup are applied (0.0 disables).
  mosaic_frequency: float = 0.0
  mixup_frequency: float = 0.0
  mosaic_center: float = 0.2
  # Crop mode for assembling the mosaic ('crop', 'scale'); None disables.
  mosaic_crop_mode: Optional[str] = None
  # Random scale range applied within the mosaic op (distinct from the
  # parser-level aug_scale_* values).
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  jitter: float = 0.0
@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Config for the YOLO data parser / augmentation pipeline."""
  # Maximum number of boxes kept per image.
  max_num_instances: int = 200
  # Preserve aspect ratio (letterbox pad) during resize.
  letter_box: Optional[bool] = True
  random_flip: bool = True
  # Whether to pad for random translation (annotation corrected from `float`:
  # the default and all config overrides are boolean).
  random_pad: bool = False
  jitter: float = 0.0
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  # Color jitter maxima: a value v scales the property between 1/v and v.
  aug_rand_saturation: float = 0.0
  aug_rand_brightness: float = 0.0
  aug_rand_hue: float = 0.0
  aug_rand_angle: float = 0.0
  aug_rand_translate: float = 0.0
  aug_rand_perspective: float = 0.0
  use_tie_breaker: bool = True
  best_match_only: bool = False
  anchor_thresh: float = -0.01
  # Minimum relative box area kept after augmentation.
  area_thresh: float = 0.1
  mosaic: Mosaic = Mosaic()
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  # NOTE: the original declared `global_batch_size` twice (64, then 1); in a
  # dataclass the later declaration wins, so the effective default of 1 is
  # kept here as the single declaration. The experiment factories and YAML
  # configs always override it explicitly.
  global_batch_size: int = 1
  input_path: str = ''
  tfds_name: str = ''
  tfds_split: str = ''
  is_training: bool = True
  dtype: str = 'float16'
  decoder: DataDecoder = DataDecoder()
  parser: Parser = Parser()
  shuffle_buffer_size: int = 10000
  tfds_download: bool = True
  cache: bool = False
  drop_remainder: bool = True
@dataclasses.dataclass
class YoloHead(hyperparams.Config):
  """Parameterization for the YOLO Head."""
  # Whether to use the smart-bias initialization for the detection biases.
  smart_bias: bool = True
@dataclasses.dataclass
class YoloDetectionGenerator(hyperparams.Config):
  """Config for turning raw model output into final detections."""
  # Per-level box decode style ('original' darknet vs 'scaled' yolo).
  box_type: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 'original'))
  # Per-level scaling of the x,y predictions.
  scale_xy: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level output stride (2**level).
  path_scales: FPNConfig = dataclasses.field(
      default_factory=_build_path_scales(MIN_LEVEL, MAX_LEVEL))
  nms_type: str = 'greedy'
  # NOTE(review): despite the name, 0.001 suggests a confidence/score
  # threshold rather than an IoU — confirm against the detection generator.
  iou_thresh: float = 0.001
  # IoU overlap threshold used during NMS.
  nms_thresh: float = 0.6
  # Maximum detections returned per image.
  max_boxes: int = 200
  pre_nms_points: int = 5000
@dataclasses.dataclass
class YoloLoss(hyperparams.Config):
  """Config for the YOLO loss, with per-FPN-level weightings."""
  ignore_thresh: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 0.0))
  truth_thresh: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level IoU variant for the box loss (e.g. 'ciou').
  box_loss_type: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 'ciou'))
  # Per-level scale on the box/IoU loss.
  iou_normalizer: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level scale on the classification loss.
  cls_normalizer: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level scale on the detection-map (objectness) loss.
  object_normalizer: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level clipping value; np.inf leaves the loss unclipped.
  max_delta: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, np.inf))
  # Per-level smoothing applied to the detection map.
  objectness_smooth: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 0.0))
  # Label smoothing on the class loss.
  label_smoothing: float = 0.0
  # Selects the scaled-YOLO loss path instead of the original darknet path.
  use_scaled_loss: bool = True
  update_on_repeat: bool = True
@dataclasses.dataclass
class Box(hyperparams.Config):
  """A single anchor box as a [width, height] pair."""
  # Bug fixed: the original used `dataclasses.field(default=list)`, which
  # stores the `list` *class object* itself as the default value. A mutable
  # default must be supplied through `default_factory`, which produces a
  # fresh empty list per instance.
  box: List[int] = dataclasses.field(default_factory=list)
@dataclasses.dataclass
class AnchorBoxes(hyperparams.Config):
  """The full set of anchor boxes shared across FPN levels."""
  boxes: Optional[List[Box]] = None
  # When set, the model runs anchor-free: per-level box size limits.
  level_limits: Optional[List[int]] = None
  anchors_per_scale: int = 3

  def get(self, min_level, max_level):
    """Distribute them in order to each level.

    Args:
      min_level: `int` the lowest output level.
      max_level: `int` the highest output level.

    Returns:
      anchors_per_level: A `Dict[List[int]]` of the anchor boxes for each level.
      self.level_limits: A `List[int]` of the box size limits to link to each
        level under anchor free conditions.
    """
    if self.level_limits is None:
      # Anchor-based mode: use the configured boxes directly.
      boxes = [box.box for box in self.boxes]
    else:
      # Anchor-free mode: one dummy unit box per level.
      boxes = [[1.0, 1.0]] * ((max_level - min_level) + 1)
      # NOTE: mutates this config so later readers see one anchor per scale.
      self.anchors_per_scale = 1
    anchors_per_level = dict()
    start = 0
    # Hand out `anchors_per_scale` consecutive boxes to each level in order.
    for i in range(min_level, max_level + 1):
      anchors_per_level[str(i)] = boxes[start:start + self.anchors_per_scale]
      start += self.anchors_per_scale
    return anchors_per_level, self.level_limits
@dataclasses.dataclass
class Yolo(hyperparams.Config):
  """Top-level YOLO model config (backbone, decoder, head, loss, anchors)."""
  input_size: Optional[List[int]] = dataclasses.field(
      default_factory=lambda: [512, 512, 3])
  backbone: backbones.Backbone = backbones.Backbone(
      type='darknet', darknet=backbones.Darknet(model_id='cspdarknet53'))
  decoder: decoders.Decoder = decoders.Decoder(
      type='yolo_decoder',
      yolo_decoder=decoders.YoloDecoder(version='v4', type='regular'))
  head: YoloHead = YoloHead()
  detection_generator: YoloDetectionGenerator = YoloDetectionGenerator()
  loss: YoloLoss = YoloLoss()
  norm_activation: common.NormActivation = common.NormActivation(
      activation='mish',
      use_sync_bn=True,
      norm_momentum=0.99,
      norm_epsilon=0.001)
  num_classes: int = 80
  anchor_boxes: AnchorBoxes = AnchorBoxes()
  # Selects darknet-style behavior (box decoding, loss path) over scaled-yolo.
  darknet_based_model: bool = False
@dataclasses.dataclass
class YoloTask(cfg.TaskConfig):
  """YOLO detection task config: model, data, and checkpointing."""
  per_category_metrics: bool = False
  smart_bias_lr: float = 0.0
  model: Yolo = Yolo()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(is_training=False)
  weight_decay: float = 0.0
  # COCO annotation file for evaluation; None builds ground truth on the fly.
  annotation_file: Optional[str] = None
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[
      str, List[str]] = 'all'  # all, backbone, and/or decoder
  gradient_clip_norm: float = 0.0
  # NOTE(review): no type annotation, so this is a plain class attribute and
  # NOT a dataclass/config field — it cannot be overridden via config. If it
  # should be configurable, annotate as `seed: int = GLOBAL_SEED`; confirm.
  seed = GLOBAL_SEED
# COCO dataset defaults used by the experiment factories below.
COCO_INPUT_PATH_BASE = 'coco'
COCO_TRAIN_EXAMPLES = 118287
COCO_VAL_EXAMPLES = 5000
@exp_factory.register_config_factory('yolo')
def yolo() -> cfg.ExperimentConfig:
  """Yolo general config."""
  restrictions = [
      'task.train_data.is_training != None',
      'task.validation_data.is_training != None',
  ]
  return cfg.ExperimentConfig(task=YoloTask(), restrictions=restrictions)
@exp_factory.register_config_factory('yolo_darknet')
def yolo_darknet() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv3 and v4.

  Returns:
    A `cfg.ExperimentConfig` mirroring the darknet training recipe
    (original loss path, stepwise LR schedule with EMA).
  """
  train_batch_size = 64
  eval_batch_size = 8
  train_epochs = 300
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  validation_interval = 5
  max_num_instances = 200

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=YoloTask(
          smart_bias_lr=0.1,
          init_checkpoint='',
          init_checkpoint_modules='backbone',
          annotation_file=None,
          weight_decay=0.0,
          model=Yolo(
              darknet_based_model=True,
              norm_activation=common.NormActivation(use_sync_bn=True),
              head=YoloHead(smart_bias=True),
              # Original (non-scaled) darknet loss formulation.
              loss=YoloLoss(use_scaled_loss=False, update_on_repeat=True),
              anchor_boxes=AnchorBoxes(
                  anchors_per_scale=3,
                  boxes=[
                      Box(box=[12, 16]),
                      Box(box=[19, 36]),
                      Box(box=[40, 28]),
                      Box(box=[36, 75]),
                      Box(box=[76, 55]),
                      Box(box=[72, 146]),
                      Box(box=[142, 110]),
                      Box(box=[192, 243]),
                      Box(box=[459, 401])
                  ])),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              dtype='float32',
              parser=Parser(
                  letter_box=False,
                  aug_rand_saturation=1.5,
                  aug_rand_brightness=1.5,
                  aug_rand_hue=0.1,
                  use_tie_breaker=True,
                  best_match_only=False,
                  anchor_thresh=0.4,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
                  mosaic=Mosaic(
                      mosaic_frequency=0.75,
                      mixup_frequency=0.0,
                      mosaic_crop_mode='crop',
                      mosaic_center=0.2))),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=True,
              dtype='float32',
              parser=Parser(
                  letter_box=False,
                  use_tie_breaker=True,
                  best_match_only=False,
                  anchor_thresh=0.4,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
              ))),
      trainer=cfg.TrainerConfig(
          train_steps=train_epochs * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=validation_interval * steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'ema': {
                  'average_decay': 0.9998,
                  'trainable_weights_only': False,
                  'dynamic_decay': True,
              },
              'optimizer': {
                  'type': 'sgd_torch',
                  'sgd_torch': {
                      'momentum': 0.949,
                      'momentum_start': 0.949,
                      'nesterov': True,
                      'warmup_steps': 1000,
                      'weight_decay': 0.0005,
                  }
              },
              'learning_rate': {
                  'type': 'stepwise',
                  'stepwise': {
                      # One LR drop at epoch 240; base LR scales linearly
                      # with the batch size relative to 64.
                      'boundaries': [
                          240 * steps_per_epoch
                      ],
                      'values': [
                          0.00131 * train_batch_size / 64.0,
                          0.000131 * train_batch_size / 64.0,
                      ]
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 1000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@exp_factory.register_config_factory('scaled_yolo')
def scaled_yolo() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv4-csp and v4.

  Returns:
    A `cfg.ExperimentConfig` mirroring the scaled-YOLO training recipe
    (scaled loss path, cosine LR schedule with epoch-based warmup).
  """
  train_batch_size = 64
  eval_batch_size = 8
  train_epochs = 300
  warmup_epochs = 3
  validation_interval = 5
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  max_num_instances = 300

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=YoloTask(
          smart_bias_lr=0.1,
          init_checkpoint_modules='',
          annotation_file=None,
          weight_decay=0.0,
          model=Yolo(
              darknet_based_model=False,
              norm_activation=common.NormActivation(
                  activation='mish',
                  use_sync_bn=True,
                  norm_epsilon=0.0001,
                  norm_momentum=0.97),
              head=YoloHead(smart_bias=True),
              # Scaled-YOLO loss formulation.
              loss=YoloLoss(use_scaled_loss=True),
              anchor_boxes=AnchorBoxes(
                  anchors_per_scale=3,
                  boxes=[
                      Box(box=[12, 16]),
                      Box(box=[19, 36]),
                      Box(box=[40, 28]),
                      Box(box=[36, 75]),
                      Box(box=[76, 55]),
                      Box(box=[72, 146]),
                      Box(box=[142, 110]),
                      Box(box=[192, 243]),
                      Box(box=[459, 401])
                  ])),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              dtype='float32',
              parser=Parser(
                  aug_rand_saturation=0.7,
                  aug_rand_brightness=0.4,
                  aug_rand_hue=0.015,
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  random_pad=False,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
                  mosaic=Mosaic(
                      mosaic_crop_mode='scale',
                      mosaic_frequency=1.0,
                      mixup_frequency=0.0,
                  ))),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=True,
              dtype='float32',
              parser=Parser(
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
              ))),
      trainer=cfg.TrainerConfig(
          train_steps=train_epochs * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=validation_interval * steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'ema': {
                  'average_decay': 0.9999,
                  'trainable_weights_only': False,
                  'dynamic_decay': True,
              },
              'optimizer': {
                  'type': 'sgd_torch',
                  'sgd_torch': {
                      'momentum': 0.937,
                      # Momentum ramps from this value during warmup.
                      'momentum_start': 0.8,
                      'nesterov': True,
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      # Weight decay scales linearly with batch size.
                      'weight_decay': 0.0005 * train_batch_size / 64.0,
                  }
              },
              'learning_rate': {
                  'type': 'cosine',
                  'cosine': {
                      'initial_learning_rate': 0.01,
                      'alpha': 0.2,
                      'decay_steps': train_epochs * steps_per_epoch,
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
......@@ -75,11 +75,11 @@ class Parser(parser.Parser):
saturation. saturation will be scaled between 1/value and value.
aug_rand_brightness: `float` indicating the maximum scaling value for
brightness. brightness will be scaled between 1/value and value.
letter_box: `boolean` indicating whether upon start of the datapipeline
letter_box: `boolean` indicating whether upon start of the data pipeline
regardless of the preprocessing ops that are used, the aspect ratio of
the images should be preserved.
random_pad: `bool` indiccating wether to use padding to apply random
translation true for darknet yolo false for scaled yolo.
translation, true for darknet yolo false for scaled yolo.
random_flip: `boolean` indicating whether or not to randomly flip the
image horizontally.
jitter: `float` for the maximum change in aspect ratio expected in each
......@@ -147,6 +147,7 @@ class Parser(parser.Parser):
# Set the per level values needed for operation
self._darknet = darknet
self._area_thresh = area_thresh
self._level_limits = level_limits
self._seed = seed
self._dtype = dtype
......@@ -259,7 +260,7 @@ class Parser(parser.Parser):
self._aug_rand_saturation,
self._aug_rand_brightness,
seed=self._seed,
darknet=self._darknet)
darknet=self._darknet or self._level_limits is not None)
# Cast the image to the selcted datatype.
image, labels = self._build_label(
......
......@@ -40,7 +40,7 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
loss_type='ciou',
iou_normalizer=1.0,
cls_normalizer=1.0,
obj_normalizer=1.0,
object_normalizer=1.0,
label_smoothing=0.0,
objectness_smooth=True,
update_on_repeat=False,
......@@ -65,7 +65,8 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
iou_normalizer: `float` for how much to scale the loss on the IOU or the
boxes.
cls_normalizer: `float` for how much to scale the loss on the classes.
obj_normalizer: `float` for how much to scale loss on the detection map.
object_normalizer: `float` for how much to scale loss on the detection
map.
label_smoothing: `float` for how much to smooth the loss on the classes.
objectness_smooth: `float` for how much to smooth the loss on the
detection map.
......@@ -90,7 +91,7 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
self._iou_normalizer = iou_normalizer
self._cls_normalizer = cls_normalizer
self._obj_normalizer = obj_normalizer
self._object_normalizer = object_normalizer
self._scale_x_y = scale_x_y
self._max_delta = max_delta
......@@ -240,9 +241,14 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
Returns:
loss: `tf.float` scalar for the scaled loss.
scale: `tf.float` how much the loss was scaled by.
"""
del box_loss, conf_loss, class_loss, ground_truths, predictions
return loss
del box_loss
del conf_loss
del class_loss
del ground_truths
del predictions
return loss, tf.ones_like(loss)
@abc.abstractmethod
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
......@@ -349,16 +355,16 @@ class DarknetLoss(YoloLossBase):
tf.cast(true_class, tf.int32),
depth=tf.shape(pred_class)[-1],
dtype=pred_class.dtype)
true_classes = tf.stop_gradient(loss_utils.apply_mask(ind_mask, true_class))
true_class = tf.stop_gradient(loss_utils.apply_mask(ind_mask, true_class))
# Reorganize the one hot class list as a grid.
true_class = loss_utils.build_grid(
inds, true_classes, pred_class, ind_mask, update=False)
true_class = tf.stop_gradient(true_class)
true_class_grid = loss_utils.build_grid(
inds, true_class, pred_class, ind_mask, update=False)
true_class_grid = tf.stop_gradient(true_class_grid)
# Use the class mask to find the number of objects located in
# each predicted grid cell/pixel.
counts = true_class
counts = true_class_grid
counts = tf.reduce_sum(counts, axis=-1, keepdims=True)
reps = tf.gather_nd(counts, inds, batch_dims=1)
reps = tf.squeeze(reps, axis=-1)
......@@ -372,26 +378,50 @@ class DarknetLoss(YoloLossBase):
box_loss = math_ops.divide_no_nan(box_loss, reps)
box_loss = tf.cast(tf.reduce_sum(box_loss, axis=1), dtype=y_pred.dtype)
# Compute the sigmoid binary cross entropy for the class maps.
class_loss = tf.reduce_mean(
loss_utils.sigmoid_bce(
tf.expand_dims(true_class, axis=-1),
tf.expand_dims(pred_class, axis=-1), self._label_smoothing),
axis=-1)
# Apply normalization to the class losses.
if self._cls_normalizer < 1.0:
# Build a mask based on the true class locations.
cls_norm_mask = true_class
# Apply the classes weight to class indexes were one_hot is one.
class_loss *= ((1 - cls_norm_mask) + cls_norm_mask * self._cls_normalizer)
# Mask to the class loss and compute the sum over all the objects.
class_loss = tf.reduce_sum(class_loss, axis=-1)
class_loss = loss_utils.apply_mask(grid_mask, class_loss)
class_loss = math_ops.rm_nan_inf(class_loss, val=0.0)
class_loss = tf.cast(
tf.reduce_sum(class_loss, axis=(1, 2, 3)), dtype=y_pred.dtype)
if self._update_on_repeat:
# Converts list of gound truths into a grid where repeated values
# are replaced by the most recent value. So some class identities may
# get lost but the loss computation will be more stable. Results are
# more consistent.
# Compute the sigmoid binary cross entropy for the class maps.
class_loss = tf.reduce_mean(
loss_utils.sigmoid_bce(
tf.expand_dims(true_class_grid, axis=-1),
tf.expand_dims(pred_class, axis=-1), self._label_smoothing),
axis=-1)
# Apply normalization to the class losses.
if self._cls_normalizer < 1.0:
# Build a mask based on the true class locations.
cls_norm_mask = true_class_grid
# Apply the classes weight to class indexes were one_hot is one.
class_loss *= ((1 - cls_norm_mask) +
cls_norm_mask * self._cls_normalizer)
# Mask to the class loss and compute the sum over all the objects.
class_loss = tf.reduce_sum(class_loss, axis=-1)
class_loss = loss_utils.apply_mask(grid_mask, class_loss)
class_loss = math_ops.rm_nan_inf(class_loss, val=0.0)
class_loss = tf.cast(
tf.reduce_sum(class_loss, axis=(1, 2, 3)), dtype=y_pred.dtype)
else:
# Computes the loss while keeping the structure as a list in
# order to ensure all objects are considered. In some cases can
# make training more unstable but may also return higher APs.
pred_class = loss_utils.apply_mask(
ind_mask, tf.gather_nd(pred_class, inds, batch_dims=1))
class_loss = tf.keras.losses.binary_crossentropy(
tf.expand_dims(true_class, axis=-1),
tf.expand_dims(pred_class, axis=-1),
label_smoothing=self._label_smoothing,
from_logits=True)
class_loss = loss_utils.apply_mask(ind_mask, class_loss)
class_loss = math_ops.divide_no_nan(class_loss,
tf.expand_dims(reps, axis=-1))
class_loss = tf.cast(
tf.reduce_sum(class_loss, axis=(1, 2)), dtype=y_pred.dtype)
class_loss *= self._cls_normalizer
# Compute the sigmoid binary cross entropy for the confidence maps.
bce = tf.reduce_mean(
......@@ -406,7 +436,7 @@ class DarknetLoss(YoloLossBase):
# Apply the weights to each loss.
box_loss *= self._iou_normalizer
conf_loss *= self._obj_normalizer
conf_loss *= self._object_normalizer
# Add all the losses together then take the mean over the batches.
loss = box_loss + class_loss + conf_loss
......@@ -547,7 +577,7 @@ class ScaledLoss(YoloLossBase):
# Apply the weights to each loss.
box_loss *= self._iou_normalizer
class_loss *= self._cls_normalizer
conf_loss *= self._obj_normalizer
conf_loss *= self._object_normalizer
# Add all the losses together then take the sum over the batches.
mean_loss = box_loss + class_loss + conf_loss
......@@ -575,12 +605,13 @@ class ScaledLoss(YoloLossBase):
predictions: `Dict` holding all the predicted values.
Returns:
loss: `tf.float` scalar for the scaled loss.
scale: `tf.float` how much the loss was scaled by.
"""
scale = tf.stop_gradient(3 / len(list(predictions.keys())))
return loss * scale
return loss * scale, 1 / scale
def cross_replica_aggregation(self, loss, num_replicas_in_sync):
"""this method is not specific to each loss path, but each loss type."""
"""This method is not specific to each loss path, but each loss type."""
return loss
......@@ -597,7 +628,7 @@ class YoloLoss:
loss_types=None,
iou_normalizers=None,
cls_normalizers=None,
obj_normalizers=None,
object_normalizers=None,
objectness_smooths=None,
box_types=None,
scale_xys=None,
......@@ -627,8 +658,8 @@ class YoloLoss:
or the boxes for each FPN path.
cls_normalizers: `Dict[float]` for how much to scale the loss on the
classes for each FPN path.
obj_normalizers: `Dict[float]` for how much to scale loss on the detection
map for each FPN path.
object_normalizers: `Dict[float]` for how much to scale loss on the
detection map for each FPN path.
objectness_smooths: `Dict[float]` for how much to smooth the loss on the
detection map for each FPN path.
box_types: `Dict[bool]` for which scaling type to use for each FPN path.
......@@ -666,7 +697,7 @@ class YoloLoss:
loss_type=loss_types[key],
iou_normalizer=iou_normalizers[key],
cls_normalizer=cls_normalizers[key],
obj_normalizer=obj_normalizers[key],
object_normalizer=object_normalizers[key],
box_type=box_types[key],
objectness_smooth=objectness_smooths[key],
max_delta=max_deltas[key],
......@@ -695,10 +726,8 @@ class YoloLoss:
# after computing the loss, scale loss as needed for aggregation
# across FPN levels
loss = self._loss_dict[key].post_path_aggregation(loss, loss_box,
loss_conf, loss_class,
ground_truth,
predictions)
loss, scale = self._loss_dict[key].post_path_aggregation(
loss, loss_box, loss_conf, loss_class, ground_truth, predictions)
# after completing the scaling of the loss on each replica, handle
# scaling the loss for mergeing the loss across replicas
......@@ -708,12 +737,13 @@ class YoloLoss:
# detach all the below gradients: none of them should make a
# contribution to the gradient form this point forwards
metric_loss += tf.stop_gradient(mean_loss)
metric_dict[key]['loss'] = tf.stop_gradient(mean_loss)
metric_loss += tf.stop_gradient(mean_loss / scale)
metric_dict[key]['loss'] = tf.stop_gradient(mean_loss / scale)
metric_dict[key]['avg_iou'] = tf.stop_gradient(avg_iou)
metric_dict[key]['avg_obj'] = tf.stop_gradient(avg_obj)
metric_dict['net']['box'] += tf.stop_gradient(loss_box)
metric_dict['net']['class'] += tf.stop_gradient(loss_class)
metric_dict['net']['conf'] += tf.stop_gradient(loss_conf)
metric_dict['net']['box'] += tf.stop_gradient(loss_box / scale)
metric_dict['net']['class'] += tf.stop_gradient(loss_class / scale)
metric_dict['net']['conf'] += tf.stop_gradient(loss_conf / scale)
return loss_val, metric_loss, metric_dict
......@@ -60,7 +60,7 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
loss_types={key: 'ciou' for key in keys},
iou_normalizers={key: 0.05 for key in keys},
cls_normalizers={key: 0.5 for key in keys},
obj_normalizers={key: 1.0 for key in keys},
object_normalizers={key: 1.0 for key in keys},
objectness_smooths={key: 1.0 for key in keys},
box_types={key: 'scaled' for key in keys},
scale_xys={key: 2.0 for key in keys},
......
......@@ -454,6 +454,9 @@ class Darknet(tf.keras.Model):
def _build_struct(self, net, inputs):
if self._use_reorg_input:
inputs = nn_blocks.Reorg()(inputs)
net[0].filters = net[1].filters
net[0].output_name = net[1].output_name
del net[1]
endpoints = collections.OrderedDict()
stack_outputs = [inputs]
......
......@@ -13,10 +13,66 @@
# limitations under the License.
"""Feature Pyramid Network and Path Aggregation variants used in YOLO."""
from typing import Mapping, Union, Optional
import tensorflow as tf
from official.modeling import hyperparams
from official.vision.beta.modeling.decoders import factory
from official.vision.beta.projects.yolo.modeling.layers import nn_blocks
# model configurations
# the structure is as follows. model version, {v3, v4, v#, ... etc}
# the model config type {regular, tiny, small, large, ... etc}
# Default decoder structure parameters for each stock Yolo release, keyed by
# model version ('v3', 'v4') and then by model type ('regular', 'tiny', ...).
YOLO_MODELS = {
    'v4':
        dict(
            # Standard Yolo-v4 decoder.
            regular=dict(
                embed_spp=False,
                use_fpn=True,
                max_level_process_len=None,
                path_process_len=6),
            # Reduced decoder used by Yolo-v4 tiny.
            tiny=dict(
                embed_spp=False,
                use_fpn=False,
                max_level_process_len=2,
                path_process_len=1),
            # CSP decoder variant.
            csp=dict(
                embed_spp=False,
                use_fpn=True,
                max_level_process_len=None,
                csp_stack=5,
                fpn_depth=5,
                path_process_len=6),
            # Larger CSP decoder variant with wider FPN filters.
            csp_large=dict(
                embed_spp=False,
                use_fpn=True,
                max_level_process_len=None,
                csp_stack=7,
                fpn_depth=7,
                path_process_len=8,
                fpn_filter_scale=2),
        ),
    'v3':
        dict(
            # Standard Yolo-v3 decoder.
            regular=dict(
                embed_spp=False,
                use_fpn=False,
                max_level_process_len=None,
                path_process_len=6),
            # Reduced decoder used by Yolo-v3 tiny.
            tiny=dict(
                embed_spp=False,
                use_fpn=False,
                max_level_process_len=2,
                path_process_len=1),
            # Yolo-v3 with spatial pyramid pooling embedded.
            spp=dict(
                embed_spp=True,
                use_fpn=False,
                max_level_process_len=2,
                path_process_len=1),
        ),
}
@tf.keras.utils.register_keras_serializable(package='yolo')
class _IdentityRoute(tf.keras.layers.Layer):
......@@ -487,3 +543,66 @@ class YoloDecoder(tf.keras.Model):
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@factory.register_decoder_builder('yolo_decoder')
def build_yolo_decoder(
    input_specs: Mapping[str, tf.TensorShape],
    model_config: hyperparams.Config,
    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
    **kwargs) -> Union[None, tf.keras.Model, tf.keras.layers.Layer]:
  """Builds Yolo FPN/PAN decoder from a config.

  Args:
    input_specs: A `dict` of input specifications. A dictionary consists of
      {level: TensorShape} from a backbone.
    model_config: A OneOfConfig. Model config.
    l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
      None.
    **kwargs: Additional kwargs arguments.

  Returns:
    A `tf.keras.Model` instance of the Yolo FPN/PAN decoder.

  Raises:
    ValueError: If the decoder version is unset/unsupported, or the decoder
      type is not available for the selected version.
  """
  decoder_cfg = model_config.decoder.get()
  norm_activation_config = model_config.norm_activation

  # 'same' means "inherit the activation from the norm_activation config".
  activation = (
      decoder_cfg.activation if decoder_cfg.activation != 'same' else
      norm_activation_config.activation)

  if decoder_cfg.version is None:  # custom yolo
    raise ValueError('Decoder version cannot be None, specify v3 or v4.')
  if decoder_cfg.version not in YOLO_MODELS:
    raise ValueError(
        'Unsupported model version please select from {v3, v4}, '
        'or specify a custom decoder config using YoloDecoder in your yaml.')

  if decoder_cfg.type is None:
    decoder_cfg.type = 'regular'
  if decoder_cfg.type not in YOLO_MODELS[decoder_cfg.version]:
    raise ValueError(
        'Unsupported model type please select from '
        f'{YOLO_MODELS[decoder_cfg.version].keys()} '
        'or specify a custom decoder config using YoloDecoder.')

  # Copy the defaults so the per-call overrides below do not mutate the shared
  # module-level YOLO_MODELS table across successive builder invocations.
  base_model = dict(YOLO_MODELS[decoder_cfg.version][decoder_cfg.type])

  # Explicit config values take precedence over the stock model defaults.
  cfg_dict = decoder_cfg.as_dict()
  for key in base_model:
    if cfg_dict[key] is not None:
      base_model[key] = cfg_dict[key]

  base_dict = dict(
      activation=activation,
      use_spatial_attention=decoder_cfg.use_spatial_attention,
      use_separable_conv=decoder_cfg.use_separable_conv,
      use_sync_bn=norm_activation_config.use_sync_bn,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)
  base_model.update(base_dict)

  model = YoloDecoder(input_specs, **base_model, **kwargs)
  return model
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common factory functions yolo neural networks."""
from absl import logging
from official.vision.beta.modeling.backbones import factory as backbone_factory
from official.vision.beta.modeling.decoders import factory as decoder_factory
from official.vision.beta.projects.yolo.configs import yolo
from official.vision.beta.projects.yolo.modeling import yolo_model
from official.vision.beta.projects.yolo.modeling.heads import yolo_head
from official.vision.beta.projects.yolo.modeling.layers import detection_generator
def build_yolo_detection_generator(model_config: yolo.Yolo, anchor_boxes):
  """Builds yolo detection generator.

  Args:
    model_config: A yolo.Yolo config describing the detection generator and
      loss parameters.
    anchor_boxes: The anchor boxes to decode predictions against.

  Returns:
    A `detection_generator.YoloLayer` instance.
  """
  gen_cfg = model_config.detection_generator
  loss_cfg = model_config.loss
  return detection_generator.YoloLayer(
      classes=model_config.num_classes,
      anchors=anchor_boxes,
      iou_thresh=gen_cfg.iou_thresh,
      nms_thresh=gen_cfg.nms_thresh,
      max_boxes=gen_cfg.max_boxes,
      pre_nms_points=gen_cfg.pre_nms_points,
      nms_type=gen_cfg.nms_type,
      box_type=gen_cfg.box_type.get(),
      path_scale=gen_cfg.path_scales.get(),
      scale_xy=gen_cfg.scale_xy.get(),
      label_smoothing=loss_cfg.label_smoothing,
      use_scaled_loss=loss_cfg.use_scaled_loss,
      update_on_repeat=loss_cfg.update_on_repeat,
      truth_thresh=loss_cfg.truth_thresh.get(),
      loss_type=loss_cfg.box_loss_type.get(),
      max_delta=loss_cfg.max_delta.get(),
      iou_normalizer=loss_cfg.iou_normalizer.get(),
      cls_normalizer=loss_cfg.cls_normalizer.get(),
      object_normalizer=loss_cfg.object_normalizer.get(),
      ignore_thresh=loss_cfg.ignore_thresh.get(),
      objectness_smooth=loss_cfg.objectness_smooth.get())
def build_yolo_head(input_specs, model_config: yolo.Yolo, l2_regularization):
  """Builds yolo head.

  Args:
    input_specs: A `dict` of {level: TensorShape} produced by the decoder;
      the head covers the min..max of these levels.
    model_config: A yolo.Yolo config with head and normalization parameters.
    l2_regularization: A `tf.keras.regularizers.Regularizer` or None.

  Returns:
    A `yolo_head.YoloHead` instance.
  """
  levels = [int(level) for level in input_specs.keys()]
  return yolo_head.YoloHead(
      min_level=min(levels),
      max_level=max(levels),
      classes=model_config.num_classes,
      boxes_per_level=model_config.anchor_boxes.anchors_per_scale,
      norm_momentum=model_config.norm_activation.norm_momentum,
      norm_epsilon=model_config.norm_activation.norm_epsilon,
      kernel_regularizer=l2_regularization,
      smart_bias=model_config.head.smart_bias)
def build_yolo(input_specs, model_config, l2_regularization):
  """Builds yolo model.

  Args:
    input_specs: Input layer spec used to build and summarize the model.
    model_config: A yolo.Yolo model config.
    l2_regularization: A `tf.keras.regularizers.Regularizer` or None.

  Returns:
    A tuple of (the built `yolo_model.Yolo` model, its loss objects).
  """
  backbone_cfg = model_config.backbone.get()
  # Anchor boxes are resolved against the backbone's level range before the
  # backbone model itself is instantiated.
  anchor_dict, _ = model_config.anchor_boxes.get(backbone_cfg.min_level,
                                                 backbone_cfg.max_level)

  backbone = backbone_factory.build_backbone(input_specs, model_config.backbone,
                                             model_config.norm_activation,
                                             l2_regularization)
  decoder = decoder_factory.build_decoder(backbone.output_specs, model_config,
                                          l2_regularization)
  head = build_yolo_head(decoder.output_specs, model_config, l2_regularization)
  generator = build_yolo_detection_generator(model_config, anchor_dict)

  model = yolo_model.Yolo(
      backbone=backbone,
      decoder=decoder,
      head=head,
      detection_generator=generator)
  model.build(input_specs.shape)
  model.summary(print_fn=logging.info)

  return model, generator.get_losses()
......@@ -36,7 +36,7 @@ class YoloLayer(tf.keras.Model):
loss_type='ciou',
iou_normalizer=1.0,
cls_normalizer=1.0,
obj_normalizer=1.0,
object_normalizer=1.0,
use_scaled_loss=False,
update_on_repeat=False,
pre_nms_points=5000,
......@@ -67,7 +67,8 @@ class YoloLayer(tf.keras.Model):
iou_normalizer: `float` for how much to scale the loss on the IOU or the
boxes.
cls_normalizer: `float` for how much to scale the loss on the classes.
obj_normalizer: `float` for how much to scale loss on the detection map.
object_normalizer: `float` for how much to scale loss on the detection
map.
use_scaled_loss: `bool` for whether to use the scaled loss
or the traditional loss.
update_on_repeat: `bool` indicating how you would like to handle repeated
......@@ -110,7 +111,7 @@ class YoloLayer(tf.keras.Model):
self._truth_thresh = truth_thresh
self._iou_normalizer = iou_normalizer
self._cls_normalizer = cls_normalizer
self._obj_normalizer = obj_normalizer
self._object_normalizer = object_normalizer
self._objectness_smooth = objectness_smooth
self._nms_thresh = nms_thresh
self._max_boxes = max_boxes
......@@ -289,7 +290,7 @@ class YoloLayer(tf.keras.Model):
loss_types=self._loss_type,
iou_normalizers=self._iou_normalizer,
cls_normalizers=self._cls_normalizer,
obj_normalizers=self._obj_normalizer,
object_normalizers=self._object_normalizer,
objectness_smooths=self._objectness_smooth,
box_types=self._box_type,
max_deltas=self._max_delta,
......
......@@ -14,7 +14,9 @@
"""Contains common building blocks for yolo neural networks."""
from typing import Callable, List, Tuple
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.beta.ops import spatial_transform_ops
......@@ -141,6 +143,7 @@ class ConvBN(tf.keras.layers.Layer):
# activation params
self._activation = activation
self._leaky_alpha = leaky_alpha
self._fuse = False
super().__init__(**kwargs)
......@@ -164,6 +167,8 @@ class ConvBN(tf.keras.layers.Layer):
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
axis=self._bn_axis)
else:
self.bn = None
if self._activation == 'leaky':
self._activation_fn = tf.keras.layers.LeakyReLU(alpha=self._leaky_alpha)
......@@ -174,11 +179,44 @@ class ConvBN(tf.keras.layers.Layer):
def call(self, x):
x = self.conv(x)
if self._use_bn:
if self._use_bn and not self._fuse:
x = self.bn(x)
x = self._activation_fn(x)
return x
def fuse(self):
  """Folds the batch norm parameters into the convolution weights in place.

  After fusion the batch norm is dropped and the layer is frozen, so this is
  an inference-only optimization. Fusion is skipped for separable convolutions
  and when no batch norm layer exists; calling it twice is a no-op.
  """
  if self.bn is not None and not self._use_separable_conv:
    # Fuse convolution and batchnorm, gives me +2 to 3 FPS 2ms latency.
    # layers: https://tehnokv.com/posts/fusing-batchnorm-and-conv/
    if self._fuse:
      return

    self._fuse = True
    conv_weights = self.conv.get_weights()[0]
    gamma, beta, moving_mean, moving_variance = self.bn.get_weights()

    # The conv previously had no bias (batch norm supplied the shift); enable
    # one and rebuild so set_weights below accepts [kernel, bias].
    self.conv.use_bias = True
    infilters = conv_weights.shape[-2]
    self.conv.build([None, None, None, infilters])

    # Per-channel scale: w_fused = diag(gamma / sqrt(var + eps)) @ w_conv.
    # The kernel is transposed/flattened to (out_channels, in*k*k) so the
    # scaling can be applied as a single matmul, then restored.
    base = tf.sqrt(self._norm_epsilon + moving_variance)
    w_conv_base = tf.transpose(conv_weights, perm=(3, 2, 0, 1))
    w_conv = tf.reshape(w_conv_base, [conv_weights.shape[-1], -1])

    w_bn = tf.linalg.diag(gamma / base)
    w_conv = tf.reshape(tf.matmul(w_bn, w_conv), w_conv_base.get_shape())
    w_conv = tf.transpose(w_conv, perm=(2, 3, 1, 0))

    # Fused bias: b = beta - gamma * moving_mean / sqrt(var + eps).
    b_bn = beta - gamma * moving_mean / base
    self.conv.set_weights([w_conv, b_bn])

    # Drop the batch norm and freeze the fused weights.
    del self.bn
    self.trainable = False
    self.conv.trainable = False
    self.bn = None
  return
def get_config(self):
# used to store/share parameters to reconstruct the model
layer_config = {
......
......@@ -14,72 +14,19 @@
"""Yolo models."""
from typing import Mapping, Union
import tensorflow as tf
# static base Yolo Models that do not require configuration
# similar to a backbone model id.
# this is done to greatly simplify the model config
# the structure is as follows. model version, {v3, v4, v#, ... etc}
# the model config type {regular, tiny, small, large, ... etc}
# Default decoder structure parameters for each stock Yolo release, keyed by
# model version ("v3", "v4") and then by model type ("regular", "tiny", ...).
YOLO_MODELS = {
    "v4":
        dict(
            # Standard Yolo-v4 decoder.
            regular=dict(
                embed_spp=False,
                use_fpn=True,
                max_level_process_len=None,
                path_process_len=6),
            # Reduced decoder used by Yolo-v4 tiny.
            tiny=dict(
                embed_spp=False,
                use_fpn=False,
                max_level_process_len=2,
                path_process_len=1),
            # CSP decoder variant.
            csp=dict(
                embed_spp=False,
                use_fpn=True,
                max_level_process_len=None,
                csp_stack=5,
                fpn_depth=5,
                path_process_len=6),
            # Larger CSP decoder variant with wider FPN filters.
            csp_large=dict(
                embed_spp=False,
                use_fpn=True,
                max_level_process_len=None,
                csp_stack=7,
                fpn_depth=7,
                path_process_len=8,
                fpn_filter_scale=2),
        ),
    "v3":
        dict(
            # Standard Yolo-v3 decoder.
            regular=dict(
                embed_spp=False,
                use_fpn=False,
                max_level_process_len=None,
                path_process_len=6),
            # Reduced decoder used by Yolo-v3 tiny.
            tiny=dict(
                embed_spp=False,
                use_fpn=False,
                max_level_process_len=2,
                path_process_len=1),
            # Yolo-v3 with spatial pyramid pooling embedded.
            spp=dict(
                embed_spp=True,
                use_fpn=False,
                max_level_process_len=2,
                path_process_len=1),
        ),
}
from official.vision.beta.projects.yolo.modeling.layers import nn_blocks
class Yolo(tf.keras.Model):
"""The YOLO model class."""
def __init__(self,
backbone=None,
decoder=None,
head=None,
detection_generator=None,
backbone,
decoder,
head,
detection_generator,
**kwargs):
"""Detection initialization function.
......@@ -93,10 +40,10 @@ class Yolo(tf.keras.Model):
super(Yolo, self).__init__(**kwargs)
self._config_dict = {
"backbone": backbone,
"decoder": decoder,
"head": head,
"filter": detection_generator
'backbone': backbone,
'decoder': decoder,
'head': head,
'detection_generator': detection_generator
}
# model components
......@@ -104,18 +51,19 @@ class Yolo(tf.keras.Model):
self._decoder = decoder
self._head = head
self._detection_generator = detection_generator
self._fused = False
return
def call(self, inputs, training=False):
maps = self._backbone(inputs)
decoded_maps = self._decoder(maps)
raw_predictions = self._head(decoded_maps)
maps = self.backbone(inputs)
decoded_maps = self.decoder(maps)
raw_predictions = self.head(decoded_maps)
if training:
return {"raw_output": raw_predictions}
return {'raw_output': raw_predictions}
else:
# Post-processing.
predictions = self._detection_generator(raw_predictions)
predictions.update({"raw_output": raw_predictions})
predictions = self.detection_generator(raw_predictions)
predictions.update({'raw_output': raw_predictions})
return predictions
@property
......@@ -141,28 +89,22 @@ class Yolo(tf.keras.Model):
def from_config(cls, config):
return cls(**config)
def get_weight_groups(self, train_vars):
  """Sort the list of trainable variables into groups for optimization.

  Args:
    train_vars: a list of tf.Variables that need to get sorted into their
      respective groups.

  Returns:
    weights: a list of tf.Variables for the weights.
    bias: a list of tf.Variables for the bias.
    other: a list of tf.Variables for the other operations.
  """
  weights, bias, other = [], [], []
  for variable in train_vars:
    var_name = variable.name
    # Bias-like terms (biases and batch-norm betas) are matched first, so a
    # name containing both 'beta' and 'kernel' lands in the bias group —
    # this preserves the original check order.
    if 'bias' in var_name or 'beta' in var_name:
      bias.append(variable)
    elif 'kernel' in var_name or 'weight' in var_name:
      weights.append(variable)
    else:
      other.append(variable)
  return weights, bias, other
@property
def checkpoint_items(
    self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
  """Returns a dictionary of items to be additionally checkpointed."""
  items = {'backbone': self.backbone, 'head': self.head}
  # The decoder is optional; only checkpoint it when present.
  if self.decoder is not None:
    items['decoder'] = self.decoder
  return items
def fuse(self):
  """Fuses all Convolution and Batchnorm layers to get better latency."""
  print('Fusing Conv Batch Norm Layers.')
  if not self._fused:
    # Guarded so repeated calls do not re-fuse already-fused layers.
    self._fused = True
    for layer in self.submodules:
      if isinstance(layer, nn_blocks.ConvBN):
        layer.fuse()
    self.summary()
  return
......@@ -14,7 +14,6 @@
"""Mosaic op."""
import random
import tensorflow as tf
import tensorflow_addons as tfa
......@@ -55,7 +54,7 @@ class Mosaic:
the images should be preserved.
jitter: `float` for the maximum change in aspect ratio expected in each
preprocessing step.
mosaic_crop_mode: `str` they type of mosaic to apply. The options are
mosaic_crop_mode: `str` the type of mosaic to apply. The options are
{crop, scale, None}, crop will construct a mosaic by slicing images
togther, scale will create a mosaic by concatnating and shifting the
image, and None will default to scale and apply no post processing to
......@@ -325,6 +324,12 @@ class Mosaic:
else:
return self._add_param(noop)
def _beta(self, alpha, beta):
  """Generates a random number using the beta distribution.

  Args:
    alpha: `float` first shape parameter of the Beta distribution.
    beta: `float` second shape parameter of the Beta distribution.

  Returns:
    A scalar tensor sampled from Beta(alpha, beta).
  """
  a = tf.random.gamma([], alpha)
  b = tf.random.gamma([], beta)
  # Beta(alpha, beta) = Ga / (Ga + Gb) with Ga ~ Gamma(alpha, 1) and
  # Gb ~ Gamma(beta, 1). The previous `b / (a + b)` sampled Beta(beta, alpha),
  # which only coincides when alpha == beta (as at the symmetric mixup call
  # site); fixed so asymmetric parameters behave as the signature implies.
  return a / (a + b)
def _mixup(self, one, two):
"""Blend together 2 images for the mixup data augmentation."""
if self._mixup_frequency >= 1.0:
......@@ -337,8 +342,8 @@ class Mosaic:
if domo >= (1 - self._mixup_frequency):
sample = one
otype = one['image'].dtype
r = preprocessing_ops.random_uniform_strong(
0.4, 0.6, tf.float32, seed=self._seed)
r = self._beta(8.0, 8.0)
sample['image'] = (
r * tf.cast(one['image'], tf.float32) +
(1 - r) * tf.cast(two['image'], tf.float32))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment