Commit d4fb52e7 authored by Vishnu Banna's avatar Vishnu Banna
Browse files

model builds

parent c631af40
...@@ -16,6 +16,21 @@
# pylint: disable=unused-import
from official.common import registry_imports
# import configs
from official.vision.beta.projects.yolo.configs import darknet_classification
from official.vision.beta.projects.yolo.configs import yolo
# import modeling components
from official.vision.beta.projects.yolo.modeling.backbones import darknet
from official.vision.beta.projects.yolo.modeling.decoders import yolo_decoder
# import tasks
from official.vision.beta.projects.yolo.tasks import image_classification
from official.vision.beta.projects.yolo.tasks import yolo
# import optimization packages
from official.vision.beta.projects.yolo.optimization import optimizer_factory
from official.vision.beta.projects.yolo.optimization.configs import learning_rate_config
from official.vision.beta.projects.yolo.optimization.configs import optimization_config
from official.vision.beta.projects.yolo.optimization.configs import optimizer_config
\ No newline at end of file
"""Backbones configurations."""
# Import libraries
import dataclasses
from typing import Optional, List
from official.modeling import hyperparams
from official.vision.beta.configs import decoders
@dataclasses.dataclass
class YoloDecoder(hyperparams.Config):
  """Parameterization of the YOLO decoder.

  If `version` (or a named preset) is specified, the remaining input
  parameters are ignored and the defaults registered for that version
  and name are used instead.
  """
  version: Optional[str] = None  # preset family, e.g. 'v3'/'v4' in the configs below
  type: Optional[str] = None  # variant within a version, e.g. 'regular' or 'csp'
  use_fpn: Optional[bool] = None
  use_spatial_attention: bool = False
  use_separable_conv: bool = False
  csp_stack: Optional[bool] = None
  fpn_depth: Optional[int] = None
  fpn_filter_scale: Optional[int] = None
  path_process_len: Optional[int] = None
  max_level_process_len: Optional[int] = None
  embed_spp: Optional[bool] = None
  # 'same' presumably means "match the model-wide activation" — TODO confirm
  # against the decoder builder.
  activation: Optional[str] = 'same'
@dataclasses.dataclass
class Decoder(decoders.Decoder):
  """Decoder config that registers the YOLO decoder as a selectable type.

  Attributes:
    type: which decoder to build; defaults to the YOLO decoder.
    yolo_decoder: parameters used when `type == 'yolo_decoder'`.
  """
  type: Optional[str] = 'yolo_decoder'
  # default_factory so every Decoder instance owns its own YoloDecoder
  # instead of all instances sharing one mutable default object.
  yolo_decoder: YoloDecoder = dataclasses.field(default_factory=YoloDecoder)
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float16'
num_gpus: 1
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'darknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v3
type: regular
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: greedy
iou_thresh: 0.25
nms_thresh: 0.45
pre_nms_points: 500
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: leaky
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 64
dtype: float32
input_path: 'gs://cam2-datasets/coco/train*'
is_training: true
drop_remainder: true
seed: 1000
parser:
mosaic:
mosaic_frequency: 0.75
mixup_frequency: 0.0
mosaic_crop_mode: 'crop'
mosaic_center: 0.2
aug_scale_min: 0.2
aug_scale_max: 1.6
jitter: 0.3
max_num_instances: 200
letter_box: False
random_flip: True
aug_rand_saturation: 1.5
aug_rand_brightness: 1.5
aug_rand_hue: 0.1
aug_scale_min: 0.1
aug_scale_max: 1.9
aug_rand_translate: 0.0
jitter: 0.3
area_thresh: 0.1
random_pad: True
use_tie_breaker: True
anchor_thresh: 0.213
validation_data:
global_batch_size: 8
dtype: float32
input_path: 'gs://cam2-datasets/coco/val*'
is_training: false
drop_remainder: true
parser:
max_num_instances: 200
letter_box: False
use_tie_breaker: True
anchor_thresh: 0.213
weight_decay: 0.000
init_checkpoint: 'gs://tensorflow2/darknet/cspdarknet53-golden'
init_checkpoint_modules: 'backbone'
annotation_file: null
trainer:
train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
validation_steps: 625
steps_per_loop: 1850
summary_interval: 1850
validation_interval: 9250
checkpoint_interval: 1850
optimizer_config:
ema:
average_decay: 0.9998
trainable_weights_only: False
dynamic_decay: True
learning_rate:
type: stepwise
stepwise:
boundaries: [400000, 450000]
name: PiecewiseConstantDecay
values: [0.00131, 0.000131, 0.0000131]
optimizer:
type: sgd_torch
sgd_torch:
momentum: 0.949
momentum_start: 0.949
nesterov: True
warmup_steps: 1000
weight_decay: 0.0005
sim_torch: true
name: SGD
warmup:
type: 'linear'
linear:
warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'darknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v3
type: regular
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: greedy
iou_thresh: 0.001
nms_thresh: 0.60
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: mish
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 64
dtype: float32
input_path: 'gs://cam2-datasets/coco/train*'
is_training: true
drop_remainder: true
seed: 1000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 200
letter_box: True
random_flip: True
aug_rand_saturation: 0.7
aug_rand_brightness: 0.4
aug_rand_hue: 0.015
aug_rand_translate: 0.1
area_thresh: 0.1
random_pad: False
use_tie_breaker: True
anchor_thresh: 0.213
validation_data:
global_batch_size: 8
dtype: float32
input_path: 'gs://cam2-datasets/coco/val*'
is_training: false
drop_remainder: true
parser:
max_num_instances: 300
letter_box: True
use_tie_breaker: True
anchor_thresh: 0.213
weight_decay: 0.000
init_checkpoint: 'gs://tensorflow2/darknet/cspdarknet53-golden'
init_checkpoint_modules: 'backbone'
annotation_file: null
trainer:
train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
validation_steps: 625
steps_per_loop: 1850
summary_interval: 1850
validation_interval: 9250
checkpoint_interval: 1850
optimizer_config:
ema:
average_decay: 0.9998
trainable_weights_only: False
dynamic_decay: True
learning_rate:
type: stepwise
stepwise:
boundaries: [400000, 450000]
name: PiecewiseConstantDecay
values: [0.00131, 0.000131, 0.0000131]
optimizer:
type: sgd_torch
sgd_torch:
momentum: 0.949
momentum_start: 0.949
nesterov: True
warmup_steps: 1000
weight_decay: 0.0005
sim_torch: true
name: SGD
warmup:
type: 'linear'
linear:
warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'darknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v3
type: regular
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: greedy
iou_thresh: 0.001
nms_thresh: 0.60
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: leaky
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 64
dtype: float32
input_path: 'gs://cam2-datasets/coco/train*'
is_training: true
drop_remainder: true
seed: 1000
parser:
mosaic:
mosaic_frequency: 0.75
mixup_frequency: 0.0
mosaic_crop_mode: 'crop'
mosaic_center: 0.2
aug_scale_min: 0.2
aug_scale_max: 1.6
jitter: 0.3
max_num_instances: 200
letter_box: False
random_flip: True
aug_rand_saturation: 1.5
aug_rand_brightness: 1.5
aug_rand_hue: 0.1
aug_scale_min: 0.1
aug_scale_max: 1.9
aug_rand_translate: 0.0
jitter: 0.3
area_thresh: 0.1
random_pad: True
use_tie_breaker: True
anchor_thresh: 0.213
validation_data:
global_batch_size: 8
dtype: float32
input_path: 'gs://cam2-datasets/coco/val*'
is_training: false
drop_remainder: true
parser:
max_num_instances: 200
letter_box: False
use_tie_breaker: True
anchor_thresh: 0.213
weight_decay: 0.000
init_checkpoint: 'gs://tensorflow2/darknet/cspdarknet53-golden'
init_checkpoint_modules: 'backbone'
annotation_file: null
trainer:
train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
validation_steps: 625
steps_per_loop: 1850
summary_interval: 1850
validation_interval: 9250
checkpoint_interval: 1850
optimizer_config:
ema:
average_decay: 0.9998
trainable_weights_only: False
dynamic_decay: True
learning_rate:
type: stepwise
stepwise:
boundaries: [400000, 450000]
name: PiecewiseConstantDecay
values: [0.00131, 0.000131, 0.0000131]
optimizer:
type: sgd_torch
sgd_torch:
momentum: 0.949
momentum_start: 0.949
nesterov: True
warmup_steps: 1000
weight_decay: 0.0005
sim_torch: true
name: SGD
warmup:
type: 'linear'
linear:
warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float16'
num_gpus: 1
task:
smart_bias_lr: 0.1
model:
darknet_based_model: False
input_size: [640, 640, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'altered_cspdarknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp
head:
smart_bias: true
detection_generator:
box_type:
'all': scaled
scale_xy:
'all': 2.0
max_boxes: 300
nms_type: greedy
iou_thresh: 0.25
nms_thresh: 0.45
pre_nms_points: 500
loss:
use_scaled_loss: true
update_on_repeat: true
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.05
cls_normalizer:
'all': 0.3
obj_normalizer:
'5': 0.28
'4': 0.70
'3': 2.80
objectness_smooth:
'all': 1.0
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 1
input_path: '/media/vbanna/DATA_SHARE/CV/datasets/COCO_raw/records/train*'
shuffle_buffer_size: 10000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
letter_box: True
random_flip: True
aug_rand_translate: 0.1
area_thresh: 0.1
validation_data:
global_batch_size: 1
input_path: '/media/vbanna/DATA_SHARE/CV/datasets/COCO_raw/records/val*'
\ No newline at end of file
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'float32'
tpu_enable_xla_dynamic_padder: false
task:
smart_bias_lr: 0.1
model:
darknet_based_model: False
input_size: [640, 640, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'altered_cspdarknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp
head:
smart_bias: true
detection_generator:
box_type:
'all': scaled
scale_xy:
'all': 2.0
max_boxes: 300
nms_type: greedy
iou_thresh: 0.001
nms_thresh: 0.60
loss:
use_scaled_loss: true
update_on_repeat: true
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.05
cls_normalizer:
'all': 0.3
obj_normalizer:
'5': 0.28
'4': 0.70
'3': 2.80
objectness_smooth:
'all': 1.0
norm_activation:
activation: mish
norm_epsilon: 0.0001
norm_momentum: 0.97
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 64
dtype: float32
input_path: 'gs://cam2-datasets/coco/train*'
is_training: true
shuffle_buffer_size: 10000
drop_remainder: true
seed: 1000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 300
letter_box: True
random_flip: True
aug_rand_saturation: 0.7
aug_rand_brightness: 0.4
aug_rand_hue: 0.015
aug_rand_translate: 0.1
area_thresh: 0.1
random_pad: False
use_tie_breaker: True
anchor_thresh: 4.0
best_match_only: True
validation_data:
global_batch_size: 8
dtype: float32
input_path: 'gs://cam2-datasets/coco/val*'
is_training: false
shuffle_buffer_size: 10
drop_remainder: true
parser:
max_num_instances: 300
letter_box: True
use_tie_breaker: True
anchor_thresh: 4.0
best_match_only: True
weight_decay: 0.000
annotation_file: null
trainer:
train_steps: 555000 # 160 epochs at 64 batchsize -> 500500 * 64/2
validation_steps: 625
steps_per_loop: 1850
summary_interval: 1850
validation_interval: 1850
checkpoint_interval: 1850
optimizer_config:
ema:
average_decay: 0.9999
trainable_weights_only: False
dynamic_decay: True
learning_rate:
type: cosine
cosine:
initial_learning_rate: 0.01
name: Cosine
alpha: 0.2
decay_steps: 555000
optimizer:
type: sgd_torch
sgd_torch:
momentum: 0.937
momentum_start: 0.8
nesterov: True
warmup_steps: 5550
weight_decay: 0.0005
sim_torch: true
name: SGD
warmup:
type: 'linear'
linear:
warmup_steps: 5550  # learning rate rises from 0 to 0.01 over 5550 steps
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'float32'
tpu_enable_xla_dynamic_padder: false
task:
model:
input_size: [640, 640, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'altered_cspdarknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp
head:
smart_bias: true
detection_generator:
box_type:
'all': scaled
scale_xy:
'all': 2.0
max_boxes: 300
nms_type: iou
iou_thresh: 0.001
nms_thresh: 0.60
loss:
use_scaled_loss: true
update_on_repeat: true
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.0
iou_normalizer:
'all': 0.05
cls_normalizer:
'all': 0.3
obj_normalizer:
'5': 0.28
'4': 0.70
'3': 2.80
objectness_smooth:
'all': 1.0
norm_activation:
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
input_path: 'gs://cam2-datasets/coco/train*'
shuffle_buffer_size: 10000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 300
letter_box: True
random_flip: True
aug_rand_translate: 0.1
area_thresh: 0.1
validation_data:
input_path: 'gs://cam2-datasets/coco/val*'
\ No newline at end of file
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float16'
num_gpus: 1
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'swin'
swin:
min_level: 3
max_level: 5
patch_size: 4
embed_dims: 96
window_size: [7, 7, 7, 7]
depths: [2, 2, 6, 2]
num_heads: [3, 6, 12, 24]
drop_path: 0.0
absolute_positional_embed: False
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp
activation: leaky
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: greedy
iou_thresh: 0.25
nms_thresh: 0.45
pre_nms_points: 500
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: gelu
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: false
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 4
dtype: float16
input_path: '/media/vbanna/DATA_SHARE/CV/datasets/COCO_raw/records/train*'
is_training: true
drop_remainder: true
seed: 1000
parser:
mosaic:
mosaic_frequency: 0.6
mixup_frequency: 0.0
mosaic_crop_mode: 'crop'
mosaic_center: 0.2
aug_scale_min: 0.2
aug_scale_max: 1.6
jitter: 0.3
max_num_instances: 200
letter_box: True
random_flip: True
aug_rand_saturation: 1.5
aug_rand_brightness: 1.5
aug_rand_hue: 0.1
aug_scale_min: 1.0
aug_scale_max: 1.0
aug_rand_translate: 0.0
jitter: 0.3
area_thresh: 0.1
random_pad: True
use_tie_breaker: True
anchor_thresh: 0.213
validation_data:
global_batch_size: 8
dtype: float16
input_path: '/media/vbanna/DATA_SHARE/CV/datasets/COCO_raw/records/val*'
is_training: false
drop_remainder: true
parser:
max_num_instances: 200
letter_box: True
use_tie_breaker: True
anchor_thresh: 0.213
weight_decay: 0.000
init_checkpoint: '../checkpoints/swin-baseline-3'
init_checkpoint_modules: 'backbone'
annotation_file: null
trainer:
train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
validation_steps: 625
steps_per_loop: 10
summary_interval: 10
validation_interval: 9250
checkpoint_interval: 1850
optimizer_config:
ema: null
learning_rate:
type: stepwise
stepwise:
boundaries: [400000, 450000]
name: PiecewiseConstantDecay
values: [0.00131, 0.000131, 0.0000131]
optimizer:
type: sgd_torch
sgd_torch:
momentum: 0.949
momentum_start: 0.949
nesterov: True
warmup_steps: 1000
weight_decay: 0.0005
sim_torch: true
name: SGD
warmup:
type: 'linear'
linear:
warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float16'
num_gpus: 1
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'cspdarknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: regular
activation: leaky
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: greedy
iou_thresh: 0.25
nms_thresh: 0.45
pre_nms_points: 500
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: mish
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
# global_batch_size: 64
# dtype: float32
input_path: '/media/vbanna/DATA_SHARE/CV/datasets/COCO_raw/records/train*'
# is_training: true
# drop_remainder: true
# seed: 1000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 200
letter_box: True
random_flip: True
aug_rand_translate: 0.1
random_pad: False
validation_data:
# global_batch_size: 1
# dtype: float32
input_path: '/media/vbanna/DATA_SHARE/CV/datasets/COCO_raw/records/val*'
# is_training: false
# drop_remainder: true
# parser:
# max_num_instances: 200
# letter_box: True
# use_tie_breaker: True
# anchor_thresh: 0.213
# weight_decay: 0.000
# init_checkpoint: '../checkpoints/512-wd-baseline-e1'
# init_checkpoint_modules: 'all'
# annotation_file: null
# trainer:
# train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
# validation_steps: 625
# steps_per_loop: 1850
# summary_interval: 1850
# validation_interval: 9250
# checkpoint_interval: 1850
# optimizer_config:
# ema:
# average_decay: 0.9998
# trainable_weights_only: False
# dynamic_decay: True
# learning_rate:
# type: stepwise
# stepwise:
# boundaries: [400000, 450000]
# name: PiecewiseConstantDecay
# values: [0.00131, 0.000131, 0.0000131]
# optimizer:
# type: sgd_torch
# sgd_torch:
# momentum: 0.949
# momentum_start: 0.949
# nesterov: True
# warmup_steps: 1000
# weight_decay: 0.0005
# sim_torch: true
# name: SGD
# warmup:
# type: 'linear'
# linear:
# warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'cspdarknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: regular
activation: leaky
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: iou
iou_thresh: 0.001
nms_thresh: 0.60
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: mish
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
input_path: 'gs://cam2-datasets/coco/train*'
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
jitter: 0.3
max_num_instances: 200
letter_box: False
random_flip: True
aug_rand_translate: 0.1
random_pad: False
validation_data:
input_path: 'gs://cam2-datasets/coco/val*'
parser:
letter_box: False
weight_decay: 0.000
init_checkpoint: 'gs://tensorflow2/darknet/cspdarknet53-golden'
init_checkpoint_modules: 'backbone'
annotation_file: null
# trainer:
# train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
# validation_steps: 625
# steps_per_loop: 1850
# summary_interval: 1850
# validation_interval: 9250
# checkpoint_interval: 1850
# optimizer_config:
# ema:
# average_decay: 0.9998
# trainable_weights_only: False
# dynamic_decay: True
# learning_rate:
# type: stepwise
# stepwise:
# boundaries: [400000, 450000]
# values: [0.00131, 0.000131, 0.0000131]
# optimizer:
# type: sgd_torch
# sgd_torch:
# momentum: 0.949
# momentum_start: 0.949
# nesterov: True
# warmup_steps: 1000
# weight_decay: 0.0005
# sim_torch: true
# name: SGD
# warmup:
# type: 'linear'
# linear:
# warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'swin'
swin:
min_level: 3
max_level: 5
patch_size: 4
embed_dims: 96
window_size: [7, 7, 7, 7]
depths: [2, 2, 6, 2]
num_heads: [3, 6, 12, 24]
drop_path: 0.0
absolute_positional_embed: False
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp
activation: leaky
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: greedy
iou_thresh: 0.001
nms_thresh: 0.60
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: mish
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 64
dtype: float32
input_path: 'gs://cam2-datasets/coco/train*'
is_training: true
drop_remainder: true
seed: 1000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 200
letter_box: True
random_flip: True
aug_rand_saturation: 0.7
aug_rand_brightness: 0.4
aug_rand_hue: 0.015
aug_rand_translate: 0.1
area_thresh: 0.1
random_pad: False
use_tie_breaker: True
anchor_thresh: 0.213
validation_data:
global_batch_size: 8
dtype: float32
input_path: 'gs://cam2-datasets/coco/val*'
is_training: false
drop_remainder: true
parser:
max_num_instances: 300
letter_box: True
use_tie_breaker: True
anchor_thresh: 0.213
weight_decay: 0.000
init_checkpoint: 'gs://tensorflow2/darknet/cspdarknet53-golden'
init_checkpoint_modules: 'backbone'
annotation_file: null
trainer:
train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
validation_steps: 625
steps_per_loop: 1850
summary_interval: 1850
validation_interval: 9250
checkpoint_interval: 1850
optimizer_config:
ema: null
learning_rate:
type: stepwise
stepwise:
boundaries: [400000, 450000]
name: PiecewiseConstantDecay
values: [0.00131, 0.000131, 0.0000131]
optimizer:
type: sgd_torch
sgd_torch:
momentum: 0.949
momentum_start: 0.949
nesterov: True
warmup_steps: 1000
weight_decay: 0.0005
sim_torch: true
name: SGD
warmup:
type: 'linear'
linear:
warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
smart_bias_lr: 0.0
model:
darknet_based_model: True
input_size: [512, 512, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'cspdarknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: regular
activation: leaky
head:
smart_bias: true
detection_generator:
box_type:
'all': original
scale_xy:
'5': 1.05
'4': 1.1
'3': 1.2
max_boxes: 200
nms_type: greedy
iou_thresh: 0.001
nms_thresh: 0.60
loss:
use_scaled_loss: False
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.7
iou_normalizer:
'all': 0.07
cls_normalizer:
'all': 1.0
obj_normalizer:
'all': 1.0
objectness_smooth:
'all': 0.0
max_delta:
'all': 5.0
norm_activation:
activation: mish
norm_epsilon: 0.0001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
global_batch_size: 64
dtype: float32
input_path: 'gs://cam2-datasets/coco/train*'
is_training: true
drop_remainder: true
seed: 1000
parser:
mosaic:
mosaic_frequency: 0.75
mixup_frequency: 0.0
mosaic_crop_mode: 'crop'
mosaic_center: 0.2
aug_scale_min: 0.2
aug_scale_max: 1.6
jitter: 0.3
max_num_instances: 200
letter_box: False
random_flip: True
aug_rand_saturation: 1.5
aug_rand_brightness: 1.5
aug_rand_hue: 0.1
aug_scale_min: 0.1
aug_scale_max: 1.9
aug_rand_translate: 0.0
jitter: 0.3
area_thresh: 0.1
random_pad: True
use_tie_breaker: True
anchor_thresh: 0.213
validation_data:
global_batch_size: 8
dtype: float32
input_path: 'gs://cam2-datasets/coco/val*'
is_training: false
drop_remainder: true
parser:
max_num_instances: 200
letter_box: False
use_tie_breaker: True
anchor_thresh: 0.213
weight_decay: 0.000
init_checkpoint: 'gs://tensorflow2/darknet/cspdarknet53-golden'
init_checkpoint_modules: 'backbone'
annotation_file: null
trainer:
train_steps: 500500 # 160 epochs at 64 batchsize -> 500500 * 64/2
validation_steps: 625
steps_per_loop: 1850
summary_interval: 1850
validation_interval: 9250
checkpoint_interval: 1850
optimizer_config:
ema:
average_decay: 0.9998
trainable_weights_only: False
dynamic_decay: True
learning_rate:
type: stepwise
stepwise:
boundaries: [400000, 450000]
name: PiecewiseConstantDecay
values: [0.00131, 0.000131, 0.0000131]
optimizer:
type: sgd_torch
sgd_torch:
momentum: 0.949
momentum_start: 0.949
nesterov: True
warmup_steps: 1000
weight_decay: 0.0005
sim_torch: true
name: SGD
warmup:
type: 'linear'
linear:
warmup_steps: 1000 #learning rate rises from 0 to 0.0013 over 1000 steps
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""YOLO configuration definition."""
from typing import List, Optional, Union
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling.hyperparams import config_definitions as cfg
from official.vision.beta.configs import common
from official.vision.beta.projects.yolo import optimization
from official.vision.beta.projects.yolo.configs import backbones
from official.vision.beta.projects.yolo.configs import decoders
import numpy as np
import dataclasses
MIN_LEVEL = 1
MAX_LEVEL = 7
def _build_dict(min_level, max_level, value):
vals = {str(key): value for key in range(min_level, max_level + 1)}
vals["all"] = None
return lambda: vals
def _build_path_scales(min_level, max_level):
return lambda: {str(key): 2**key for key in range(min_level, max_level + 1)}
@dataclasses.dataclass
class FPNConfig(hyperparams.Config):
  """Config holding one value per FPN level plus an 'all' broadcast key."""

  def get(self):
    """Return the per-level dict, broadcasting 'all' over every level if set."""
    values = self.as_dict()
    fill = values.get("all")
    if fill is not None:
      for key in values:
        if key != "all":
          values[key] = fill
    return values
# pylint: disable=missing-class-docstring
@dataclasses.dataclass
class TfExampleDecoder(hyperparams.Config):
  """Config for the plain tf.Example data decoder."""
  regenerate_source_id: bool = False
  coco91_to_80: bool = True  # remap COCO's 91 raw class ids to the 80 used here
@dataclasses.dataclass
class TfExampleDecoderLabelMap(hyperparams.Config):
  """Config for a tf.Example decoder that uses an external label map file."""
  regenerate_source_id: bool = False
  label_map: str = ''  # path to the label map; empty means unset
@dataclasses.dataclass
class DataDecoder(hyperparams.OneOfConfig):
  """One-of config selecting which tf.Example decoder to use.

  Attributes:
    type: the selected decoder; defaults to the simple decoder.
    simple_decoder: options for the plain tf.Example decoder.
    label_map_decoder: options for the label-map-based decoder.
  """
  type: Optional[str] = 'simple_decoder'
  # default_factory so each DataDecoder owns its own sub-config instances
  # rather than all instances sharing one mutable default object.
  simple_decoder: TfExampleDecoder = dataclasses.field(
      default_factory=TfExampleDecoder)
  label_map_decoder: TfExampleDecoderLabelMap = dataclasses.field(
      default_factory=TfExampleDecoderLabelMap)
@dataclasses.dataclass
class Mosaic(hyperparams.Config):
  """Parameters for mosaic/mixup image-aggregation augmentation.

  A frequency of 0.0 disables the corresponding augmentation; a
  `mosaic_crop_mode` of None skips the post-mosaic crop (modes seen in the
  experiment configs: 'crop', 'scale').
  """
  mosaic_frequency: float = 0.0
  mixup_frequency: float = 0.0
  mosaic_center: float = 0.2
  mosaic_crop_mode: Optional[str] = None
  aug_scale_min: float = 1.0  # min/max of 1.0 means no random scaling
  aug_scale_max: float = 1.0
  jitter: float = 0.0
@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Parameters for YOLO input parsing, augmentation and anchor matching.

  Augmentation strengths of 0.0 disable the corresponding transform.
  """
  max_num_instances: int = 200  # max boxes retained per image
  letter_box: Optional[bool] = True
  random_flip: bool = True
  # Fixed annotation: this is a boolean toggle, not a float (the original
  # declared `float = False`).
  random_pad: bool = False
  jitter: float = 0.0
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_saturation: float = 0.0
  aug_rand_brightness: float = 0.0
  aug_rand_hue: float = 0.0
  aug_rand_angle: float = 0.0
  aug_rand_translate: float = 0.0
  aug_rand_perspective: float = 0.0
  use_tie_breaker: bool = True
  best_match_only: bool = False
  # Negative value presumably disables anchor thresholding — TODO confirm
  # against the anchor-matching code.
  anchor_thresh: float = -0.01
  area_thresh: float = 0.1
  # default_factory so each Parser owns its own Mosaic config instead of all
  # instances sharing one mutable default object.
  mosaic: Mosaic = dataclasses.field(default_factory=Mosaic)
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for YOLO training and evaluation data."""
  input_path: str = ''
  # None means "not using TFDS"; the original annotated these as plain `str`.
  tfds_name: Optional[str] = None
  tfds_split: Optional[str] = None
  # The original declared `global_batch_size` twice (64, then 1); the later
  # declaration was the effective default, so 1 is kept.
  global_batch_size: int = 1
  is_training: bool = True
  dtype: str = 'float16'
  # default_factory so each DataConfig owns its own decoder/parser sub-configs
  # rather than all instances sharing one mutable default object.
  decoder: DataDecoder = dataclasses.field(default_factory=DataDecoder)
  parser: Parser = dataclasses.field(default_factory=Parser)
  shuffle_buffer_size: int = 10000
  tfds_download: bool = True
  cache: bool = False
@dataclasses.dataclass
class YoloHead(hyperparams.Config):
  """Parameterization for the YOLO Head."""
  # Enables the smart-bias initialization toggled by `smart_bias` in the
  # experiment configs above.
  smart_bias: bool = True
@dataclasses.dataclass
class YoloDetectionGenerator(hyperparams.Config):
  """Parameterization of box decoding and NMS for detection generation."""
  # Per-FPN-level box decoding type, 'original' at every level by default.
  box_type: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, "original"))
  # Per-level scale applied to predicted x/y offsets, 1.0 by default.
  scale_xy: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level stride map built by _build_path_scales (defined earlier).
  path_scales: FPNConfig = dataclasses.field(
      default_factory=_build_path_scales(MIN_LEVEL, MAX_LEVEL))
  # NMS algorithm selector.
  nms_type: str = 'greedy'
  # Confidence threshold for keeping candidates before NMS.
  iou_thresh: float = 0.001
  # IoU overlap threshold used by NMS to suppress duplicates.
  nms_thresh: float = 0.6
  # Maximum number of detections returned per image.
  max_boxes: int = 200
  # Number of top-scoring candidates retained prior to NMS.
  pre_nms_points: int = 5000
@dataclasses.dataclass
class YoloLoss(hyperparams.Config):
  """Parameterization of the YOLO loss; most fields are per-FPN-level maps."""
  # Per-level IoU thresholds -- presumably control which predictions are
  # ignored/forced-matched in the objectness loss; TODO(review): confirm
  # against the loss implementation.
  ignore_thresh: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 0.0))
  truth_thresh: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level box regression loss type, 'ciou' at every level by default.
  box_loss_type: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 'ciou'))
  # Per-level weights for the IoU, classification, and objectness terms.
  iou_normalizer: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  cls_normalizer: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  obj_normalizer: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 1.0))
  # Per-level max delta; np.inf disables the clamp.
  max_delta: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, np.inf))
  # Per-level objectness smoothing factor; 0.0 disables smoothing.
  objectness_smooth: FPNConfig = dataclasses.field(
      default_factory=_build_dict(MIN_LEVEL, MAX_LEVEL, 0.0))
  label_smoothing: float = 0.0
  # Selects the scaled-YOLO loss formulation over the original darknet one
  # (the yolo_darknet experiment below sets this to False).
  use_scaled_loss: bool = True
  update_on_repeat: bool = True
@dataclasses.dataclass  # was missing: every sibling config class carries it
class Box(hyperparams.Config):
  """A single anchor-box prior.

  `box` holds the prior's dimensions as integers -- presumably
  [width, height] in pixels; TODO(review): confirm against the anchor
  generation code (AnchorBoxes.get reads `box.box` directly).
  """
  # `default_factory=list` (not `default=list`): the original passed the
  # `list` type object itself as the default value instead of producing a
  # fresh empty list per instance.
  box: List[int] = dataclasses.field(default_factory=list)
@dataclasses.dataclass
class AnchorBoxes(hyperparams.Config):
  """Anchor-box priors shared across FPN levels."""
  # Explicit anchor priors; must be set unless `level_limits` is provided,
  # since get() reads `box.box` from each entry.
  boxes: Optional[List[Box]] = None
  # When set, get() substitutes one unit box [1.0, 1.0] per level instead of
  # using `boxes`.
  level_limits: Optional[List[int]] = None
  anchors_per_scale: int = 3
  def get(self, min_level, max_level):
    """Return ({str(level): anchors}, level_limits) for the level range.

    Args:
      min_level: lowest FPN level, inclusive.
      max_level: highest FPN level, inclusive.

    Returns:
      A tuple of (dict mapping level string to its slice of anchor boxes,
      self.level_limits).
    """
    if self.level_limits is None:
      boxes = [box.box for box in self.boxes]
    else:
      boxes = [[1.0, 1.0]] * ((max_level - min_level) + 1)
      # NOTE(review): side effect -- anchors_per_scale is overwritten in
      # place so the slicing below yields exactly one box per level.
      self.anchors_per_scale = 1
    anchors_per_level = dict()
    start = 0
    # Hand each level its consecutive chunk of `anchors_per_scale` boxes.
    for i in range(min_level, max_level + 1):
      anchors_per_level[str(i)] = boxes[start:start + self.anchors_per_scale]
      start += self.anchors_per_scale
    return anchors_per_level, self.level_limits
@dataclasses.dataclass
class Yolo(hyperparams.Config):
  """Top-level YOLO model config: backbone, decoder, head, and loss."""
  # Input [height, width, channels]; 512x512 RGB by default.
  input_size: Optional[List[int]] = dataclasses.field(
      default_factory=lambda: [512, 512, 3])
  # CSP-Darknet53 backbone by default.
  backbone: backbones.Backbone = backbones.Backbone(
      type='darknet', darknet=backbones.Darknet(model_id='cspdarknet53'))
  # YOLOv4 "regular" FPN/PAN decoder by default.
  decoder: decoders.Decoder = decoders.Decoder(
      type='yolo_decoder', yolo_decoder=decoders.YoloDecoder(
          version='v4', type='regular'
      )
  )
  head: YoloHead = YoloHead()
  detection_generator: YoloDetectionGenerator = YoloDetectionGenerator()
  loss: YoloLoss = YoloLoss()
  norm_activation: common.NormActivation = common.NormActivation(
      activation='mish',
      use_sync_bn=True,
      norm_momentum=0.99,
      norm_epsilon=0.001)
  # COCO's 80 object categories by default.
  num_classes: int = 80
  anchor_boxes: AnchorBoxes = AnchorBoxes()
  # Presumably selects the original darknet-style model construction over
  # the scaled variant (set True by the yolo_darknet experiment below);
  # TODO(review): confirm against the model builder.
  darknet_based_model: bool = False
@dataclasses.dataclass
class YoloTask(cfg.TaskConfig):
  """YOLO detection task config: model, data, and checkpoint options."""
  per_category_metrics: bool = False
  # Presumably a learning-rate scale for the head's smart-bias variables;
  # 0.0 disables it. TODO(review): confirm against the task implementation.
  smart_bias_lr: float = 0.0
  model: Yolo = Yolo()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(is_training=False)
  weight_decay: float = 0.0
  # Optional COCO-style annotation file used for evaluation.
  annotation_file: Optional[str] = None
  # Checkpoint to initialize from, and which modules to restore.
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[
      str, List[str]] = 'all'  # all, backbone, and/or decoder
  # Global-norm gradient clipping; 0.0 disables clipping.
  gradient_clip_norm: float = 0.0
# COCO dataset constants consumed by the experiment factories below.
COCO_INPUT_PATH_BASE = 'coco'
COCO_TRAIN_EXAMPLES = 118287
COCO_VAL_EXAMPLES = 5000
# Seed forwarded to the training DataConfig for reproducible input pipelines.
GLOBAL_SEED = 1000
@exp_factory.register_config_factory('yolo')
def yolo() -> cfg.ExperimentConfig:
  """Yolo general config."""
  restrictions = [
      'task.train_data.is_training != None',
      'task.validation_data.is_training != None'
  ]
  # Bare task with all defaults; callers override fields via the yaml/flags.
  return cfg.ExperimentConfig(task=YoloTask(), restrictions=restrictions)
@exp_factory.register_config_factory('yolo_darknet')
def yolo_darknet() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv3 and v4"""
  # Training recipe hyperparameters.
  train_batch_size = 64
  eval_batch_size = 8
  train_epochs = 300
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  validation_interval = 5
  max_num_instances = 200
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=YoloTask(
          smart_bias_lr=0.1,
          init_checkpoint='',
          init_checkpoint_modules='backbone',
          annotation_file=None,
          weight_decay=0.0,
          model=Yolo(
              # Original darknet formulation: unscaled loss, darknet model.
              darknet_based_model = True,
              norm_activation=common.NormActivation(use_sync_bn=True),
              head=YoloHead(smart_bias=True),
              loss=YoloLoss(use_scaled_loss=False, update_on_repeat=True)),
          train_data=DataConfig(
              is_training=True,
              global_batch_size=train_batch_size,
              seed=GLOBAL_SEED,
              dtype='float32',
              parser=Parser(
                  letter_box=False,
                  aug_rand_saturation= 1.5,
                  aug_rand_brightness= 1.5,
                  aug_rand_hue= 0.1,
                  use_tie_breaker=True,
                  best_match_only=False,
                  anchor_thresh=0.213,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
                  mosaic=Mosaic(
                      mosaic_frequency= 0.75,
                      mixup_frequency= 0.0,
                      mosaic_crop_mode= 'crop',
                      mosaic_center= 0.2
                  )
              )),
          validation_data=DataConfig(
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=True,
              dtype='float32',
              parser=Parser(
                  letter_box=False,
                  use_tie_breaker=True,
                  best_match_only=False,
                  anchor_thresh=0.213,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
              ))),
      trainer=cfg.TrainerConfig(
          train_steps=train_epochs * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=validation_interval * steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'ema':{
                  'average_decay': 0.9998,
                  'trainable_weights_only': False,
                  'dynamic_decay': True,
              },
              'optimizer': {
                  'type': 'sgd_torch',
                  'sgd_torch': {
                      'momentum': 0.949,
                      'momentum_start': 0.949,
                      'nesterov': True,
                      'warmup_steps': 1000,
                      'weight_decay': 0.0005,
                      'sim_torch': True,
                  }
              },
              'learning_rate': {
                  # Stepwise schedule: 10x drops at epochs 240 and 270, with
                  # values linearly scaled by batch size relative to 64.
                  'type': 'stepwise',
                  'stepwise': {
                      'boundaries': [240 * steps_per_epoch, 270*steps_per_epoch],
                      'values': [
                          0.00131 * train_batch_size / 64.0,
                          0.000131 * train_batch_size / 64.0,
                          0.0000131 * train_batch_size / 64.0
                      ]
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 1000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
@exp_factory.register_config_factory('scaled_yolo')
def scaled_yolo() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv4-csp and v4"""
  # Training recipe hyperparameters.
  train_batch_size = 128
  eval_batch_size = 8
  train_epochs = 300
  warmup_epochs = 3
  validation_interval = 5
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  max_num_instances = 300
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=YoloTask(
          smart_bias_lr=0.1,
          init_checkpoint_modules=None,
          annotation_file=None,
          weight_decay=0.0,
          model=Yolo(
              # Scaled-YOLO formulation: scaled loss, non-darknet model.
              darknet_based_model = False,
              norm_activation=common.NormActivation(
                  activation='mish',
                  use_sync_bn=True,
                  norm_epsilon=0.0001,
                  norm_momentum=0.97),
              head=YoloHead(smart_bias=True),
              loss=YoloLoss(use_scaled_loss=True)),
          train_data=DataConfig(
              is_training=True,
              global_batch_size=train_batch_size,
              seed=GLOBAL_SEED,
              dtype='float32',
              parser=Parser(
                  aug_rand_saturation = 0.7,
                  aug_rand_brightness = 0.4,
                  aug_rand_hue = 0.015,
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  random_pad=False,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
                  mosaic=Mosaic(
                      mosaic_crop_mode='scale',
                      mosaic_frequency=1.0,
                      mixup_frequency=0.0,
                  )
              )),
          validation_data=DataConfig(
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=True,
              dtype='float32',
              parser=Parser(
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
              ))),
      trainer=cfg.TrainerConfig(
          train_steps=train_epochs * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=validation_interval * steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'ema':{
                  'average_decay': 0.9999,
                  'trainable_weights_only': False,
                  'dynamic_decay': True,
              },
              'optimizer': {
                  'type': 'sgd_torch',
                  'sgd_torch': {
                      'momentum': 0.937,
                      'momentum_start': 0.8,
                      'nesterov': True,
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      # Weight decay scaled linearly with batch size.
                      'weight_decay': 0.0005 * train_batch_size/64.0,
                      'sim_torch': True,
                  }
              },
              'learning_rate': {
                  # Cosine decay over the full training run.
                  'type': 'cosine',
                  'cosine': {
                      'initial_learning_rate': 0.01,
                      'alpha': 0.2,
                      'decay_steps': train_epochs * steps_per_epoch,
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
\ No newline at end of file
...@@ -383,7 +383,7 @@ class Darknet(tf.keras.Model): ...@@ -383,7 +383,7 @@ class Darknet(tf.keras.Model):
max_level=5, max_level=5,
width_scale=1.0, width_scale=1.0,
depth_scale=1.0, depth_scale=1.0,
use_reorg_input=False, use_reorg_input = False,
csp_level_mod=(), csp_level_mod=(),
activation=None, activation=None,
use_sync_bn=False, use_sync_bn=False,
...@@ -454,6 +454,9 @@ class Darknet(tf.keras.Model): ...@@ -454,6 +454,9 @@ class Darknet(tf.keras.Model):
def _build_struct(self, net, inputs): def _build_struct(self, net, inputs):
if self._use_reorg_input: if self._use_reorg_input:
inputs = nn_blocks.Reorg()(inputs) inputs = nn_blocks.Reorg()(inputs)
net[0].filters = net[1].filters
net[0].output_name = net[1].output_name
del net[1]
endpoints = collections.OrderedDict() endpoints = collections.OrderedDict()
stack_outputs = [inputs] stack_outputs = [inputs]
...@@ -666,7 +669,6 @@ class Darknet(tf.keras.Model): ...@@ -666,7 +669,6 @@ class Darknet(tf.keras.Model):
} }
return layer_config return layer_config
@factory.register_backbone_builder('darknet') @factory.register_backbone_builder('darknet')
def build_darknet( def build_darknet(
input_specs: tf.keras.layers.InputSpec, input_specs: tf.keras.layers.InputSpec,
......
...@@ -12,11 +12,66 @@ ...@@ -12,11 +12,66 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Lint as: python3
"""Feature Pyramid Network and Path Aggregation variants used in YOLO.""" """Feature Pyramid Network and Path Aggregation variants used in YOLO."""
from typing import Mapping, Union
from official.modeling import hyperparams
import tensorflow as tf import tensorflow as tf
from official.vision.beta.projects.yolo.modeling.layers import nn_blocks from official.vision.beta.projects.yolo.modeling.layers import nn_blocks
from official.vision.beta.modeling.decoders import factory
# model configurations
# the structure is as follows. model version, {v3, v4, v#, ... etc}
# the model config type {regular, tiny, small, large, ... etc}
YOLO_MODELS = {
"v4":
dict(
regular=dict(
embed_spp=False,
use_fpn=True,
max_level_process_len=None,
path_process_len=6),
tiny=dict(
embed_spp=False,
use_fpn=False,
max_level_process_len=2,
path_process_len=1),
csp=dict(
embed_spp=False,
use_fpn=True,
max_level_process_len=None,
csp_stack=5,
fpn_depth=5,
path_process_len=6),
csp_large=dict(
embed_spp=False,
use_fpn=True,
max_level_process_len=None,
csp_stack=7,
fpn_depth=7,
path_process_len=8,
fpn_filter_scale=2),
),
"v3":
dict(
regular=dict(
embed_spp=False,
use_fpn=False,
max_level_process_len=None,
path_process_len=6),
tiny=dict(
embed_spp=False,
use_fpn=False,
max_level_process_len=2,
path_process_len=1),
spp=dict(
embed_spp=True,
use_fpn=False,
max_level_process_len=2,
path_process_len=1),
),
}
@tf.keras.utils.register_keras_serializable(package='yolo') @tf.keras.utils.register_keras_serializable(package='yolo')
class _IdentityRoute(tf.keras.layers.Layer): class _IdentityRoute(tf.keras.layers.Layer):
...@@ -487,3 +542,65 @@ class YoloDecoder(tf.keras.Model): ...@@ -487,3 +542,65 @@ class YoloDecoder(tf.keras.Model):
@classmethod @classmethod
def from_config(cls, config, custom_objects=None): def from_config(cls, config, custom_objects=None):
return cls(**config) return cls(**config)
@factory.register_decoder_builder('yolo_decoder')
def build_yolo_decoder(input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: tf.keras.regularizers.Regularizer = None,
**kwargs) -> Union[None, tf.keras.Model, tf.keras.layers.Layer]:
"""Builds Yolo FPN/PAN decoder from a config.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A OneOfConfig. Model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
None.
Returns:
A `tf.keras.Model` instance of the Yolo FPN/PAN decoder.
"""
decoder_cfg = model_config.decoder.get()
norm_activation_config = model_config.norm_activation
activation = (
decoder_cfg.activation
if decoder_cfg.activation != "same" else
norm_activation_config.activation)
if decoder_cfg.version is None: # custom yolo
raise Exception("decoder version cannot be None, specify v3 or v4")
if decoder_cfg.version not in YOLO_MODELS:
raise Exception(
"unsupported model version please select from {v3, v4}, \n\n \
or specify a custom decoder config using YoloDecoder in you yaml")
if decoder_cfg.type == None:
decoder_cfg.type = "regular"
if decoder_cfg.type not in YOLO_MODELS[decoder_cfg.version]:
raise Exception("unsupported model type please select from \
{yolo_model.YOLO_MODELS[decoder_cfg.version].keys()},\
\n\n or specify a custom decoder config using YoloDecoder in you yaml")
base_model = YOLO_MODELS[decoder_cfg.version][decoder_cfg.type]
cfg_dict = decoder_cfg.as_dict()
for key in base_model:
if cfg_dict[key] is not None:
base_model[key] = cfg_dict[key]
base_dict = dict(
activation=activation,
use_spatial_attention=decoder_cfg.use_spatial_attention,
use_separable_conv=decoder_cfg.use_separable_conv,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
base_model.update(base_dict)
model = YoloDecoder(input_specs, **base_model)
return model
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment