Commit 9cd84cc1 authored by A. Unique TensorFlower

Merge pull request #10333 from PurdueDualityLab:exp_pr2

PiperOrigin-RevId: 409474266
parents 987238e6 7a45b513
...
@@ -33,6 +33,8 @@ class YoloDecoder(hyperparams.Config):
   use_separable_conv: bool = False
   csp_stack: Optional[bool] = None
   fpn_depth: Optional[int] = None
+  max_fpn_depth: Optional[int] = None
+  max_csp_stack: Optional[int] = None
   fpn_filter_scale: Optional[int] = None
   path_process_len: Optional[int] = None
   max_level_process_len: Optional[int] = None
...
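Note: both new fields default to None. As the YoloFPN changes later in this diff show, the decoder then falls back to fpn_depth and to min(max_fpn_depth, csp_stack). A minimal sketch of that fallback, with illustrative values:

# Minimal sketch of the None-fallback applied later in this diff
# (self._max_fpn_depth = max_fpn_depth or self._fpn_depth, etc.).
# Values are illustrative, not taken from any shipped config.
fpn_depth = 7
csp_stack = 7
max_fpn_depth = None  # new config field
max_csp_stack = None  # new config field

effective_fpn_depth = max_fpn_depth or fpn_depth  # -> 7
effective_csp_stack = max_csp_stack or min(effective_fpn_depth, csp_stack)  # -> 7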
# --experiment_type=scaled_yolo
# mAP 47.6%
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'float32'
tpu_enable_xla_dynamic_padder: false
task:
model:
input_size: [640, 640, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'altered_cspdarknet53'
max_level: 5
min_level: 3
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp
head:
smart_bias: true
detection_generator:
box_type:
'all': scaled
scale_xy:
'all': 2.0
max_boxes: 300
nms_type: iou
iou_thresh: 0.001
nms_thresh: 0.65
loss:
use_scaled_loss: true
update_on_repeat: true
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.0
iou_normalizer:
'all': 0.05
cls_normalizer:
'all': 0.3
object_normalizer:
'5': 0.28
'4': 0.70
'3': 2.80
objectness_smooth:
'all': 1.0
norm_activation:
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 3
boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
box: [36, 75], box: [76, 55], box: [72, 146],
box: [142, 110], box: [192, 243], box: [459, 401]]
train_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
shuffle_buffer_size: 10000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.0
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 300
letter_box: true
random_flip: true
aug_rand_translate: 0.1
area_thresh: 0.1
validation_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
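Note: the YAML files in this commit are overrides for the scaled_yolo experiment registered later in this diff. A hedged sketch of applying one on top of the registered defaults; the override helper and the local file name are assumptions, not part of this commit:

# Hedged sketch: build the registered 'scaled_yolo' config and override it
# with this YAML. get_exp_config matches the factory registration shown
# later in this diff; override_params_dict and the file path are assumed.
from official.core import exp_factory
from official.modeling.hyperparams import params_dict

config = exp_factory.get_exp_config('scaled_yolo')
params_dict.override_params_dict(config, 'scaled_yolo_640.yaml', is_strict=True)
print(config.task.model.input_size)  # [640, 640, 3]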
# --experiment_type=scaled_yolo
# mAP 51.1%
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'float32'
tpu_enable_xla_dynamic_padder: false
task:
model:
input_size: [896, 896, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'csp-large'
max_level: 5
min_level: 3
width_scale: 1.00
depth_scale: 1.00
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp_large
head:
smart_bias: true
detection_generator:
box_type:
'all': scaled
scale_xy:
'all': 2.0
max_boxes: 300
nms_type: iou
iou_thresh: 0.001
nms_thresh: 0.65
loss:
use_scaled_loss: true
update_on_repeat: true
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.0
iou_normalizer:
'all': 0.05
cls_normalizer:
'all': 0.3
object_normalizer:
'5': 0.28
'4': 0.70
'3': 2.80
objectness_smooth:
'all': 1.0
norm_activation:
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 4
boxes: [box: [13, 17], box: [31, 25], box: [24, 51], box: [61, 45],
box: [48, 102], box: [119, 96], box: [97, 189], box: [217, 184],
box: [171, 384], box: [324, 451], box: [616, 618], box: [800, 800]]
train_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
shuffle_buffer_size: 10000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.2
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 300
letter_box: true
random_flip: true
aug_rand_translate: 0.1
area_thresh: 0.1
validation_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
trainer:
best_checkpoint_eval_metric: 'AP'
best_checkpoint_export_subdir: 'best_ckpt'
best_checkpoint_metric_comp: 'higher'
train_steps: 231000 # 500 epochs
optimizer_config:
learning_rate:
cosine:
decay_steps: 231000 # 500 epochs
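Note: the 231000-step figure in the trainer block above is consistent with the batch size of 256 that this PR sets in the experiment factory, assuming the usual 118,287-image COCO train split:

# Worked check of "train_steps: 231000  # 500 epochs", assuming the
# 118,287-image COCO train split used elsewhere in the Model Garden.
COCO_TRAIN_EXAMPLES = 118287
train_batch_size = 256  # set by this PR in the experiment factory
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size  # 462
assert steps_per_epoch * 500 == 231000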
# --experiment_type=scaled_yolo
# mAP 54%
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'float32'
tpu_enable_xla_dynamic_padder: false
task:
model:
input_size: [1280, 1280, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'csp-large'
max_level: 6
min_level: 3
width_scale: 1.00
depth_scale: 1.00
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp_large
head:
smart_bias: true
detection_generator:
box_type:
'all': scaled
scale_xy:
'all': 2.0
max_boxes: 300
nms_type: iou
iou_thresh: 0.001
nms_thresh: 0.65
loss:
use_scaled_loss: true
update_on_repeat: true
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.0
iou_normalizer:
'all': 0.05
cls_normalizer:
'all': 0.5
object_normalizer:
'6': 0.07
'5': 0.29
'4': 0.7
'3': 2.8
objectness_smooth:
'all': 1.0
norm_activation:
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 4
boxes: [box: [13, 17], box: [31, 25], box: [24, 51], box: [61, 45],
box: [61, 45], box: [48, 102], box: [119, 96], box: [97, 189],
box: [97, 189], box: [217, 184], box: [171, 384], box: [324, 451],
box: [324, 451], box: [545, 357], box: [616, 618], box: [1024, 1024]]
train_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
shuffle_buffer_size: 10000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.2
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 300
letter_box: true
random_flip: true
aug_rand_translate: 0.1
area_thresh: 0.1
validation_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
trainer:
best_checkpoint_eval_metric: 'AP'
best_checkpoint_export_subdir: 'best_ckpt'
best_checkpoint_metric_comp: 'higher'
train_steps: 231000 # 500 epochs
optimizer_config:
learning_rate:
cosine:
decay_steps: 231000 # 500 epochs
# --experiment_type=scaled_yolo
# mAP 54.7%
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'float32'
tpu_enable_xla_dynamic_padder: false
task:
model:
input_size: [1536, 1536, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'csp-large'
max_level: 7
min_level: 3
width_scale: 1.25
depth_scale: 1.00
decoder:
type: yolo_decoder
yolo_decoder:
version: v4
type: csp_large
head:
smart_bias: true
detection_generator:
box_type:
'all': scaled
scale_xy:
'all': 2.0
max_boxes: 300
nms_type: iou
iou_thresh: 0.001
nms_thresh: 0.65
loss:
use_scaled_loss: true
update_on_repeat: true
box_loss_type:
'all': ciou
ignore_thresh:
'all': 0.0
iou_normalizer:
'all': 0.05
cls_normalizer:
'all': 0.5
object_normalizer:
'7': 0.07
'6': 0.22
'5': 0.35
'4': 0.7
'3': 2.8
objectness_smooth:
'all': 1.0
norm_activation:
use_sync_bn: true
num_classes: 80
anchor_boxes:
anchors_per_scale: 4
boxes: [box: [13, 17], box: [22, 25], box: [55, 41], box: [27, 66],
box: [57, 88], box: [112, 69], box: [69, 177], box: [136, 138],
box: [136, 138], box: [287, 114], box: [134, 275], box: [268, 248],
box: [268, 248], box: [232, 504], box: [445, 416], box: [640, 640],
box: [812, 393], box: [477, 808], box: [1070, 908], box: [1408, 1408]]
train_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
shuffle_buffer_size: 10000
parser:
mosaic:
mosaic_frequency: 1.0
mixup_frequency: 0.2
mosaic_crop_mode: 'scale'
mosaic_center: 0.25
aug_scale_min: 0.1
aug_scale_max: 1.9
max_num_instances: 300
letter_box: true
random_flip: true
aug_rand_translate: 0.1
area_thresh: 0.1
validation_data:
input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
trainer:
best_checkpoint_eval_metric: 'AP'
best_checkpoint_export_subdir: 'best_ckpt'
best_checkpoint_metric_comp: 'higher'
train_steps: 231000 # 500 epochs
optimizer_config:
learning_rate:
cosine:
decay_steps: 231000 # 500 epochs
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float32'
task:
model:
num_classes: 1001
input_size: [256, 256, 3]
backbone:
type: 'darknet'
darknet:
model_id: 'cspdarknet53'
norm_activation:
activation: 'mish'
losses:
l2_weight_decay: 0.0005
one_hot: true
label_smoothing: 0.1
train_data:
input_path: 'imagenet-2012-tfrecord/train*'
is_training: true
global_batch_size: 128
dtype: 'float16'
validation_data:
input_path: 'imagenet-2012-tfrecord/valid*'
is_training: true
global_batch_size: 128
dtype: 'float16'
drop_remainder: false
trainer:
train_steps: 1200000 # epochs: 120
validation_steps: 400 # size of validation data
validation_interval: 10000
steps_per_loop: 10000
summary_interval: 10000
checkpoint_interval: 10000
optimizer_config:
optimizer:
type: 'sgd'
sgd:
momentum: 0.9
learning_rate:
type: 'polynomial'
polynomial:
initial_learning_rate: 0.1
end_learning_rate: 0.0001
power: 4.0
decay_steps: 1200000
warmup:
type: 'linear'
linear:
warmup_steps: 1000 # learning rate rises from 0 to 0.1 over 1000 steps
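Note: the classification schedule above is plain polynomial decay with a linear warmup. A hedged sketch using stock Keras pieces; the Model Garden actually wires this up through its own optimizer factory, and the hand-rolled warmup below is only for illustration:

# Hedged sketch of the schedule above: polynomial decay 0.1 -> 0.0001
# (power 4.0) over 1.2M steps, with a 1000-step linear warmup.
import tensorflow as tf

decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.1,
    end_learning_rate=0.0001,
    decay_steps=1200000,
    power=4.0)

def learning_rate(step):
  step = tf.cast(step, tf.float32)
  warmup = 0.1 * step / 1000.0  # rises linearly from 0 to 0.1
  return tf.where(step < 1000.0, warmup, decay(step))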
...
@@ -268,7 +268,7 @@ def yolo() -> cfg.ExperimentConfig:
@exp_factory.register_config_factory('yolo_darknet')
def yolo_darknet() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv3 and v4."""
-  train_batch_size = 64
+  train_batch_size = 256
  eval_batch_size = 8
  train_epochs = 300
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
...
@@ -389,7 +389,7 @@ def yolo_darknet() -> cfg.ExperimentConfig:
@exp_factory.register_config_factory('scaled_yolo')
def scaled_yolo() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv4-csp and v4."""
-  train_batch_size = 64
+  train_batch_size = 256
  eval_batch_size = 8
  train_epochs = 300
  warmup_epochs = 3
...
@@ -411,7 +411,7 @@ def scaled_yolo() -> cfg.ExperimentConfig:
          norm_activation=common.NormActivation(
              activation='mish',
              use_sync_bn=True,
-              norm_epsilon=0.0001,
+              norm_epsilon=0.001,
              norm_momentum=0.97),
          head=YoloHead(smart_bias=True),
          loss=YoloLoss(use_scaled_loss=True),
...
@@ -469,7 +469,7 @@ def scaled_yolo() -> cfg.ExperimentConfig:
          validation_interval=validation_interval * steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
-          checkpoint_interval=steps_per_epoch,
+          checkpoint_interval=5 * steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'ema': {
                  'average_decay': 0.9999,
...
@@ -483,7 +483,7 @@ def scaled_yolo() -> cfg.ExperimentConfig:
                  'momentum_start': 0.8,
                  'nesterov': True,
                  'warmup_steps': steps_per_epoch * warmup_epochs,
-                  'weight_decay': 0.0005 * train_batch_size / 64.0,
+                  'weight_decay': 0.0005,
              }
          },
          'learning_rate': {
...
...
@@ -237,14 +237,14 @@ class Parser(parser.Parser):
          affine=affine,
          shuffle_boxes=False,
          area_thresh=self._area_thresh,
-          augment=True,
+          filter_and_clip_boxes=True,
          seed=self._seed)
      classes = tf.gather(classes, inds)
      info = infos[-1]
    else:
      image = tf.image.resize(
          image, (self._image_h, self._image_w), method='nearest')
-      output_size = tf.cast([640, 640], tf.float32)
+      output_size = tf.cast([self._image_h, self._image_w], tf.float32)
      boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
      inds = bbox_ops.get_non_empty_box_indices(boxes_)
      boxes = tf.gather(boxes, inds)
...
@@ -286,7 +286,8 @@ class Parser(parser.Parser):
      # Clip and clean boxes.
      image = image / 255.0
      boxes, inds = preprocessing_ops.transform_and_clip_boxes(
-          boxes, infos, shuffle_boxes=False, area_thresh=0.0, augment=True)
+          boxes, infos, shuffle_boxes=False, area_thresh=0.0,
+          filter_and_clip_boxes=False)
      classes = tf.gather(classes, inds)
      info = infos[-1]
...
@@ -342,17 +343,26 @@ class Parser(parser.Parser):
    # Update the labels dictionary.
    if not is_training:
      # Sets up groundtruth data for evaluation.
      groundtruths = {
-          'source_id': labels['source_id'],
-          'height': height,
-          'width': width,
-          'num_detections': tf.shape(gt_boxes)[0],
-          'image_info': info,
-          'boxes': gt_boxes,
-          'classes': gt_classes,
-          'areas': tf.gather(data['groundtruth_area'], inds),
+          'source_id':
+              labels['source_id'],
+          'height':
+              data['height'],
+          'width':
+              data['width'],
+          'num_detections':
+              tf.shape(data['groundtruth_boxes'])[0],
+          'image_info':
+              info,
+          'boxes':
+              bbox_ops.denormalize_boxes(
+                  data['groundtruth_boxes'],
+                  tf.cast([data['height'], data['width']], gt_boxes.dtype)),
+          'classes':
+              data['groundtruth_classes'],
+          'areas':
+              data['groundtruth_area'],
          'is_crowds':
              tf.cast(tf.gather(data['groundtruth_is_crowd'], inds), tf.int32),
      }
...
...
@@ -225,7 +225,7 @@ LARGECSP53 = {
            False
        ],
        [
-            'DarkRes', 'csp', 1, True, 64, None, None, None, None, 'mish', -1,
+            'DarkRes', 'csp', 1, False, 64, None, None, None, None, 'mish', -1,
            1, 1, False
        ],
        [
...
...
@@ -50,8 +50,18 @@ YOLO_MODELS = {
            max_level_process_len=None,
            csp_stack=7,
            fpn_depth=7,
+            max_fpn_depth=5,
+            max_csp_stack=5,
            path_process_len=8,
-            fpn_filter_scale=2),
+            fpn_filter_scale=1),
+        csp_xlarge=dict(
+            embed_spp=False,
+            use_fpn=True,
+            max_level_process_len=None,
+            csp_stack=7,
+            fpn_depth=7,
+            path_process_len=8,
+            fpn_filter_scale=1),
    ),
    'v3':
        dict(
...
@@ -87,6 +97,8 @@ class YoloFPN(tf.keras.layers.Layer):
  def __init__(self,
               fpn_depth=4,
+               max_fpn_depth=None,
+               max_csp_stack=None,
               use_spatial_attention=False,
               csp_stack=False,
               activation='leaky',
...
@@ -104,6 +116,10 @@ class YoloFPN(tf.keras.layers.Layer):
    Args:
      fpn_depth: `int`, number of layers to use in each FPN path
        if you choose to use an FPN.
+      max_fpn_depth: `int`, number of layers to use in each FPN path
+        if you choose to use an FPN along the largest FPN level.
+      max_csp_stack: `int`, number of layers to use for CSP on the largest_path
+        only.
      use_spatial_attention: `bool`, use the spatial attention module.
      csp_stack: `bool`, CSPize the FPN.
      activation: `str`, the activation function to use, typically leaky or mish.
...
@@ -121,6 +137,7 @@ class YoloFPN(tf.keras.layers.Layer):
    super().__init__(**kwargs)
    self._fpn_depth = fpn_depth
+    self._max_fpn_depth = max_fpn_depth or self._fpn_depth
    self._activation = activation
    self._use_sync_bn = use_sync_bn
...
@@ -133,6 +150,7 @@ class YoloFPN(tf.keras.layers.Layer):
    self._use_spatial_attention = use_spatial_attention
    self._filter_scale = fpn_filter_scale
    self._csp_stack = csp_stack
+    self._max_csp_stack = max_csp_stack or min(self._max_fpn_depth, csp_stack)

    self._base_config = dict(
        activation=self._activation,
...
@@ -184,6 +202,7 @@ class YoloFPN(tf.keras.layers.Layer):
    for level, depth in zip(
        reversed(range(self._min_level, self._max_level + 1)), self._depths):
+
      if level == self._min_level:
        self.resamples[str(level)] = nn_blocks.PathAggregationBlock(
            filters=depth // 2,
...
@@ -211,10 +230,10 @@ class YoloFPN(tf.keras.layers.Layer):
      else:
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=depth,
-            repetitions=self._fpn_depth + 1 * int(self._csp_stack == 0),
+            repetitions=self._max_fpn_depth + 1 * int(self._csp_stack == 0),
            insert_spp=True,
            block_invert=False,
-            csp_stack=self._csp_stack,
+            csp_stack=min(self._csp_stack, self._max_fpn_depth),
            **self._base_config)

  def call(self, inputs):
...
@@ -349,13 +368,16 @@ class YoloPAN(tf.keras.layers.Layer):
      downsample = False
      upsample = True

-    if self._csp_stack == 0:
-      proc_filters = lambda x: x
-      resample_filters = lambda x: x // 2
-    else:
-      proc_filters = lambda x: x * 2
-      resample_filters = lambda x: x
    for level, depth in zip(self._iterator, self._depths):
+      if level > 5:
+        proc_filters = lambda x: x * 2
+        resample_filters = lambda x: x
+      elif self._csp_stack == 0:
+        proc_filters = lambda x: x
+        resample_filters = lambda x: x // 2
+      else:
+        proc_filters = lambda x: x * 2
+        resample_filters = lambda x: x
      if level == self._input:
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=proc_filters(depth),
...
@@ -396,7 +418,7 @@ class YoloPAN(tf.keras.layers.Layer):
    depths = []
    if len(inputs.keys()) > 3 or self._fpn_filter_scale > 1:
      for i in range(self._min_level, self._max_level + 1):
-        depths.append(inputs[str(i)][-1] * 2)
+        depths.append(inputs[str(i)][-1])
    else:
      for _ in range(self._min_level, self._max_level + 1):
        depths.append(minimum_depth)
...
@@ -429,6 +451,8 @@ class YoloDecoder(tf.keras.Model):
               use_spatial_attention=False,
               csp_stack=False,
               fpn_depth=4,
+               max_fpn_depth=None,
+               max_csp_stack=None,
               fpn_filter_scale=1,
               path_process_len=6,
               max_level_process_len=None,
...
@@ -455,6 +479,8 @@ class YoloDecoder(tf.keras.Model):
      csp_stack: `bool`, CSPize the FPN.
      fpn_depth: `int`, number of layers to use in each FPN path if you choose
        to use an FPN.
+      max_fpn_depth: `int`, maximum fpn depth.
+      max_csp_stack: `int`, maximum csp stack.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
      path_process_len: `int`, number of layers to use in each Decoder path.
      max_level_process_len: `int`, number of layers to use in the largest
...
@@ -475,6 +501,8 @@ class YoloDecoder(tf.keras.Model):
    self._input_specs = input_specs
    self._use_fpn = use_fpn
    self._fpn_depth = fpn_depth
+    self._max_fpn_depth = max_fpn_depth
+    self._max_csp_stack = max_csp_stack
    self._path_process_len = path_process_len
    self._max_level_process_len = max_level_process_len
    self._embed_spp = embed_spp
...
@@ -514,8 +542,10 @@ class YoloDecoder(tf.keras.Model):
    }
    if self._use_fpn:
      inter_outs = YoloFPN(
-          fpn_depth=self._fpn_depth, **self._base_config)(
-              inputs)
+          fpn_depth=self._fpn_depth,
+          max_fpn_depth=self._max_fpn_depth,
+          max_csp_stack=self._max_csp_stack,
+          **self._base_config)(inputs)
      outputs = YoloPAN(**self._decoder_config)(inter_outs)
    else:
      inter_outs = None
...
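Note: taken together with the YOLO_MODELS entry above, the decoder now threads the two caps through to YoloFPN. A hedged construction sketch; the input specs and exact argument set are illustrative, not copied from a test:

# Hedged sketch: constructing the decoder with the new caps, mirroring the
# 'csp_large' entry above. Input specs are illustrative placeholders.
decoder = YoloDecoder(
    input_specs={'3': [None, 80, 80, 256],
                 '4': [None, 40, 40, 512],
                 '5': [None, 20, 20, 1024]},
    use_fpn=True,
    csp_stack=7,
    fpn_depth=7,
    max_fpn_depth=5,
    max_csp_stack=5,
    path_process_len=8,
    fpn_filter_scale=1,
    activation='mish')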
...
@@ -179,7 +179,7 @@ class Mosaic:
          infos,
          area_thresh=self._area_thresh,
          shuffle_boxes=False,
-          augment=True,
+          filter_and_clip_boxes=True,
          seed=self._seed)
      classes, is_crowd, area = self._select_ind(inds, classes, is_crowd, area)  # pylint:disable=unbalanced-tuple-unpacking
      return image, boxes, classes, is_crowd, area, crop_points
...
...
@@ -482,11 +482,15 @@ def resize_and_jitter_image(image,
      image_ = tf.pad(
          cropped_image, [[pad[0], pad[2]], [pad[1], pad[3]], [0, 0]],
          constant_values=PAD_VALUE)

+      # Pad and scale info
+      isize = tf.cast(tf.shape(image_)[:2], dtype=tf.float32)
+      osize = tf.cast((desired_size[0], desired_size[1]), dtype=tf.float32)
      pad_info = tf.stack([
          tf.cast(tf.shape(cropped_image)[:2], tf.float32),
-          tf.cast(tf.shape(image_)[:2], dtype=tf.float32),
-          tf.ones_like(original_dims, dtype=tf.float32),
-          (-tf.cast(pad[:2], tf.float32))
+          osize,
+          osize/isize,
+          (-tf.cast(pad[:2], tf.float32)*osize/isize)
      ])
      infos.append(pad_info)
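Note: the corrected record now carries the true resize scale and a scale-adjusted pad offset rather than ones and the raw pad. A worked example with assumed sizes (a 416x416 crop padded by 48px per side to 512x512, then resized to a 640x640 desired size):

# Worked example of the corrected pad_info record, with assumed sizes.
# Rows: [pre-pad hw, output hw, scale, offset].
import tensorflow as tf

crop_hw = tf.constant([416., 416.])  # cropped_image shape
isize = tf.constant([512., 512.])    # padded shape, tf.shape(image_)[:2]
osize = tf.constant([640., 640.])    # desired_size
pad = tf.constant([48., 48.])        # top/left padding

pad_info = tf.stack([
    crop_hw,
    osize,
    osize / isize,           # [1.25, 1.25], the resize scale now recorded
    -pad * (osize / isize),  # [-60., -60.], pad offset in output pixels
])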
...
@@ -761,7 +765,9 @@ def boxes_candidates(clipped_boxes,
  Returns:
    indices[:, 0]: A `Tensor` representing valid boxes after filtering.
  """
+  if area_thr == 0.0:
+    wh_thr = 0
+    ar_thr = np.inf
  area_thr = tf.math.abs(area_thr)

  # Get the scaled and shifted heights of the original
...
@@ -778,8 +784,8 @@ def boxes_candidates(clipped_boxes,
      clipped_height / (clipped_width + 1e-16))

  # Ensure the clipped width and height are larger than a preset threshold.
-  conda = clipped_width > wh_thr
-  condb = clipped_height > wh_thr
+  conda = clipped_width >= wh_thr
+  condb = clipped_height >= wh_thr

  # Ensure the area of the clipped box is larger than the area threshold.
  area = (clipped_height * clipped_width) / (og_width * og_height + 1e-16)
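Note: in plain terms, a clipped box survives the filter if its width and height meet the threshold (now inclusive, >=), its aspect ratio stays under the cap, and it keeps enough of its pre-clip area. A scalar sketch; the wh_thr/ar_thr defaults here are assumptions for illustration:

# Scalar sketch of the boxes_candidates filter logic; default thresholds
# are assumed, not copied from the library.
import numpy as np

def keep_box(og_wh, clipped_wh, wh_thr=2.0, ar_thr=20.0, area_thr=0.1):
  if area_thr == 0.0:  # mirrors the new early-out in this hunk
    wh_thr, ar_thr = 0.0, np.inf
  area_thr = abs(area_thr)
  w, h = clipped_wh
  aspect = max(w / (h + 1e-16), h / (w + 1e-16))
  area = (w * h) / (og_wh[0] * og_wh[1] + 1e-16)
  return w >= wh_thr and h >= wh_thr and aspect < ar_thr and area > area_thr

keep_box((100, 80), (40, 30))   # True: big enough and keeps enough area
keep_box((100, 80), (1.0, 30))  # False: width below the 2-pixel threshold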
...
@@ -837,7 +843,7 @@ def transform_and_clip_boxes(boxes,
                             shuffle_boxes=False,
                             area_thresh=0.1,
                             seed=None,
-                             augment=True):
+                             filter_and_clip_boxes=True):
  """Clips and cleans the boxes.

  Args:
...
@@ -847,7 +853,8 @@ def transform_and_clip_boxes(boxes,
    shuffle_boxes: A `bool` for shuffling the boxes.
    area_thresh: An `int` for the area threshold.
    seed: seed for random number generation.
-    augment: A `bool` for clipping the boxes to [0, 1].
+    filter_and_clip_boxes: A `bool` for filtering and clipping the boxes to
+      [0, 1].

  Returns:
    boxes: A `Tensor` representing the augmented boxes.
...
@@ -868,8 +875,8 @@ def transform_and_clip_boxes(boxes,
  # Make sure all boxes are valid to start, clip to [0, 1] and get only the
  # valid boxes.
-  output_size = tf.cast([640, 640], tf.float32)
-  if augment:
+  output_size = None
+  if filter_and_clip_boxes:
    boxes = tf.math.maximum(tf.math.minimum(boxes, 1.0), 0.0)
    cond = get_valid_boxes(boxes)
...
@@ -918,16 +925,18 @@ def transform_and_clip_boxes(boxes,
    boxes *= tf.cast(tf.expand_dims(cond, axis=-1), boxes.dtype)

  # Threshold the existing boxes.
-  if augment:
-    boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
-    box_history_ = bbox_ops.denormalize_boxes(box_history, output_size)
-    inds = boxes_candidates(boxes_, box_history_, area_thr=area_thresh)
+  if filter_and_clip_boxes:
+    if output_size is not None:
+      boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
+      box_history_ = bbox_ops.denormalize_boxes(box_history, output_size)
+      inds = boxes_candidates(boxes_, box_history_, area_thr=area_thresh)
+    else:
+      inds = boxes_candidates(
+          boxes, box_history, wh_thr=0.0, area_thr=area_thresh)
    # Select and gather the good boxes.
    if shuffle_boxes:
      inds = tf.random.shuffle(inds, seed=seed)
  else:
-    boxes = box_history
-    boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
-    inds = bbox_ops.get_non_empty_box_indices(boxes_)
+    inds = bbox_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, inds)
  return boxes, inds
...
@@ -302,6 +302,7 @@ class SGDTorch(tf.keras.optimizers.Optimizer):
        "decay": self._initial_decay,
        "momentum": self._serialize_hyperparameter("momentum"),
        "momentum_start": self._serialize_hyperparameter("momentum_start"),
+        "weight_decay": self._serialize_hyperparameter("weight_decay"),
        "warmup_steps": self._serialize_hyperparameter("warmup_steps"),
        "nesterov": self.nesterov,
    })
...
...
@@ -255,16 +255,22 @@ class YoloTask(base_task.Task):
      logs.update({m.name: m.result()})
    return logs

-  def _reorg_boxes(self, boxes, num_detections, image):
+  def _reorg_boxes(self, boxes, info, num_detections):
    """Scale and Clean boxes prior to Evaluation."""
-    # Build a prediciton mask to take only the number of detections
    mask = tf.sequence_mask(num_detections, maxlen=tf.shape(boxes)[1])
    mask = tf.cast(tf.expand_dims(mask, axis=-1), boxes.dtype)

    # Denormalize the boxes by the shape of the image
-    inshape = tf.cast(preprocessing_ops.get_image_shape(image), boxes.dtype)
+    inshape = tf.expand_dims(info[:, 1, :], axis=1)
+    ogshape = tf.expand_dims(info[:, 0, :], axis=1)
+    scale = tf.expand_dims(info[:, 2, :], axis=1)
+    offset = tf.expand_dims(info[:, 3, :], axis=1)

    boxes = box_ops.denormalize_boxes(boxes, inshape)
-    boxes = box_ops.clip_boxes(boxes, inshape)
+    boxes += tf.tile(offset, [1, 1, 2])
+    boxes /= tf.tile(scale, [1, 1, 2])
+    boxes = box_ops.clip_boxes(boxes, ogshape)

    # Mask the boxes for usage
    boxes *= mask
...
@@ -292,10 +298,8 @@ class YoloTask(base_task.Task):
      logs = {self.loss: metric_loss}

      # Reorganize and rescale the boxes
-      boxes = self._reorg_boxes(y_pred['bbox'], y_pred['num_detections'], image)
-      label['groundtruths']['boxes'] = self._reorg_boxes(
-          label['groundtruths']['boxes'], label['groundtruths']['num_detections'],
-          image)
+      info = label['groundtruths']['image_info']
+      boxes = self._reorg_boxes(y_pred['bbox'], info, y_pred['num_detections'])

      # Build the input for the coco evaluation metric
      coco_model_outputs = {
...
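Note: the rewritten _reorg_boxes undoes the letterbox transform recorded in image_info instead of only denormalizing by the network input size. A worked example, assuming the [original hw, input hw, scale, offset] row layout produced by the parser changes above:

# Worked example of the new rescaling path: a 480x640 image letterboxed
# into a 640x640 network input (scale 1.0, 80px top padding -> offset [-80, 0]).
import tensorflow as tf

info = tf.constant([[[480., 640.], [640., 640.], [1., 1.], [-80., 0.]]])
boxes = tf.constant([[[0.25, 0.25, 0.75, 0.75]]])  # normalized [ymin, xmin, ymax, xmax]

inshape = tf.expand_dims(info[:, 1, :], axis=1)  # network input size
ogshape = tf.expand_dims(info[:, 0, :], axis=1)  # original image size
scale = tf.expand_dims(info[:, 2, :], axis=1)
offset = tf.expand_dims(info[:, 3, :], axis=1)

boxes = boxes * tf.tile(inshape, [1, 1, 2])  # denormalize_boxes equivalent
boxes += tf.tile(offset, [1, 1, 2])          # remove letterbox padding
boxes /= tf.tile(scale, [1, 1, 2])           # undo the resize scale
# -> [[[80., 160., 400., 480.]]], i.e. back in original-image pixels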