Commit ff676f0b authored by Vishnu Banna

large models

parent cf3bb8cf
# --experiment_type=large_yolo_finetune
# mAP 51.1%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
task:
  model:
    input_size: [896, 896, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'csp-large'
        max_level: 5
        min_level: 3
        width_scale: 1.00
        depth_scale: 1.00
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp_large
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.65
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.5
      object_normalizer:
        '5': 0.4
        '4': 1.0
        '3': 4.0
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 4
      boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45],
              box: [48,102], box: [119,96], box: [97,189], box: [217,184],
              box: [171,384], box: [324,451], box: [616,618], box: [800,800]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.2
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.0
        aug_scale_min: 0.2
        aug_scale_max: 1.8
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.5
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
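
These YAML files are overrides for the experiments registered in the config factory further down in this commit; the name in each header comment is the registered experiment. A minimal sketch of applying one override, assuming the standard Model Garden exp_factory and hyperparams helpers (the YAML path here is hypothetical):

from official.core import exp_factory
from official.modeling import hyperparams

# Build the registered 'large_yolo_finetune' experiment, then overlay the
# YAML file above on top of its defaults.
config = exp_factory.get_exp_config('large_yolo_finetune')
config = hyperparams.override_params_dict(
    config, 'experiments/large_896_finetune.yaml', is_strict=True)
config.validate()
print(config.task.model.input_size)  # [896, 896, 3]
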
# --experiment_type=large_yolo_finetune
# mAP 54.4%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
task:
  model:
    input_size: [1280, 1280, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'csp-large'
        max_level: 6
        min_level: 3
        width_scale: 1.00
        depth_scale: 1.00
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp_large
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.65
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.5
      object_normalizer:
        '6': 0.1
        '5': 0.4
        '4': 1.0
        '3': 4.0
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 4
      boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45],
              box: [61,45], box: [48,102], box: [119,96], box: [97,189],
              box: [97,189], box: [217,184], box: [171,384], box: [324,451],
              box: [324,451], box: [545,357], box: [616,618], box: [1024,1024]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.2
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.0
        aug_scale_min: 0.2
        aug_scale_max: 1.8
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.5
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
# --experiment_type=large_yolo
# mAP 55.3%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
task:
  model:
    input_size: [1536, 1536, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'csp-large'
        max_level: 7
        min_level: 3
        width_scale: 1.25
        depth_scale: 1.00
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp_large
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.65
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.5
      object_normalizer:
        '7': 0.1
        '6': 0.4
        '5': 0.5
        '4': 1.0
        '3': 4.0
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 4
      boxes: [box: [13,17], box: [22,25], box: [55,41], box: [27,66],
              box: [57,88], box: [112,69], box: [69,177], box: [136,138],
              box: [136,138], box: [287,114], box: [134,275], box: [268,248],
              box: [268,248], box: [232,504], box: [445,416], box: [640,640],
              box: [812,393], box: [477,808], box: [1070,908], box: [1408,1408]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.2
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.0
        aug_scale_min: 0.2
        aug_scale_max: 1.8
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.5
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
# --experiment_type=scaled_yolo
# mAP 47.6%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
task:
  model:
    input_size: [640, 640, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'altered_cspdarknet53'
        max_level: 5
        min_level: 3
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.65
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.3
      object_normalizer:
        '5': 0.28
        '4': 0.70
        '3': 2.80
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 3
      boxes: [box: [12, 16], box: [19, 36], box: [40, 28],
              box: [36, 75], box: [76, 55], box: [72, 146],
              box: [142, 110], box: [192, 243], box: [459, 401]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.0
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.25
        aug_scale_min: 0.1
        aug_scale_max: 1.9
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.1
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
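
For reference, with anchors_per_scale: 3 and levels 3 through 5, the nine boxes above are consumed in groups of three per level, with the smallest anchors assigned to the finest feature map (the usual YOLO convention, assumed here rather than stated in the file); a quick sketch of that grouping:

# Sketch of how a flat anchor list maps to FPN levels, assuming the usual
# YOLO ordering (smallest anchors go to the highest-resolution level).
anchors = [[12, 16], [19, 36], [40, 28],
           [36, 75], [76, 55], [72, 146],
           [142, 110], [192, 243], [459, 401]]
anchors_per_scale = 3
min_level, max_level = 3, 5

per_level = {
    str(level): anchors[i * anchors_per_scale:(i + 1) * anchors_per_scale]
    for i, level in enumerate(range(min_level, max_level + 1))
}
print(per_level['3'])  # [[12, 16], [19, 36], [40, 28]]
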
# --experiment_type=large_yolo
# mAP 50.5%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
runtime:
  distribution_strategy: 'mirrored'
  mixed_precision_dtype: 'float16'
  tpu_enable_xla_dynamic_padder: false
  num_gpus: 1
task:
  model:
    input_size: [896, 896, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'csp-large'
        max_level: 5
        min_level: 3
        width_scale: 1.00
        depth_scale: 1.00
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp_large
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.65
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.5
      object_normalizer:
        '5': 0.4
        '4': 1.0
        '3': 4.0
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 4
      boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45],
              box: [48,102], box: [119,96], box: [97,189], box: [217,184],
              box: [171,384], box: [324,451], box: [616,618], box: [800,800]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.0
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.0
        aug_scale_min: 0.5
        aug_scale_max: 1.5
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.5
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
# --experiment_type=large_yolo
# mAP 53.4%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
task:
  model:
    input_size: [1280, 1280, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'csp-large'
        max_level: 6
        min_level: 3
        width_scale: 1.00
        depth_scale: 1.00
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp_large
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.65
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.5
      object_normalizer:
        '6': 0.1
        '5': 0.4
        '4': 1.0
        '3': 4.0
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 4
      boxes: [box: [13,17], box: [31,25], box: [24,51], box: [61,45],
              box: [61,45], box: [48,102], box: [119,96], box: [97,189],
              box: [97,189], box: [217,184], box: [171,384], box: [324,451],
              box: [324,451], box: [545,357], box: [616,618], box: [1024,1024]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.0
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.0
        aug_scale_min: 0.5
        aug_scale_max: 1.5
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.5
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
# --experiment_type=large_yolo
# mAP 54.6%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'float32'
  tpu_enable_xla_dynamic_padder: false
task:
  model:
    input_size: [1536, 1536, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'csp-large'
        max_level: 7
        min_level: 3
        width_scale: 1.25
        depth_scale: 1.00
    decoder:
      type: yolo_decoder
      yolo_decoder:
        version: v4
        type: csp_large
    head:
      smart_bias: true
    detection_generator:
      box_type:
        'all': scaled
      scale_xy:
        'all': 2.0
      max_boxes: 300
      nms_type: iou
      iou_thresh: 0.001
      nms_thresh: 0.65
    loss:
      use_scaled_loss: true
      update_on_repeat: true
      box_loss_type:
        'all': ciou
      ignore_thresh:
        'all': 0.0
      iou_normalizer:
        'all': 0.05
      cls_normalizer:
        'all': 0.5
      object_normalizer:
        '7': 0.1
        '6': 0.4
        '5': 0.5
        '4': 1.0
        '3': 4.0
      objectness_smooth:
        'all': 1.0
    norm_activation:
      use_sync_bn: true
    num_classes: 80
    anchor_boxes:
      anchors_per_scale: 4
      boxes: [box: [13,17], box: [22,25], box: [55,41], box: [27,66],
              box: [57,88], box: [112,69], box: [69,177], box: [136,138],
              box: [136,138], box: [287,114], box: [134,275], box: [268,248],
              box: [268,248], box: [232,504], box: [445,416], box: [640,640],
              box: [812,393], box: [477,808], box: [1070,908], box: [1408,1408]]
  train_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/train*'
    shuffle_buffer_size: 10000
    parser:
      mosaic:
        mosaic_frequency: 1.0
        mixup_frequency: 0.0
        mosaic_crop_mode: 'scale'
        mosaic_center: 0.0
        aug_scale_min: 0.5
        aug_scale_max: 1.5
      max_num_instances: 300
      letter_box: true
      random_flip: true
      aug_rand_translate: 0.5
      area_thresh: 0.1
  validation_data:
    input_path: '/readahead/200M/placer/prod/home/tensorflow-performance-data/datasets/coco/val*'
runtime:
  distribution_strategy: 'mirrored'
  mixed_precision_dtype: 'float32'
task:
  model:
    num_classes: 1001
    input_size: [256, 256, 3]
    backbone:
      type: 'darknet'
      darknet:
        model_id: 'cspdarknet53'
    norm_activation:
      activation: 'mish'
  losses:
    l2_weight_decay: 0.0005
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 128
    dtype: 'float16'
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: true
    global_batch_size: 128
    dtype: 'float16'
    drop_remainder: false
trainer:
  train_steps: 1200000  # epochs: 120
  validation_steps: 400  # size of validation data
  validation_interval: 10000
  steps_per_loop: 10000
  summary_interval: 10000
  checkpoint_interval: 10000
  optimizer_config:
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'polynomial'
      polynomial:
        initial_learning_rate: 0.1
        end_learning_rate: 0.0001
        power: 4.0
        decay_steps: 1200000
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1000  # learning rate rises from 0 to 0.1 over 1000 steps
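
A quick sanity check of the step counts above (a sketch, assuming the standard ImageNet-2012 train split of 1,281,167 images): at a global batch size of 128 an epoch is about 10,009 steps, so 1,200,000 train and decay steps is roughly the 120 epochs noted in the inline comment:

# Worked arithmetic for the classification trainer schedule above.
IMAGENET_TRAIN_EXAMPLES = 1281167  # assumed ImageNet-2012 train split size
global_batch_size = 128

steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // global_batch_size
print(steps_per_epoch)            # 10009
print(1200000 / steps_per_epoch)  # ~119.9 epochs
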
@@ -508,3 +508,226 @@ def scaled_yolo() -> cfg.ExperimentConfig:
      ])
  return config


@exp_factory.register_config_factory('large_yolo')
def large_yolo() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv4-csp and v4."""
  train_batch_size = 64
  eval_batch_size = 8
  train_epochs = 300
  fine_tune_epochs = 450
  warmup_epochs = 3
  validation_interval = 5
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  max_num_instances = 300

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=YoloTask(
          smart_bias_lr=0.1,
          init_checkpoint_modules='',
          annotation_file=None,
          weight_decay=0.0,
          model=Yolo(
              darknet_based_model=False,
              norm_activation=common.NormActivation(
                  activation='mish',
                  use_sync_bn=True,
                  norm_epsilon=0.0001,
                  norm_momentum=0.97),
              head=YoloHead(smart_bias=True),
              loss=YoloLoss(use_scaled_loss=True)),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              dtype='float32',
              parser=Parser(
                  aug_rand_saturation=0.7,
                  aug_rand_brightness=0.4,
                  aug_rand_hue=0.015,
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  random_pad=False,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
                  mosaic=Mosaic(
                      mosaic_crop_mode='scale',
                      mosaic_frequency=1.0,
                      mixup_frequency=0.0,
                  ))),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=True,
              dtype='float32',
              parser=Parser(
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
              ))),
      trainer=cfg.TrainerConfig(
          train_steps=train_epochs * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=validation_interval * steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'ema': {
                  'average_decay': 0.9999,
                  'trainable_weights_only': False,
                  'dynamic_decay': True,
              },
              'optimizer': {
                  'type': 'sgd_torch',
                  'sgd_torch': {
                      'momentum': 0.937,
                      'momentum_start': 0.9,
                      'nesterov': True,
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      'weight_decay': 0.0005 * train_batch_size / 64.0,
                  }
              },
              'learning_rate': {
                  'type': 'cosine',
                  'cosine': {
                      'initial_learning_rate': 0.01,
                      'alpha': 0.2,
                      'decay_steps': fine_tune_epochs * steps_per_epoch,
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
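
One detail worth calling out: both factories set the cosine decay_steps from fine_tune_epochs (450) rather than train_epochs (300), presumably so a fine-tune run continues along the same learning-rate curve the base run stopped on. A small sketch of that schedule, assuming standard Keras CosineDecay semantics with an alpha floor (COCO_TRAIN_EXAMPLES is 118,287 in the Model Garden):

import math

# Sketch of the cosine-with-floor schedule configured above (not library code).
def cosine_lr(step, decay_steps, initial_lr=0.01, alpha=0.2):
  step = min(step, decay_steps)
  cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
  return initial_lr * ((1.0 - alpha) * cosine + alpha)

steps_per_epoch = 118287 // 64       # COCO_TRAIN_EXAMPLES // train_batch_size
decay_steps = 450 * steps_per_epoch  # fine_tune_epochs * steps_per_epoch
print(cosine_lr(0, decay_steps))                      # 0.01 at the start of training
print(cosine_lr(300 * steps_per_epoch, decay_steps))  # LR where the 300-epoch run ends
print(cosine_lr(decay_steps, decay_steps))            # floor: alpha * initial_lr = 0.002
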


@exp_factory.register_config_factory('large_yolo_finetune')
def large_yolo_finetune() -> cfg.ExperimentConfig:
  """COCO object detection with YOLOv4-csp and v4."""
  train_batch_size = 64
  eval_batch_size = 8
  train_epochs = 300
  fine_tune_epochs = 450
  warmup_epochs = 3
  validation_interval = 5
  steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
  max_num_instances = 300

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=YoloTask(
          smart_bias_lr=0.1,
          init_checkpoint_modules='',
          annotation_file=None,
          weight_decay=0.0,
          model=Yolo(
              darknet_based_model=False,
              norm_activation=common.NormActivation(
                  activation='mish',
                  use_sync_bn=True,
                  norm_epsilon=0.0001,
                  norm_momentum=0.97),
              head=YoloHead(smart_bias=True),
              loss=YoloLoss(use_scaled_loss=True)),
          train_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              dtype='float32',
              parser=Parser(
                  aug_rand_saturation=0.7,
                  aug_rand_brightness=0.4,
                  aug_rand_hue=0.015,
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  random_pad=False,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
                  mosaic=Mosaic(
                      mosaic_crop_mode='scale',
                      mosaic_frequency=1.0,
                      mixup_frequency=0.2,
                  ))),
          validation_data=DataConfig(
              input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              drop_remainder=True,
              dtype='float32',
              parser=Parser(
                  letter_box=True,
                  use_tie_breaker=True,
                  best_match_only=True,
                  anchor_thresh=4.0,
                  area_thresh=0.1,
                  max_num_instances=max_num_instances,
              ))),
      trainer=cfg.TrainerConfig(
          train_steps=fine_tune_epochs * steps_per_epoch,
          validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
          validation_interval=validation_interval * steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'ema': {
                  'average_decay': 0.9999,
                  'trainable_weights_only': False,
                  'dynamic_decay': True,
              },
              'optimizer': {
                  'type': 'sgd_torch',
                  'sgd_torch': {
                      'momentum': 0.937,
                      'momentum_start': 0.9,
                      'nesterov': True,
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      'weight_decay': 0.0005 * train_batch_size / 64.0,
                  }
              },
              'learning_rate': {
                  'type': 'cosine',
                  'cosine': {
                      'initial_learning_rate': 0.01,
                      'alpha': 0.2,
                      'decay_steps': fine_tune_epochs * steps_per_epoch,
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': steps_per_epoch * warmup_epochs,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
\ No newline at end of file
@@ -51,7 +51,7 @@ YOLO_MODELS = {
                 csp_stack=7,
                 fpn_depth=7,
                 path_process_len=8,
-                fpn_filter_scale=2),
+                fpn_filter_scale=1),
         ),
     'v3':
         dict(
@@ -349,13 +349,16 @@ class YoloPAN(tf.keras.layers.Layer):
     downsample = False
     upsample = True
-    if self._csp_stack == 0:
-      proc_filters = lambda x: x
-      resample_filters = lambda x: x // 2
-    else:
-      proc_filters = lambda x: x * 2
-      resample_filters = lambda x: x
     for level, depth in zip(self._iterator, self._depths):
+      if level > 5:
+        proc_filters = lambda x: x * 2
+        resample_filters = lambda x: x // 2
+      elif self._csp_stack == 0:
+        proc_filters = lambda x: x
+        resample_filters = lambda x: x // 2
+      else:
+        proc_filters = lambda x: x * 2
+        resample_filters = lambda x: x
       if level == self._input:
         self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
             filters=proc_filters(depth),
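
Restated outside the layer for readability (a sketch, not the library code), the new branch above makes the PAN filter scaling depend on the level as well as csp_stack: levels above 5 now halve the resample filters and double the process filters unconditionally, while lower levels keep the previous behaviour:

# Standalone mirror of the per-level filter rule added in YoloPAN above.
def pan_filters(level, csp_stack, depth):
  if level > 5:
    return depth * 2, depth // 2  # proc_filters, resample_filters
  if csp_stack == 0:
    return depth, depth // 2
  return depth * 2, depth

print(pan_filters(5, csp_stack=7, depth=512))  # (1024, 512), unchanged behaviour
print(pan_filters(7, csp_stack=7, depth=512))  # (1024, 256), new P6/P7 behaviour
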
@@ -396,7 +399,7 @@
     depths = []
     if len(inputs.keys()) > 3 or self._fpn_filter_scale > 1:
       for i in range(self._min_level, self._max_level + 1):
-        depths.append(inputs[str(i)][-1] * 2)
+        depths.append(inputs[str(i)][-1])  # * 2)
     else:
       for _ in range(self._min_level, self._max_level + 1):
         depths.append(minimum_depth)
@@ -605,4 +608,4 @@ def build_yolo_decoder(
   base_model.update(base_dict)

   model = YoloDecoder(input_specs, **base_model, **kwargs)
-  return model
+  return model
\ No newline at end of file