ModelZoo / ResNet50_tensorflow · Commits

Commit 2b676a9b, authored Jun 16, 2021 by Gunho Park
Merge remote-tracking branch 'upstream/master'
Parents: 6ddd627a, bcbce005

Changes: 28 files in the commit; 20 changed files shown below, with 1269 additions and 275 deletions (+1269, -275).
Changed files shown on this page:

  official/core/base_trainer.py                                          +3    -2
  official/nlp/data/classifier_data_lib.py                               +16   -18
  official/nlp/data/sentence_prediction_dataloader.py                    +7    -5
  official/nlp/data/sentence_prediction_dataloader_test.py               +9    -6
  official/nlp/projects/mobilebert/README.md                             +1    -1
  official/nlp/tasks/sentence_prediction.py                              +10   -6
  official/projects/README.md                                            +2    -0
  official/vision/beta/data/create_coco_tf_record.py                     +27   -10
  official/vision/beta/data/process_coco_few_shot.sh                     +48   -0
  official/vision/beta/data/process_coco_few_shot_json_files.py          +124  -0
  official/vision/beta/projects/movinet/modeling/movinet.py              +11   -2
  official/vision/beta/projects/movinet/modeling/movinet_layers.py       +21   -2
  official/vision/beta/projects/movinet/modeling/movinet_model.py        +22   -6
  official/vision/beta/projects/yolo/README.md                           +5    -0
  official/vision/beta/projects/yolo/configs/backbones.py                +7    -4
  official/vision/beta/projects/yolo/configs/darknet_classification.py   +1    -1
  official/vision/beta/projects/yolo/modeling/backbones/darknet.py       +422  -184
  official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py  +41   -28
  official/vision/beta/projects/yolo/modeling/decoders/__init__.py       +14   -0
  official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py   +478  -0
official/core/base_trainer.py (+3, -2)

@@ -246,10 +246,11 @@ class Trainer(_AsyncTrainer):
     self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
     self._validation_loss = tf.keras.metrics.Mean(
         "validation_loss", dtype=tf.float32)
+    model_metrics = model.metrics if hasattr(model, "metrics") else []
-    self._train_metrics = self.task.build_metrics(
-        training=True) + self.model.metrics
-    self._validation_metrics = self.task.build_metrics(
-        training=False) + self.model.metrics
+    self._train_metrics = self.task.build_metrics(training=True) + model_metrics
+    self._validation_metrics = self.task.build_metrics(
+        training=False) + model_metrics
     self.init_async()
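The guarded `model_metrics` lookup matters because not every model object handed to the trainer exposes a `metrics` attribute. A minimal sketch of the same pattern, using placeholder objects (`DummyModel` and the standalone `build_metrics` helper are illustrative, not part of the Model Garden API):

import tensorflow as tf

class DummyModel:
  """Stand-in for a model object that has no `metrics` attribute."""
  pass

def build_metrics(training=True):
  # Stand-in for task.build_metrics().
  name = "training" if training else "validation"
  return [tf.keras.metrics.SparseCategoricalAccuracy(name=f"{name}_accuracy")]

model = DummyModel()
# Same pattern as the patched trainer: fall back to an empty list instead of
# raising AttributeError when the model exposes no metrics.
model_metrics = model.metrics if hasattr(model, "metrics") else []
train_metrics = build_metrics(training=True) + model_metrics
print([m.name for m in train_metrics])  # ['training_accuracy']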
official/nlp/data/classifier_data_lib.py (+16, -18)

@@ -181,20 +181,21 @@ class AxProcessor(DataProcessor):
 class ColaProcessor(DataProcessor):
   """Processor for the CoLA data set (GLUE version)."""

+  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
+    super(ColaProcessor, self).__init__(process_text_fn)
+    self.dataset = tfds.load("glue/cola", try_gcs=True)
+
   def get_train_examples(self, data_dir):
     """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+    return self._create_examples_tfds("train")

   def get_dev_examples(self, data_dir):
     """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+    return self._create_examples_tfds("validation")

   def get_test_examples(self, data_dir):
     """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+    return self._create_examples_tfds("test")

   def get_labels(self):
     """See base class."""
...
@@ -205,22 +206,19 @@ class ColaProcessor(DataProcessor):
     """See base class."""
     return "COLA"

-  def _create_examples(self, lines, set_type):
+  def _create_examples_tfds(self, set_type):
     """Creates examples for the training/dev/test sets."""
+    dataset = self.dataset[set_type].as_numpy_iterator()
     examples = []
-    for i, line in enumerate(lines):
-      # Only the test set has a header.
-      if set_type == "test" and i == 0:
-        continue
+    for i, example in enumerate(dataset):
       guid = "%s-%s" % (set_type, i)
-      if set_type == "test":
-        text_a = self.process_text_fn(line[1])
-        label = "0"
-      else:
-        text_a = self.process_text_fn(line[3])
-        label = self.process_text_fn(line[1])
+      label = "0"
+      text_a = self.process_text_fn(example["sentence"])
+      if set_type != "test":
+        label = str(example["label"])
       examples.append(
-          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+          InputExample(guid=guid, text_a=text_a, text_b=None, label=label,
+                       weight=None))
     return examples
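The rewritten processor now reads CoLA straight from TFDS instead of TSV files. A rough usage sketch of the same iteration, assuming `tensorflow_datasets` is installed and leaving out the module's tokenization helpers:

import tensorflow_datasets as tfds

# Same idea as ColaProcessor._create_examples_tfds: iterate a TFDS split and
# build (guid, text, label) tuples. "validation" is the TFDS name for the dev set.
dataset = tfds.load("glue/cola", try_gcs=True)["validation"].as_numpy_iterator()
examples = []
for i, example in enumerate(dataset):
  guid = "validation-%d" % i
  text_a = example["sentence"].decode("utf-8")  # numpy iterator yields bytes
  label = str(example["label"])
  examples.append((guid, text_a, label))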
official/nlp/data/sentence_prediction_dataloader.py (+7, -5)

@@ -40,6 +40,7 @@ class SentencePredictionDataConfig(cfg.DataConfig):
   label_type: str = 'int'
   # Whether to include the example id number.
   include_example_id: bool = False
+  label_field: str = 'label_ids'
   # Maps the key in TfExample to feature name.
   # E.g 'label_ids' to 'next_sentence_labels'
   label_name: Optional[Tuple[str, str]] = None
...
@@ -53,6 +54,7 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
     self._params = params
     self._seq_length = params.seq_length
     self._include_example_id = params.include_example_id
+    self._label_field = params.label_field
     if params.label_name:
       self._label_name_mapping = dict([params.label_name])
     else:
...
@@ -65,7 +67,7 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
         'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
         'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
         'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'label_ids': tf.io.FixedLenFeature([], label_type),
+        self._label_field: tf.io.FixedLenFeature([], label_type),
     }
     if self._include_example_id:
       name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
...
@@ -92,10 +94,10 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
     if self._include_example_id:
       x['example_id'] = record['example_id']
-    x['label_ids'] = record['label_ids']
+    x[self._label_field] = record[self._label_field]
-    if 'label_ids' in self._label_name_mapping:
-      x[self._label_name_mapping['label_ids']] = record['label_ids']
+    if self._label_field in self._label_name_mapping:
+      x[self._label_name_mapping[self._label_field]] = record[self._label_field]
     return x
...
@@ -215,7 +217,7 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
     model_inputs = self._text_processor(segments)
     if self._include_example_id:
       model_inputs['example_id'] = record['example_id']
-    model_inputs['label_ids'] = record[self._label_field]
+    model_inputs[self._label_field] = record[self._label_field]
     return model_inputs

   def _decode(self, record: tf.Tensor):
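The new `label_field` plus the existing `label_name` pair let one dataset key feed a differently named model input. A small illustration of that mapping logic, written as standalone dict manipulation rather than the actual loader class:

# Assume a decoded TfExample record and a config equivalent to
# label_field='label_ids', label_name=('label_ids', 'next_sentence_labels').
record = {'input_ids': [1, 2, 3], 'label_ids': 1}
label_field = 'label_ids'
label_name_mapping = dict([('label_ids', 'next_sentence_labels')])

x = {'input_word_ids': record['input_ids']}
x[label_field] = record[label_field]
if label_field in label_name_mapping:
  # Expose the label under the name the model expects.
  x[label_name_mapping[label_field]] = record[label_field]
print(x)
# {'input_word_ids': [1, 2, 3], 'label_ids': 1, 'next_sentence_labels': 1}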
official/nlp/data/sentence_prediction_dataloader_test.py (+9, -6)

@@ -197,13 +197,14 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
         vocab_file=vocab_file_path)
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
+    label_field = data_config.label_field
     self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', 'label_ids'],
+        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
         features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
-    self.assertEqual(features['label_ids'].shape, (batch_size,))
+    self.assertEqual(features[label_field].shape, (batch_size,))

   @parameterized.parameters(True, False)
   def test_python_sentencepiece_preprocessing(self, use_tfds):
...
@@ -231,13 +232,14 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
     )
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
+    label_field = data_config.label_field
     self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', 'label_ids'],
+        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
         features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
-    self.assertEqual(features['label_ids'].shape, (batch_size,))
+    self.assertEqual(features[label_field].shape, (batch_size,))

   @parameterized.parameters(True, False)
   def test_saved_model_preprocessing(self, use_tfds):
...
@@ -265,13 +267,14 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
     )
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
+    label_field = data_config.label_field
     self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', 'label_ids'],
+        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
         features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
-    self.assertEqual(features['label_ids'].shape, (batch_size,))
+    self.assertEqual(features[label_field].shape, (batch_size,))

 if __name__ == '__main__':
...
official/nlp/projects/mobilebert/README.md (+1, -1)

@@ -22,7 +22,7 @@ modeling library:
 *   [mobile_bert_encoder.py](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/mobile_bert_encoder.py)
     contains `MobileBERTEncoder` implementation.
 *   [mobile_bert_layers.py](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/mobile_bert_layers.py)
-    contains `MobileBertEmbedding`, `MobileBertMaskedLM` and `MobileBertMaskedLM`
+    contains `MobileBertEmbedding`, `MobileBertTransformer` and `MobileBertMaskedLM`
     implementation.

 ## Pre-trained Models
official/nlp/tasks/sentence_prediction.py (+10, -6)

@@ -69,6 +69,10 @@ class SentencePredictionTask(base_task.Task):
     if params.metric_type not in METRIC_TYPES:
       raise ValueError('Invalid metric_type: {}'.format(params.metric_type))
     self.metric_type = params.metric_type
+    if hasattr(params.train_data, 'label_field'):
+      self.label_field = params.train_data.label_field
+    else:
+      self.label_field = 'label_ids'

   def build_model(self):
     if self.task_config.hub_module_url and self.task_config.init_checkpoint:
...
@@ -95,7 +99,7 @@ class SentencePredictionTask(base_task.Task):
         use_encoder_pooler=self.task_config.model.use_encoder_pooler)

   def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
-    label_ids = labels['label_ids']
+    label_ids = labels[self.label_field]
     if self.task_config.model.num_classes == 1:
       loss = tf.keras.losses.mean_squared_error(label_ids, model_outputs)
     else:
...
@@ -121,7 +125,7 @@ class SentencePredictionTask(base_task.Task):
         y = tf.zeros((1,), dtype=tf.float32)
       else:
         y = tf.zeros((1, 1), dtype=tf.int32)
-      x['label_ids'] = y
+      x[self.label_field] = y
       return x

     dataset = tf.data.Dataset.range(1)
...
@@ -144,10 +148,10 @@ class SentencePredictionTask(base_task.Task):
   def process_metrics(self, metrics, labels, model_outputs):
     for metric in metrics:
-      metric.update_state(labels['label_ids'], model_outputs)
+      metric.update_state(labels[self.label_field], model_outputs)

   def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    compiled_metrics.update_state(labels, model_outputs)
+    compiled_metrics.update_state(labels[self.label_field], model_outputs)

   def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
     if self.metric_type == 'accuracy':
...
@@ -163,12 +167,12 @@ class SentencePredictionTask(base_task.Task):
           'sentence_prediction':  # Ensure one prediction along batch dimension.
               tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=1),
-          'labels': labels['label_ids'],
+          'labels': labels[self.label_field],
       })
     if self.metric_type == 'pearson_spearman_corr':
       logs.update({
           'sentence_prediction': outputs,
-          'labels': labels['label_ids'],
+          'labels': labels[self.label_field],
       })
     return logs
...
official/projects/README.md (new file, +2, -0)

+This directory contains projects using TensorFlow Model Garden Modeling
+libraries.
official/vision/beta/data/create_coco_tf_record.py (+27, -10)

@@ -46,7 +46,7 @@ from official.vision.beta.data import tfrecord_lib
 flags.DEFINE_boolean(
     'include_masks', False, 'Whether to include instance segmentations masks '
     '(PNG encoded) in the result. default: False.')
-flags.DEFINE_string('image_dir', '', 'Directory containing images.')
+flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
 flags.DEFINE_string(
     'image_info_file', '', 'File containing image information. '
     'Tf Examples in the output files correspond to the image '
...
@@ -159,7 +159,7 @@ def encode_caption_annotations(caption_annotations):
 def create_tf_example(image,
-                      image_dir,
+                      image_dirs,
                       bbox_annotations=None,
                       id_to_name_map=None,
                       caption_annotations=None,
...
@@ -169,7 +169,7 @@ def create_tf_example(image,
   Args:
     image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
       u'width', u'date_captured', u'flickr_url', u'id']
-    image_dir: directory containing the image files.
+    image_dirs: list of directories containing the image files.
     bbox_annotations:
       list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
       u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
...
@@ -190,14 +190,31 @@ def create_tf_example(image,
     num_annotations_skipped: Number of (invalid) annotations that were ignored.

   Raises:
-    ValueError: if the image pointed to by data['filename'] is not a valid JPEG
+    ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
+      does not exist, or is not unique across image directories.
   """
   image_height = image['height']
   image_width = image['width']
   filename = image['file_name']
   image_id = image['id']

-  full_path = os.path.join(image_dir, filename)
+  if len(image_dirs) > 1:
+    full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs]
+    full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
+    if not full_existing_paths:
+      raise ValueError(
+          '{} does not exist across image directories.'.format(filename))
+    if len(full_existing_paths) > 1:
+      raise ValueError(
+          '{} is not unique across image directories'.format(filename))
+    full_path, = full_existing_paths
+  # If there is only one image directory, it's not worth checking for existence,
+  # since trying to open the file will raise an informative error message if it
+  # does not exist.
+  else:
+    image_dir, = image_dirs
+    full_path = os.path.join(image_dir, filename)
   with tf.io.gfile.GFile(full_path, 'rb') as fid:
     encoded_jpg = fid.read()
...
@@ -276,7 +293,7 @@ def _load_images_info(images_info_file):
   return info_dict['images']

-def generate_annotations(images, image_dir,
+def generate_annotations(images, image_dirs,
                          img_to_obj_annotation=None,
                          img_to_caption_annotation=None,
                          id_to_name_map=None,
                          include_masks=False):
...
@@ -289,12 +306,12 @@ def generate_annotations(images, image_dir,
     caption_annotaion = (img_to_caption_annotation.get(image['id'], None)
                          if img_to_caption_annotation else None)

-    yield (image, image_dir, object_annotation, id_to_name_map,
+    yield (image, image_dirs, object_annotation, id_to_name_map,
            caption_annotaion, include_masks)


 def _create_tf_record_from_coco_annotations(images_info_file,
-                                            image_dir,
+                                            image_dirs,
                                             output_path,
                                             num_shards,
                                             object_annotations_file=None,
...
@@ -309,7 +326,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
       files Eg. 'image_info_test-dev2017.json',
       'instance_annotations_train2017.json',
       'caption_annotations_train2017.json', etc.
-    image_dir: Directory containing the image files.
+    image_dirs: List of directories containing the image files.
     output_path: Path to output tf.Record file.
     num_shards: Number of output files to create.
     object_annotations_file: JSON file containing bounding box annotations.
...
@@ -333,7 +350,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
         _load_caption_annotations(caption_annotations_file))

   coco_annotations_iter = generate_annotations(
-      images, image_dir, img_to_obj_annotation, img_to_caption_annotation,
+      images, image_dirs, img_to_obj_annotation, img_to_caption_annotation,
       id_to_name_map=id_to_name_map, include_masks=include_masks)

   num_skipped = tfrecord_lib.write_tf_record_dataset(
...
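The multi-directory lookup added above can be exercised on its own. A hedged sketch of the same resolution rule, using plain `os.path` instead of `tf.io.gfile` and hypothetical directory names:

import os

def resolve_image_path(image_dirs, filename):
  """Mirrors the patched logic: the file must exist in exactly one directory."""
  if len(image_dirs) > 1:
    candidates = [os.path.join(d, filename) for d in image_dirs]
    existing = [p for p in candidates if os.path.exists(p)]
    if not existing:
      raise ValueError('{} does not exist across image directories.'.format(filename))
    if len(existing) > 1:
      raise ValueError('{} is not unique across image directories'.format(filename))
    return existing[0]
  # With a single directory, opening the file later gives a clear error anyway.
  image_dir, = image_dirs
  return os.path.join(image_dir, filename)

# Hypothetical call: COCO train and val images kept in two separate folders.
# resolve_image_path(['images/train2014', 'images/val2014'],
#                    'COCO_val2014_000000000042.jpg')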
official/vision/beta/data/process_coco_few_shot.sh (new file, +48, -0)

#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.

tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
output_dir="/tmp/coco_few_shot"

while getopts "o:" o; do
  case "${o}" in
    o)
      output_dir=${OPTARG}
      ;;
    *)
      echo "Usage: ${0} [-o <output_dir>]" 1>&2; exit 1
      ;;
  esac
done

cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
    -P "${tmp_dir}" -A "5k.json,*10shot*.json,*30shot*.json" \
    "http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

python process_coco_few_shot_json_files.py \
    --logtostderr --workdir="${tmp_dir}"

for seed in {0..9}; do
  for shots in 10 30; do
    python create_coco_tf_record.py \
        --logtostderr \
        --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
        --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
        --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --caption_annotations_file="" \
        --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
        --num_shards=4
  done
done

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
    --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
    --image_info_file="${tmp_dir}/datasplit/5k.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k" \
    --num_shards=10

rm -rf "${tmp_dir}"
official/vision/beta/data/process_coco_few_shot_json_files.py (new file, +124, -0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.

We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""

import collections
import itertools
import json
import logging
import os

from absl import app
from absl import flags

import tensorflow as tf

logger = tf.get_logger()
logger.setLevel(logging.INFO)

flags.DEFINE_string('workdir', None, 'Working directory.')

FLAGS = flags.FLAGS

CATEGORIES = [
    'airplane', 'apple', 'backpack', 'banana', 'baseball bat',
    'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird', 'boat',
    'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake', 'car', 'carrot',
    'cat', 'cell phone', 'chair', 'clock', 'couch', 'cow', 'cup',
    'dining table', 'dog', 'donut', 'elephant', 'fire hydrant', 'fork',
    'frisbee', 'giraffe', 'hair drier', 'handbag', 'horse', 'hot dog',
    'keyboard', 'kite', 'knife', 'laptop', 'microwave', 'motorcycle', 'mouse',
    'orange', 'oven', 'parking meter', 'person', 'pizza', 'potted plant',
    'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep', 'sink',
    'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball', 'stop sign',
    'suitcase', 'surfboard', 'teddy bear', 'tennis racket', 'tie', 'toaster',
    'toilet', 'toothbrush', 'traffic light', 'train', 'truck', 'tv',
    'umbrella', 'vase', 'wine glass', 'zebra'
]
SEEDS = list(range(10))
SHOTS = [10, 30]

FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  for _category in CATEGORIES:
    FILE_SUFFIXES[(_seed, _shots)].append(
        '{}full_box_{}shot_{}_trainval.json'.format(
            # http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
            #
            #   datasplit/
            #     trainvalno5k.json
            #     5k.json
            #   full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #   seed{1-9}/
            #     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #
            # This means that the JSON files for seed0 are located in the root
            # directory rather than in a `seed?/` subdirectory, hence the
            # conditional expression below.
            '' if _seed == 0 else 'seed{}/'.format(_seed), _shots, _category))


def main(unused_argv):
  workdir = FLAGS.workdir
  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [
        os.path.join(workdir, suffix) for suffix in FILE_SUFFIXES[(seed, shots)]
    ]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))

    # Make sure that all JSON files for a given seed and shots setting have the
    # same metadata. We count on this to fuse them later on.
    metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
                       'categories': d['categories']} for d in json_dicts]
    if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licences, or categories fields')

    # Retrieve images across all JSON files.
    images = sum((d['images'] for d in json_dicts), [])
    # Remove duplicate image entries.
    images = list({image['id']: image for image in images}.values())

    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': sum((d['annotations'] for d in json_dicts), [])
    }
    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s', shots, seed,
                output_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('workdir')
  app.run(main)
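The per-category JSON fusion above boils down to concatenating lists and de-duplicating images by id. A compact, self-contained sketch of that merge step with made-up toy dicts:

# Two toy COCO-style dicts that share one image; mirrors the fusion in main().
a = {'images': [{'id': 1}, {'id': 2}], 'annotations': [{'id': 10, 'image_id': 1}]}
b = {'images': [{'id': 2}, {'id': 3}], 'annotations': [{'id': 11, 'image_id': 3}]}
json_dicts = [a, b]

images = sum((d['images'] for d in json_dicts), [])
images = list({image['id']: image for image in images}.values())  # dedupe by id
annotations = sum((d['annotations'] for d in json_dicts), [])
print(len(images), len(annotations))  # 3 2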
official/vision/beta/projects/movinet/modeling/movinet.py (+11, -2)

@@ -525,7 +525,6 @@ class Movinet(tf.keras.Model):
     Returns:
       A dict mapping state names to state shapes.
     """
-
     def divide_resolution(shape, num_downsamples):
       """Downsamples the dimension to calculate strided convolution shape."""
       if shape is None:
...
@@ -564,6 +563,12 @@ class Movinet(tf.keras.Model):
         for layer_idx, layer in enumerate(params):
           expand_filters, kernel_size, strides = layer

+          # If we use a 2D kernel, we apply spatial downsampling
+          # before the buffer.
+          if (tuple(strides[1:3]) != (1, 1) and
+              self._conv_type in ['2plus1d', '3d_2plus1d']):
+            num_downsamples += 1
+
           if kernel_size[0] > 1:
             states[f'state/b{block_idx}/l{layer_idx}/stream_buffer'] = (
                 input_shape[0],
...
@@ -585,7 +590,11 @@ class Movinet(tf.keras.Model):
           if strides[1] != strides[2]:
             raise ValueError('Strides must match in the spatial dimensions, '
                              'got {}'.format(strides))
-          if strides[1] != 1 or strides[2] != 1:
+
+          # If we use a 3D kernel, we apply spatial downsampling
+          # after the buffer.
+          if (tuple(strides[1:3]) != (1, 1) and
+              self._conv_type not in ['2plus1d', '3d_2plus1d']):
             num_downsamples += 1
       elif isinstance(block, HeadSpec):
         states['state/head/pool_buffer'] = (
...
official/vision/beta/projects/movinet/modeling/movinet_layers.py (+21, -2)

@@ -633,9 +633,28 @@ class StreamConvBlock(ConvBlock):
     states = dict(states) if states is not None else {}
     x = inputs

-    if self._stream_buffer is not None:
+    # If we have no separate temporal conv, use the buffer before the 3D conv.
+    if self._conv_temporal is None and self._stream_buffer is not None:
       x, states = self._stream_buffer(x, states=states)

-    x = super(StreamConvBlock, self).call(x)
+    x = self._conv(x)
+    if self._batch_norm is not None:
+      x = self._batch_norm(x)
+    if self._activation_layer is not None:
+      x = self._activation_layer(x)
+
+    if self._conv_temporal is not None:
+      if self._stream_buffer is not None:
+        # If we have a separate temporal conv, use the buffer before the
+        # 1D conv instead (otherwise, we may waste computation on the 2D conv).
+        x, states = self._stream_buffer(x, states=states)
+
+      x = self._conv_temporal(x)
+      if self._batch_norm_temporal is not None:
+        x = self._batch_norm_temporal(x)
+      if self._activation_layer is not None:
+        x = self._activation_layer(x)

     return x, states
...
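The reworked call() places the stream buffer either before the 3D convolution or before the separate 1D temporal convolution, never both. A schematic of that control flow with stand-in callables rather than the real Keras layers, and with batch norm and activation omitted for brevity:

def stream_conv_block(x, states, conv, buffer=None, conv_temporal=None):
  """Mirrors the branching in the patched StreamConvBlock.call()."""
  if conv_temporal is None and buffer is not None:
    x, states = buffer(x, states)      # buffer feeds the 3D conv
  x = conv(x)
  if conv_temporal is not None:
    if buffer is not None:
      x, states = buffer(x, states)    # buffer feeds only the cheaper 1D conv
    x = conv_temporal(x)
  return x, states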
official/vision/beta/projects/movinet/modeling/movinet_model.py (+22, -6)

@@ -115,15 +115,31 @@ class MovinetClassifier(tf.keras.Model):
     inputs = {**states, 'image': image}

     if backbone.use_external_states:
-      before_states = set(states)
+      before_states = states
       endpoints, states = backbone(inputs)
-      after_states = set(states)
+      after_states = states

-      new_states = after_states - before_states
+      new_states = set(after_states) - set(before_states)
       if new_states:
-        raise AttributeError('Expected input and output states to be the same. '
-                             'Got extra states {}, expected {}'.format(
-                                 new_states, before_states))
+        raise ValueError(
+            'Expected input and output states to be the same. Got extra states '
+            '{}, expected {}'.format(new_states, set(before_states)))
+
+      mismatched_shapes = {}
+      for name in after_states:
+        before_shape = before_states[name].shape
+        after_shape = after_states[name].shape
+        if len(before_shape) != len(after_shape):
+          mismatched_shapes[name] = (before_shape, after_shape)
+          continue
+        for before, after in zip(before_shape, after_shape):
+          if before is not None and after is not None and before != after:
+            mismatched_shapes[name] = (before_shape, after_shape)
+            break
+      if mismatched_shapes:
+        raise ValueError(
+            'Got mismatched input and output state shapes: {}'.format(
+                mismatched_shapes))
     else:
       endpoints, states = backbone(inputs)
...
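The added check compares input and output state shapes dimension by dimension, treating `None` as a wildcard. A standalone sketch of that comparison, using plain tuples instead of Keras tensor shapes:

def find_mismatched_shapes(before_states, after_states):
  """Returns {name: (before, after)} for states whose shapes disagree."""
  mismatched = {}
  for name in after_states:
    before_shape, after_shape = before_states[name], after_states[name]
    if len(before_shape) != len(after_shape):
      mismatched[name] = (before_shape, after_shape)
      continue
    for before, after in zip(before_shape, after_shape):
      # None acts as "unknown", so it never counts as a mismatch.
      if before is not None and after is not None and before != after:
        mismatched[name] = (before_shape, after_shape)
        break
  return mismatched

print(find_mismatched_shapes({'buf': (1, None, 8)}, {'buf': (1, 4, 16)}))
# {'buf': ((1, None, 8), (1, 4, 16))}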
official/vision/beta/projects/yolo/README.md (+5, -0)

+DISCLAIMER: this YOLO implementation is still under development. No support will
+be provided during the development phase.

 # YOLO Object Detectors, You Only Look Once
 [](https://arxiv.org/abs/1804.02767)
...
@@ -74,3 +77,5 @@ head could be connected to a new, more powerful backbone if a person chose to.
+[](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
+[](https://www.python.org/downloads/release/python-380/)

(The badge image markup inside these links was stripped by the page capture; only the link targets survive.)
official/vision/beta/projects/yolo/configs/backbones.py (+7, -4)

@@ -24,11 +24,14 @@ from official.vision.beta.configs import backbones
 @dataclasses.dataclass
-class DarkNet(hyperparams.Config):
-  """DarkNet config."""
-  model_id: str = "darknet53"
+class Darknet(hyperparams.Config):
+  """Darknet config."""
+  model_id: str = 'darknet53'
+  width_scale: float = 1.0
+  depth_scale: float = 1.0
+  dilate: bool = False


 @dataclasses.dataclass
 class Backbone(backbones.Backbone):
-  darknet: DarkNet = DarkNet()
+  darknet: Darknet = Darknet()
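With the renamed dataclass, a YOLO backbone config can request width/depth scaling and dilation directly. A brief, hypothetical construction (field names follow the diff; the values are arbitrary, not recommended settings):

# Hypothetical values; width_scale, depth_scale and dilate are the new knobs.
backbone = Backbone(
    type='darknet',
    darknet=Darknet(model_id='cspdarknet53',
                    width_scale=0.75,
                    depth_scale=0.75,
                    dilate=True))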
official/vision/beta/projects/yolo/configs/darknet_classification.py (+1, -1)

@@ -32,7 +32,7 @@ class ImageClassificationModel(hyperparams.Config):
   num_classes: int = 0
   input_size: List[int] = dataclasses.field(default_factory=list)
   backbone: backbones.Backbone = backbones.Backbone(
-      type='darknet', resnet=backbones.DarkNet())
+      type='darknet', darknet=backbones.Darknet())
   dropout_rate: float = 0.0
   norm_activation: common.NormActivation = common.NormActivation()
   # Adds a BatchNormalization layer pre-GlobalAveragePooling in classification
...
official/vision/beta/projects/yolo/modeling/backbones/darknet.py
View file @
2b676a9b
...
...
@@ -13,7 +13,6 @@
# limitations under the License.
# Lint as: python3
"""Contains definitions of Darknet Backbone Networks.
The models are inspired by ResNet, and CSPNet
...
...
@@ -29,15 +28,15 @@ Cross Stage Partial networks (CSPNets) were proposed in:
arXiv:1911.11929
Dark
N
ets
A
re used mainly for
O
bject detection in:
Dark
n
ets
a
re used mainly for
o
bject detection in:
[1] Joseph Redmon, Ali Farhadi
YOLOv3: An Incremental Improvement. arXiv:1804.02767
[2] Alexey Bochkovskiy, Chien-Yao Wang, Hong-Yuan Mark Liao
YOLOv4: Optimal Speed and Accuracy of Object Detection. arXiv:2004.10934
"""
import
collections
import
collections
import
tensorflow
as
tf
from
official.modeling
import
hyperparams
...
...
@@ -45,28 +44,32 @@ from official.vision.beta.modeling.backbones import factory
from
official.vision.beta.projects.yolo.modeling.layers
import
nn_blocks
class
BlockConfig
(
object
):
"""Get layer config to make code more readable.
Args:
layer: string layer name
stack: the type of layer ordering to use for this specific level
repetitions: integer for the number of times to repeat block
bottelneck: boolean for does this stack have a bottle neck layer
filters: integer for the output depth of the level
pool_size: integer the pool_size of max pool layers
kernel_size: optional integer, for convolution kernel size
strides: integer or tuple to indicate convolution strides
padding: the padding to apply to layers in this stack
activation: string for the activation to use for this stack
route: integer for what level to route from to get the next input
output_name: the name to use for this output
is_output: is this layer an output in the default model
"""
class
BlockConfig
:
"""Class to store layer config to make code more readable."""
def
__init__
(
self
,
layer
,
stack
,
reps
,
bottleneck
,
filters
,
pool_size
,
kernel_size
,
strides
,
padding
,
activation
,
route
,
output_name
,
is_output
):
kernel_size
,
strides
,
padding
,
activation
,
route
,
dilation_rate
,
output_name
,
is_output
):
"""Initializing method for BlockConfig.
Args:
layer: A `str` for layer name.
stack: A `str` for the type of layer ordering to use for this specific
level.
reps: An `int` for the number of times to repeat block.
bottleneck: A `bool` for whether this stack has a bottle neck layer.
filters: An `int` for the output depth of the level.
pool_size: An `int` for the pool_size of max pool layers.
kernel_size: An `int` for convolution kernel size.
strides: A `Union[int, tuple]` that indicates convolution strides.
padding: An `int` for the padding to apply to layers in this stack.
activation: A `str` for the activation to use for this stack.
route: An `int` for the level to route from to get the next input.
dilation_rate: An `int` for the scale used in dialated Darknet.
output_name: A `str` for the name to use for this output.
is_output: A `bool` for whether this layer is an output in the default
model.
"""
self
.
layer
=
layer
self
.
stack
=
stack
self
.
repetitions
=
reps
...
...
@@ -78,6 +81,7 @@ class BlockConfig(object):
self
.
padding
=
padding
self
.
activation
=
activation
self
.
route
=
route
self
.
dilation_rate
=
dilation_rate
self
.
output_name
=
output_name
self
.
is_output
=
is_output
...
...
@@ -89,41 +93,41 @@ def build_block_specs(config):
return
specs
class
Layer
Factory
(
object
)
:
"""
Class fo
r
q
ui
ck look up of default layer
s.
class
Layer
Builder
:
"""
Laye
r
b
ui
lder clas
s.
Used by darknet to connect, introduce or exit a level. Used in place of an if
con
dition or switch to make adding new layers easier and to reduce redundant
code.
Class for quick look up of default layers used by darknet to
con
nect, introduce or exit a level. Used in place of an if condition
or switch to make adding new layers easier and to reduce redundant
code.
"""
def
__init__
(
self
):
self
.
_layer_dict
=
{
"
ConvBN
"
:
(
nn_blocks
.
ConvBN
,
self
.
conv_bn_config_todict
),
"
MaxPool
"
:
(
tf
.
keras
.
layers
.
MaxPool2D
,
self
.
maxpool_config_todict
)
'
ConvBN
'
:
(
nn_blocks
.
ConvBN
,
self
.
conv_bn_config_todict
),
'
MaxPool
'
:
(
tf
.
keras
.
layers
.
MaxPool2D
,
self
.
maxpool_config_todict
)
}
def
conv_bn_config_todict
(
self
,
config
,
kwargs
):
dictvals
=
{
"
filters
"
:
config
.
filters
,
"
kernel_size
"
:
config
.
kernel_size
,
"
strides
"
:
config
.
strides
,
"
padding
"
:
config
.
padding
'
filters
'
:
config
.
filters
,
'
kernel_size
'
:
config
.
kernel_size
,
'
strides
'
:
config
.
strides
,
'
padding
'
:
config
.
padding
}
dictvals
.
update
(
kwargs
)
return
dictvals
def
darktiny_config_todict
(
self
,
config
,
kwargs
):
dictvals
=
{
"
filters
"
:
config
.
filters
,
"
strides
"
:
config
.
strides
}
dictvals
=
{
'
filters
'
:
config
.
filters
,
'
strides
'
:
config
.
strides
}
dictvals
.
update
(
kwargs
)
return
dictvals
def
maxpool_config_todict
(
self
,
config
,
kwargs
):
return
{
"
pool_size
"
:
config
.
pool_size
,
"
strides
"
:
config
.
strides
,
"
padding
"
:
config
.
padding
,
"
name
"
:
kwargs
[
"
name
"
]
'
pool_size
'
:
config
.
pool_size
,
'
strides
'
:
config
.
strides
,
'
padding
'
:
config
.
padding
,
'
name
'
:
kwargs
[
'
name
'
]
}
def
__call__
(
self
,
config
,
kwargs
):
...
...
@@ -134,90 +138,259 @@ class LayerFactory(object):
# model configs
LISTNAMES
=
[
"
default_layer_name
"
,
"
level_type
"
,
"
number_of_layers_in_level
"
,
"
bottleneck
"
,
"
filters
"
,
"
kernal_size
"
,
"
pool_size
"
,
"
strides
"
,
"
padding
"
,
"
default_activation
"
,
"
route
"
,
"
level/name
"
,
"
is_output
"
'
default_layer_name
'
,
'
level_type
'
,
'
number_of_layers_in_level
'
,
'
bottleneck
'
,
'
filters
'
,
'
kernal_size
'
,
'
pool_size
'
,
'
strides
'
,
'
padding
'
,
'
default_activation
'
,
'
route
'
,
'dilation'
,
'
level/name
'
,
'
is_output
'
]
# pylint: disable=line-too-long
CSPDARKNET53
=
{
"list_names"
:
LISTNAMES
,
"splits"
:
{
"backbone_split"
:
106
,
"neck_split"
:
138
},
"backbone"
:
[
[
"ConvBN"
,
None
,
1
,
False
,
32
,
None
,
3
,
1
,
"same"
,
"mish"
,
-
1
,
0
,
False
],
[
"DarkRes"
,
"csp"
,
1
,
True
,
64
,
None
,
None
,
None
,
None
,
"mish"
,
-
1
,
1
,
False
],
[
"DarkRes"
,
"csp"
,
2
,
False
,
128
,
None
,
None
,
None
,
None
,
"mish"
,
-
1
,
2
,
False
],
[
"DarkRes"
,
"csp"
,
8
,
False
,
256
,
None
,
None
,
None
,
None
,
"mish"
,
-
1
,
3
,
True
],
[
"DarkRes"
,
"csp"
,
8
,
False
,
512
,
None
,
None
,
None
,
None
,
"mish"
,
-
1
,
4
,
True
],
[
"DarkRes"
,
"csp"
,
4
,
False
,
1024
,
None
,
None
,
None
,
None
,
"mish"
,
-
1
,
5
,
True
],
'list_names'
:
LISTNAMES
,
'splits'
:
{
'backbone_split'
:
106
,
'neck_split'
:
132
},
'backbone'
:
[
[
'ConvBN'
,
None
,
1
,
False
,
32
,
None
,
3
,
1
,
'same'
,
'mish'
,
-
1
,
1
,
0
,
False
],
[
'DarkRes'
,
'csp'
,
1
,
True
,
64
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
1
,
False
],
[
'DarkRes'
,
'csp'
,
2
,
False
,
128
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
2
,
False
],
[
'DarkRes'
,
'csp'
,
8
,
False
,
256
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
3
,
True
],
[
'DarkRes'
,
'csp'
,
8
,
False
,
512
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
2
,
4
,
True
],
[
'DarkRes'
,
'csp'
,
4
,
False
,
1024
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
4
,
5
,
True
],
]
}
CSPADARKNET53
=
{
'list_names'
:
LISTNAMES
,
'splits'
:
{
'backbone_split'
:
100
,
'neck_split'
:
135
},
'backbone'
:
[
[
'ConvBN'
,
None
,
1
,
False
,
32
,
None
,
3
,
1
,
'same'
,
'mish'
,
-
1
,
1
,
0
,
False
],
[
'DarkRes'
,
'residual'
,
1
,
True
,
64
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
1
,
False
],
[
'DarkRes'
,
'csp'
,
2
,
False
,
128
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
2
,
False
],
[
'DarkRes'
,
'csp'
,
8
,
False
,
256
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
3
,
True
],
[
'DarkRes'
,
'csp'
,
8
,
False
,
512
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
2
,
4
,
True
],
[
'DarkRes'
,
'csp'
,
4
,
False
,
1024
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
4
,
5
,
True
],
]
}
LARGECSP53
=
{
'list_names'
:
LISTNAMES
,
'splits'
:
{
'backbone_split'
:
100
,
'neck_split'
:
135
},
'backbone'
:
[
[
'ConvBN'
,
None
,
1
,
False
,
32
,
None
,
3
,
1
,
'same'
,
'mish'
,
-
1
,
1
,
0
,
False
],
[
'DarkRes'
,
'csp'
,
1
,
True
,
64
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
1
,
False
],
[
'DarkRes'
,
'csp'
,
3
,
False
,
128
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
2
,
False
],
[
'DarkRes'
,
'csp'
,
15
,
False
,
256
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
1
,
3
,
True
],
[
'DarkRes'
,
'csp'
,
15
,
False
,
512
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
2
,
4
,
True
],
[
'DarkRes'
,
'csp'
,
7
,
False
,
1024
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
4
,
5
,
True
],
[
'DarkRes'
,
'csp'
,
7
,
False
,
1024
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
8
,
6
,
True
],
[
'DarkRes'
,
'csp'
,
7
,
False
,
1024
,
None
,
None
,
None
,
None
,
'mish'
,
-
1
,
16
,
7
,
True
],
]
}
DARKNET53
=
{
"list_names"
:
LISTNAMES
,
"splits"
:
{
"backbone_split"
:
76
},
"backbone"
:
[
[
"ConvBN"
,
None
,
1
,
False
,
32
,
None
,
3
,
1
,
"same"
,
"leaky"
,
-
1
,
0
,
False
],
[
"DarkRes"
,
"residual"
,
1
,
True
,
64
,
None
,
None
,
None
,
None
,
"leaky"
,
-
1
,
1
,
False
],
[
"DarkRes"
,
"residual"
,
2
,
False
,
128
,
None
,
None
,
None
,
None
,
"leaky"
,
-
1
,
2
,
False
],
[
"DarkRes"
,
"residual"
,
8
,
False
,
256
,
None
,
None
,
None
,
None
,
"leaky"
,
-
1
,
3
,
True
],
[
"DarkRes"
,
"residual"
,
8
,
False
,
512
,
None
,
None
,
None
,
None
,
"leaky"
,
-
1
,
4
,
True
],
[
"DarkRes"
,
"residual"
,
4
,
False
,
1024
,
None
,
None
,
None
,
None
,
"leaky"
,
-
1
,
5
,
True
],
'list_names'
:
LISTNAMES
,
'splits'
:
{
'backbone_split'
:
76
},
'backbone'
:
[
[
'ConvBN'
,
None
,
1
,
False
,
32
,
None
,
3
,
1
,
'same'
,
'leaky'
,
-
1
,
1
,
0
,
False
],
[
'DarkRes'
,
'residual'
,
1
,
True
,
64
,
None
,
None
,
None
,
None
,
'leaky'
,
-
1
,
1
,
1
,
False
],
[
'DarkRes'
,
'residual'
,
2
,
False
,
128
,
None
,
None
,
None
,
None
,
'leaky'
,
-
1
,
1
,
2
,
False
],
[
'DarkRes'
,
'residual'
,
8
,
False
,
256
,
None
,
None
,
None
,
None
,
'leaky'
,
-
1
,
1
,
3
,
True
],
[
'DarkRes'
,
'residual'
,
8
,
False
,
512
,
None
,
None
,
None
,
None
,
'leaky'
,
-
1
,
2
,
4
,
True
],
[
'DarkRes'
,
'residual'
,
4
,
False
,
1024
,
None
,
None
,
None
,
None
,
'leaky'
,
-
1
,
4
,
5
,
True
],
]
}
CSPDARKNETTINY
=
{
"list_names"
:
LISTNAMES
,
"splits"
:
{
"backbone_split"
:
28
},
"backbone"
:
[
[
"ConvBN"
,
None
,
1
,
False
,
32
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
0
,
False
],
[
"ConvBN"
,
None
,
1
,
False
,
64
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
1
,
False
],
[
"CSPTiny"
,
"csp_tiny"
,
1
,
False
,
64
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
2
,
False
],
[
"CSPTiny"
,
"csp_tiny"
,
1
,
False
,
128
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
3
,
False
],
[
"CSPTiny"
,
"csp_tiny"
,
1
,
False
,
256
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
4
,
True
],
[
"ConvBN"
,
None
,
1
,
False
,
512
,
None
,
3
,
1
,
"same"
,
"leaky"
,
-
1
,
5
,
True
],
'list_names'
:
LISTNAMES
,
'splits'
:
{
'backbone_split'
:
28
},
'backbone'
:
[
[
'ConvBN'
,
None
,
1
,
False
,
32
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
0
,
False
],
[
'ConvBN'
,
None
,
1
,
False
,
64
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
1
,
False
],
[
'CSPTiny'
,
'csp_tiny'
,
1
,
False
,
64
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
2
,
False
],
[
'CSPTiny'
,
'csp_tiny'
,
1
,
False
,
128
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
3
,
False
],
[
'CSPTiny'
,
'csp_tiny'
,
1
,
False
,
256
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
4
,
True
],
[
'ConvBN'
,
None
,
1
,
False
,
512
,
None
,
3
,
1
,
'same'
,
'leaky'
,
-
1
,
1
,
5
,
True
],
]
}
DARKNETTINY
=
{
"list_names"
:
LISTNAMES
,
"splits"
:
{
"backbone_split"
:
14
},
"backbone"
:
[
[
"ConvBN"
,
None
,
1
,
False
,
16
,
None
,
3
,
1
,
"same"
,
"leaky"
,
-
1
,
0
,
False
],
[
"DarkTiny"
,
"tiny"
,
1
,
True
,
32
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
1
,
False
],
[
"DarkTiny"
,
"tiny"
,
1
,
True
,
64
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
2
,
False
],
[
"DarkTiny"
,
"tiny"
,
1
,
False
,
128
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
3
,
False
],
[
"DarkTiny"
,
"tiny"
,
1
,
False
,
256
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
4
,
True
],
[
"DarkTiny"
,
"tiny"
,
1
,
False
,
512
,
None
,
3
,
2
,
"same"
,
"leaky"
,
-
1
,
5
,
False
],
[
"DarkTiny"
,
"tiny"
,
1
,
False
,
1024
,
None
,
3
,
1
,
"same"
,
"leaky"
,
-
1
,
5
,
True
],
'list_names'
:
LISTNAMES
,
'splits'
:
{
'backbone_split'
:
14
},
'backbone'
:
[
[
'ConvBN'
,
None
,
1
,
False
,
16
,
None
,
3
,
1
,
'same'
,
'leaky'
,
-
1
,
1
,
0
,
False
],
[
'DarkTiny'
,
'tiny'
,
1
,
True
,
32
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
1
,
False
],
[
'DarkTiny'
,
'tiny'
,
1
,
True
,
64
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
2
,
False
],
[
'DarkTiny'
,
'tiny'
,
1
,
False
,
128
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
3
,
False
],
[
'DarkTiny'
,
'tiny'
,
1
,
False
,
256
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
4
,
True
],
[
'DarkTiny'
,
'tiny'
,
1
,
False
,
512
,
None
,
3
,
2
,
'same'
,
'leaky'
,
-
1
,
1
,
5
,
False
],
[
'DarkTiny'
,
'tiny'
,
1
,
False
,
1024
,
None
,
3
,
1
,
'same'
,
'leaky'
,
-
1
,
1
,
5
,
True
],
]
}
# pylint: enable=line-too-long
BACKBONES
=
{
"darknettiny"
:
DARKNETTINY
,
"darknet53"
:
DARKNET53
,
"cspdarknet53"
:
CSPDARKNET53
,
"cspdarknettiny"
:
CSPDARKNETTINY
'darknettiny'
:
DARKNETTINY
,
'darknet53'
:
DARKNET53
,
'cspdarknet53'
:
CSPDARKNET53
,
'altered_cspdarknet53'
:
CSPADARKNET53
,
'cspdarknettiny'
:
CSPDARKNETTINY
,
'csp-large'
:
LARGECSP53
,
}
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
"
yolo
"
)
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
'
yolo
'
)
class
Darknet
(
tf
.
keras
.
Model
):
"""Darknet backbone."""
"""
The
Darknet backbone
architecture
."""
def
__init__
(
self
,
model_id
=
"
darknet53
"
,
model_id
=
'
darknet53
'
,
input_specs
=
tf
.
keras
.
layers
.
InputSpec
(
shape
=
[
None
,
None
,
None
,
3
]),
min_level
=
None
,
max_level
=
5
,
width_scale
=
1.0
,
depth_scale
=
1.0
,
csp_level_mod
=
(),
activation
=
None
,
use_sync_bn
=
False
,
norm_momentum
=
0.99
,
norm_epsilon
=
0.001
,
kernel_initializer
=
"glorot_uniform"
,
dilate
=
False
,
kernel_initializer
=
'glorot_uniform'
,
kernel_regularizer
=
None
,
bias_regularizer
=
None
,
**
kwargs
):
...
...
@@ -227,12 +400,13 @@ class Darknet(tf.keras.Model):
self
.
_model_name
=
model_id
self
.
_splits
=
splits
self
.
_input_shape
=
input_specs
self
.
_registry
=
Layer
Factory
()
self
.
_registry
=
Layer
Builder
()
# default layer look up
self
.
_min_size
=
min_level
self
.
_max_size
=
max_level
self
.
_output_specs
=
None
self
.
_csp_level_mod
=
set
(
csp_level_mod
)
self
.
_kernel_initializer
=
kernel_initializer
self
.
_bias_regularizer
=
bias_regularizer
...
...
@@ -241,16 +415,20 @@ class Darknet(tf.keras.Model):
self
.
_use_sync_bn
=
use_sync_bn
self
.
_activation
=
activation
self
.
_kernel_regularizer
=
kernel_regularizer
self
.
_dilate
=
dilate
self
.
_width_scale
=
width_scale
self
.
_depth_scale
=
depth_scale
self
.
_default_dict
=
{
"kernel_initializer"
:
self
.
_kernel_initializer
,
"kernel_regularizer"
:
self
.
_kernel_regularizer
,
"bias_regularizer"
:
self
.
_bias_regularizer
,
"norm_momentum"
:
self
.
_norm_momentum
,
"norm_epsilon"
:
self
.
_norm_epislon
,
"use_sync_bn"
:
self
.
_use_sync_bn
,
"activation"
:
self
.
_activation
,
"name"
:
None
'kernel_initializer'
:
self
.
_kernel_initializer
,
'kernel_regularizer'
:
self
.
_kernel_regularizer
,
'bias_regularizer'
:
self
.
_bias_regularizer
,
'norm_momentum'
:
self
.
_norm_momentum
,
'norm_epsilon'
:
self
.
_norm_epislon
,
'use_sync_bn'
:
self
.
_use_sync_bn
,
'activation'
:
self
.
_activation
,
'dilation_rate'
:
1
,
'name'
:
None
}
inputs
=
tf
.
keras
.
layers
.
Input
(
shape
=
self
.
_input_shape
.
shape
[
1
:])
...
...
@@ -273,33 +451,39 @@ class Darknet(tf.keras.Model):
endpoints
=
collections
.
OrderedDict
()
stack_outputs
=
[
inputs
]
for
i
,
config
in
enumerate
(
net
):
if
config
.
output_name
>
self
.
_max_size
:
break
if
config
.
output_name
in
self
.
_csp_level_mod
:
config
.
stack
=
'residual'
config
.
filters
=
int
(
config
.
filters
*
self
.
_width_scale
)
config
.
repetitions
=
int
(
config
.
repetitions
*
self
.
_depth_scale
)
if
config
.
stack
is
None
:
x
=
self
.
_build_block
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
"
{
config
.
layer
}
_
{
i
}
"
)
x
=
self
.
_build_block
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
'
{
config
.
layer
}
_
{
i
}
'
)
stack_outputs
.
append
(
x
)
elif
config
.
stack
==
"residual"
:
x
=
self
.
_residual_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
"
{
config
.
layer
}
_
{
i
}
"
)
elif
config
.
stack
==
'residual'
:
x
=
self
.
_residual_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
'
{
config
.
layer
}
_
{
i
}
'
)
stack_outputs
.
append
(
x
)
elif
config
.
stack
==
"csp"
:
x
=
self
.
_csp_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
"
{
config
.
layer
}
_
{
i
}
"
)
elif
config
.
stack
==
'csp'
:
x
=
self
.
_csp_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
'
{
config
.
layer
}
_
{
i
}
'
)
stack_outputs
.
append
(
x
)
elif
config
.
stack
==
"
csp_tiny
"
:
x_pass
,
x
=
self
.
_csp_tiny_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
"
{
config
.
layer
}
_
{
i
}
"
)
elif
config
.
stack
==
'
csp_tiny
'
:
x_pass
,
x
=
self
.
_csp_tiny_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
'
{
config
.
layer
}
_
{
i
}
'
)
stack_outputs
.
append
(
x_pass
)
elif
config
.
stack
==
"tiny"
:
x
=
self
.
_tiny_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
"
{
config
.
layer
}
_
{
i
}
"
)
elif
config
.
stack
==
'tiny'
:
x
=
self
.
_tiny_stack
(
stack_outputs
[
config
.
route
],
config
,
name
=
f
'
{
config
.
layer
}
_
{
i
}
'
)
stack_outputs
.
append
(
x
)
if
(
config
.
is_output
and
self
.
_min_size
is
None
):
endpoints
[
str
(
config
.
output_name
)]
=
x
elif
self
.
_min_size
is
not
None
and
config
.
output_name
>=
self
.
_min_size
and
config
.
output_name
<=
self
.
_max_size
:
elif
(
self
.
_min_size
is
not
None
and
config
.
output_name
>=
self
.
_min_size
and
config
.
output_name
<=
self
.
_max_size
):
endpoints
[
str
(
config
.
output_name
)]
=
x
self
.
_output_specs
=
{
l
:
endpoints
[
l
].
get_shape
()
for
l
in
endpoints
.
keys
()}
...
...
@@ -308,8 +492,7 @@ class Darknet(tf.keras.Model):
def
_get_activation
(
self
,
activation
):
if
self
.
_activation
is
None
:
return
activation
else
:
return
self
.
_activation
return
self
.
_activation
def
_csp_stack
(
self
,
inputs
,
config
,
name
):
if
config
.
bottleneck
:
...
...
@@ -320,86 +503,135 @@ class Darknet(tf.keras.Model):
csp_filter_scale
=
2
residual_filter_scale
=
1
scale_filters
=
2
self
.
_default_dict
[
"activation"
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
"name"
]
=
f
"
{
name
}
_csp_down"
x
,
x_route
=
nn_blocks
.
CSPRoute
(
filters
=
config
.
filters
,
filter_scale
=
csp_filter_scale
,
downsample
=
True
,
**
self
.
_default_dict
)(
inputs
)
for
i
in
range
(
config
.
repetitions
):
self
.
_default_dict
[
"name"
]
=
f
"
{
name
}
_
{
i
}
"
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
//
scale_filters
,
filter_scale
=
residual_filter_scale
,
**
self
.
_default_dict
)(
x
)
self
.
_default_dict
[
"name"
]
=
f
"
{
name
}
_csp_connect"
output
=
nn_blocks
.
CSPConnect
(
filters
=
config
.
filters
,
filter_scale
=
csp_filter_scale
,
**
self
.
_default_dict
)([
x
,
x_route
])
self
.
_default_dict
[
"activation"
]
=
self
.
_activation
self
.
_default_dict
[
"name"
]
=
None
self
.
_default_dict
[
'activation'
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
'name'
]
=
f
'
{
name
}
_csp_down'
if
self
.
_dilate
:
self
.
_default_dict
[
'dilation_rate'
]
=
config
.
dilation_rate
else
:
self
.
_default_dict
[
'dilation_rate'
]
=
1
# swap/add dilation
x
,
x_route
=
nn_blocks
.
CSPRoute
(
filters
=
config
.
filters
,
filter_scale
=
csp_filter_scale
,
downsample
=
True
,
**
self
.
_default_dict
)(
inputs
)
dilated_reps
=
config
.
repetitions
-
self
.
_default_dict
[
'dilation_rate'
]
//
2
for
i
in
range
(
dilated_reps
):
self
.
_default_dict
[
'name'
]
=
f
'
{
name
}
_
{
i
}
'
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
//
scale_filters
,
filter_scale
=
residual_filter_scale
,
**
self
.
_default_dict
)(
x
)
for
i
in
range
(
dilated_reps
,
config
.
repetitions
):
self
.
_default_dict
[
'dilation_rate'
]
=
self
.
_default_dict
[
'dilation_rate'
]
//
2
self
.
_default_dict
[
'name'
]
=
f
"
{
name
}
_
{
i
}
_degridded_
{
self
.
_default_dict
[
'dilation_rate'
]
}
"
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
//
scale_filters
,
filter_scale
=
residual_filter_scale
,
**
self
.
_default_dict
)(
x
)
self
.
_default_dict
[
'name'
]
=
f
'
{
name
}
_csp_connect'
output
=
nn_blocks
.
CSPConnect
(
filters
=
config
.
filters
,
filter_scale
=
csp_filter_scale
,
**
self
.
_default_dict
)([
x
,
x_route
])
self
.
_default_dict
[
'activation'
]
=
self
.
_activation
self
.
_default_dict
[
'name'
]
=
None
return
output
def
_csp_tiny_stack
(
self
,
inputs
,
config
,
name
):
self
.
_default_dict
[
"activation"
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
"name"
]
=
f
"
{
name
}
_csp_tiny"
x
,
x_route
=
nn_blocks
.
CSPTiny
(
filters
=
config
.
filters
,
**
self
.
_default_dict
)(
inputs
)
self
.
_default_dict
[
"activation"
]
=
self
.
_activation
self
.
_default_dict
[
"name"
]
=
None
self
.
_default_dict
[
'activation'
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
'name'
]
=
f
'
{
name
}
_csp_tiny'
x
,
x_route
=
nn_blocks
.
CSPTiny
(
filters
=
config
.
filters
,
**
self
.
_default_dict
)(
inputs
)
self
.
_default_dict
[
'activation'
]
=
self
.
_activation
self
.
_default_dict
[
'name'
]
=
None
return
x
,
x_route
def
_tiny_stack
(
self
,
inputs
,
config
,
name
):
x
=
tf
.
keras
.
layers
.
MaxPool2D
(
pool_size
=
2
,
strides
=
config
.
strides
,
padding
=
"same"
,
data_format
=
None
,
name
=
f
"
{
name
}
_tiny/pool"
)(
inputs
)
self
.
_default_dict
[
"activation"
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
"name"
]
=
f
"
{
name
}
_tiny/conv"
x
=
tf
.
keras
.
layers
.
MaxPool2D
(
pool_size
=
2
,
strides
=
config
.
strides
,
padding
=
'same'
,
data_format
=
None
,
name
=
f
'
{
name
}
_tiny/pool'
)(
inputs
)
self
.
_default_dict
[
'activation'
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
'name'
]
=
f
'
{
name
}
_tiny/conv'
x
=
nn_blocks
.
ConvBN
(
filters
=
config
.
filters
,
kernel_size
=
(
3
,
3
),
strides
=
(
1
,
1
),
padding
=
"
same
"
,
padding
=
'
same
'
,
**
self
.
_default_dict
)(
x
)
self
.
_default_dict
[
"
activation
"
]
=
self
.
_activation
self
.
_default_dict
[
"
name
"
]
=
None
self
.
_default_dict
[
'
activation
'
]
=
self
.
_activation
self
.
_default_dict
[
'
name
'
]
=
None
return
x
def
_residual_stack
(
self
,
inputs
,
config
,
name
):
self
.
_default_dict
[
"activation"
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
"name"
]
=
f
"
{
name
}
_residual_down"
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
,
downsample
=
True
,
**
self
.
_default_dict
)(
inputs
)
for
i
in
range
(
config
.
repetitions
-
1
):
self
.
_default_dict
[
"name"
]
=
f
"
{
name
}
_
{
i
}
"
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
,
**
self
.
_default_dict
)(
x
)
self
.
_default_dict
[
"activation"
]
=
self
.
_activation
self
.
_default_dict
[
"name"
]
=
None
self
.
_default_dict
[
'activation'
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
'name'
]
=
f
'
{
name
}
_residual_down'
if
self
.
_dilate
:
self
.
_default_dict
[
'dilation_rate'
]
=
config
.
dilation_rate
if
config
.
repetitions
<
8
:
config
.
repetitions
+=
2
else
:
self
.
_default_dict
[
'dilation_rate'
]
=
1
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
,
downsample
=
True
,
**
self
.
_default_dict
)(
inputs
)
dilated_reps
=
config
.
repetitions
-
(
self
.
_default_dict
[
'dilation_rate'
]
//
2
)
-
1
for
i
in
range
(
dilated_reps
):
self
.
_default_dict
[
'name'
]
=
f
'
{
name
}
_
{
i
}
'
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
,
**
self
.
_default_dict
)(
x
)
for
i
in
range
(
dilated_reps
,
config
.
repetitions
-
1
):
self
.
_default_dict
[
'dilation_rate'
]
=
self
.
_default_dict
[
'dilation_rate'
]
//
2
self
.
_default_dict
[
'name'
]
=
f
"
{
name
}
_
{
i
}
_degridded_
{
self
.
_default_dict
[
'dilation_rate'
]
}
"
x
=
nn_blocks
.
DarkResidual
(
filters
=
config
.
filters
,
**
self
.
_default_dict
)(
x
)
self
.
_default_dict
[
'activation'
]
=
self
.
_activation
self
.
_default_dict
[
'name'
]
=
None
self
.
_default_dict
[
'dilation_rate'
]
=
1
return
x
def
_build_block
(
self
,
inputs
,
config
,
name
):
x
=
inputs
i
=
0
self
.
_default_dict
[
"
activation
"
]
=
self
.
_get_activation
(
config
.
activation
)
self
.
_default_dict
[
'
activation
'
]
=
self
.
_get_activation
(
config
.
activation
)
while
i
<
config
.
repetitions
:
self
.
_default_dict
[
"
name
"
]
=
f
"
{
name
}
_
{
i
}
"
self
.
_default_dict
[
'
name
'
]
=
f
'
{
name
}
_
{
i
}
'
layer
=
self
.
_registry
(
config
,
self
.
_default_dict
)
x
=
layer
(
x
)
i
+=
1
self
.
_default_dict
[
"
activation
"
]
=
self
.
_activation
self
.
_default_dict
[
"
name
"
]
=
None
self
.
_default_dict
[
'
activation
'
]
=
self
.
_activation
self
.
_default_dict
[
'
name
'
]
=
None
return
x
@
staticmethod
def
get_model_config
(
name
):
name
=
name
.
lower
()
backbone
=
BACKBONES
[
name
][
"
backbone
"
]
splits
=
BACKBONES
[
name
][
"
splits
"
]
backbone
=
BACKBONES
[
name
][
'
backbone
'
]
splits
=
BACKBONES
[
name
][
'
splits
'
]
return
build_block_specs
(
backbone
),
splits
@
property
...
...
@@ -412,35 +644,41 @@ class Darknet(tf.keras.Model):

  def get_config(self):
    layer_config = {
        'model_id': self._model_name,
        'min_level': self._min_size,
        'max_level': self._max_size,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epislon,
        'use_sync_bn': self._use_sync_bn,
        'activation': self._activation,
    }
    return layer_config


@factory.register_backbone_builder('darknet')
def build_darknet(
    input_specs: tf.keras.layers.InputSpec,
    backbone_config: hyperparams.Config,
    norm_activation_config: hyperparams.Config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
  """Builds darknet."""
  backbone_cfg = backbone_config.get()
  model = Darknet(
      model_id=backbone_cfg.model_id,
      min_level=backbone_cfg.min_level,
      max_level=backbone_cfg.max_level,
      input_specs=input_specs,
      dilate=backbone_cfg.dilate,
      width_scale=backbone_cfg.width_scale,
      depth_scale=backbone_cfg.depth_scale,
      activation=norm_activation_config.activation,
      use_sync_bn=norm_activation_config.use_sync_bn,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)
  model.summary()
  return model
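
As a quick orientation for the changes below, here is a minimal sketch of how this backbone is exercised; it mirrors what the updated darknet_test.py that follows asserts, and the 224x224 input size is illustrative rather than taken from this commit:

    import tensorflow as tf

    from official.vision.beta.projects.yolo.modeling.backbones import darknet

    # Build the backbone and trace a dummy image through it; the result is a
    # dict of endpoint feature maps keyed by level ('3', '4', '5').
    backbone = darknet.Darknet(model_id='darknet53', min_level=3, max_level=5)
    inputs = tf.keras.Input(shape=(224, 224, 3), batch_size=1)
    endpoints = backbone(inputs)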
official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py
View file @ 2b676a9b
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
# Lint as: python3
"""Tests for
resnet
."""
"""Tests for
yolo
."""
from absl.testing import parameterized
import numpy as np
...
...
@@ -24,35 +24,48 @@ from tensorflow.python.distribute import strategy_combinations
from official.vision.beta.projects.yolo.modeling.backbones import darknet

class DarknetTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      (224, 'darknet53', 2, 1, True),
      (224, 'darknettiny', 1, 2, False),
      (224, 'cspdarknettiny', 1, 1, False),
      (224, 'cspdarknet53', 2, 1, True),
  )
  def test_network_creation(self, input_size, model_id,
                            endpoint_filter_scale, scale_final, dilate):
"""Test creation of ResNet family models."""
tf
.
keras
.
backend
.
set_image_data_format
(
"
channels_last
"
)
tf
.
keras
.
backend
.
set_image_data_format
(
'
channels_last
'
)
    network = darknet.Darknet(
        model_id=model_id, min_level=3, max_level=5, dilate=dilate)
    self.assertEqual(network.model_id, model_id)
    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
    endpoints = network(inputs)

    if dilate:
      self.assertAllEqual([1, input_size / 2**3, input_size / 2**3,
                           128 * endpoint_filter_scale],
                          endpoints['3'].shape.as_list())
      self.assertAllEqual([1, input_size / 2**3, input_size / 2**3,
                           256 * endpoint_filter_scale],
                          endpoints['4'].shape.as_list())
      self.assertAllEqual([1, input_size / 2**3, input_size / 2**3,
                           512 * endpoint_filter_scale * scale_final],
                          endpoints['5'].shape.as_list())
    else:
      self.assertAllEqual([1, input_size / 2**3, input_size / 2**3,
                           128 * endpoint_filter_scale],
                          endpoints['3'].shape.as_list())
      self.assertAllEqual([1, input_size / 2**4, input_size / 2**4,
                           256 * endpoint_filter_scale],
                          endpoints['4'].shape.as_list())
      self.assertAllEqual([1, input_size / 2**5, input_size / 2**5,
                           512 * endpoint_filter_scale * scale_final],
                          endpoints['5'].shape.as_list())

  @combinations.generate(
      combinations.combine(
...
...
@@ -66,20 +79,20 @@ class DarkNetTest(parameterized.TestCase, tf.test.TestCase):
"""Test for sync bn on TPU and GPU devices."""
    inputs = np.random.rand(1, 224, 224, 3)
    tf.keras.backend.set_image_data_format('channels_last')

    with strategy.scope():
      network = darknet.Darknet(model_id='darknet53', min_size=3, max_size=5)
      _ = network(inputs)

  @parameterized.parameters(1, 3, 4)
  def test_input_specs(self, input_dim):
"""Test different input feature dimensions."""
    tf.keras.backend.set_image_data_format('channels_last')

    input_specs = tf.keras.layers.InputSpec(
        shape=[None, None, None, input_dim])
    network = darknet.Darknet(
        model_id='darknet53', min_level=3, max_level=5, input_specs=input_specs)

    inputs = tf.keras.Input(shape=(224, 224, input_dim), batch_size=1)
    _ = network(inputs)
...
...
@@ -87,14 +100,14 @@ class DarkNetTest(parameterized.TestCase, tf.test.TestCase):
  def test_serialize_deserialize(self):
    # Create a network object that sets all of its config options.
    kwargs = dict(
        model_id='darknet53',
        min_level=3,
        max_level=5,
        use_sync_bn=False,
        activation='relu',
        norm_momentum=0.99,
        norm_epsilon=0.001,
        kernel_initializer='VarianceScaling',
        kernel_regularizer=None,
        bias_regularizer=None,
    )
...
...
@@ -113,5 +126,5 @@ class DarkNetTest(parameterized.TestCase, tf.test.TestCase):
    self.assertAllEqual(network.get_config(), new_network.get_config())


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/yolo/modeling/decoders/__init__.py
0 → 100644
View file @ 2b676a9b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py
0 → 100644
View file @ 2b676a9b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Feature Pyramid Network and Path Aggregation variants used in YOLO."""
import tensorflow as tf

from official.vision.beta.projects.yolo.modeling.layers import nn_blocks


@tf.keras.utils.register_keras_serializable(package='yolo')
class _IdentityRoute(tf.keras.layers.Layer):

  def call(self, inputs):
    return None, inputs


@tf.keras.utils.register_keras_serializable(package='yolo')
class YoloFPN(tf.keras.layers.Layer):
  """YOLO Feature pyramid network."""

  def __init__(self,
               fpn_depth=4,
               use_spatial_attention=False,
               csp_stack=False,
               activation='leaky',
               fpn_filter_scale=1,
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='glorot_uniform',
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
"""Yolo FPN initialization function (Yolo V4).
Args:
fpn_depth: `int`, number of layers to use in each FPN path
if you choose to use an FPN.
use_spatial_attention: `bool`, use the spatial attention module.
csp_stack: `bool`, CSPize the FPN.
activation: `str`, the activation function to use typically leaky or mish.
fpn_filter_scale: `int`, scaling factor for the FPN filters.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float`, normalization momentum for the moving average.
norm_epsilon: `float`, small float added to variance to avoid dividing by
zero.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
**kwargs: keyword arguments to be passed.
"""
    super().__init__(**kwargs)
    self._fpn_depth = fpn_depth
    self._activation = activation
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._use_spatial_attention = use_spatial_attention
    self._filter_scale = fpn_filter_scale
    self._csp_stack = csp_stack

    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
        norm_epsilon=self._norm_epsilon,
        norm_momentum=self._norm_momentum)

  def get_raw_depths(self, minimum_depth, inputs):
    """Calculates the unscaled depths of the FPN branches.

    Args:
      minimum_depth (int): depth of the smallest branch of the FPN.
      inputs (dict): dictionary of the shape of input args as a dictionary of
        lists.

    Returns:
      The unscaled depths of the FPN branches.
    """
    depths = []
    for i in range(self._min_level, self._max_level + 1):
      depths.append(inputs[str(i)][-1] / self._filter_scale)
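
    # For instance, backbone endpoints with channel depths
    # {'3': 256, '4': 512, '5': 1024} and fpn_filter_scale = 1 (illustrative
    # values) give depths [256, 512, 1024]; the list is reversed below so the
    # largest level comes first.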
    return list(reversed(depths))

  def build(self, inputs):
    """Use config dictionary to generate all important attributes for head.

    Args:
      inputs: dictionary of the shape of input args as a dictionary of lists.
    """
    keys = [int(key) for key in inputs.keys()]
    self._min_level = min(keys)
    self._max_level = max(keys)
    self._min_depth = inputs[str(self._min_level)][-1]
    self._depths = self.get_raw_depths(self._min_depth, inputs)

    # directly connect to an input path and process it
    self.preprocessors = dict()
    # resample an input and merge it with the output of another path
    # in order to aggregate backbone outputs
    self.resamples = dict()

    # set of convolution layers and upsample layers that are used to
    # prepare the FPN processors for output
    for level, depth in zip(
        reversed(range(self._min_level, self._max_level + 1)), self._depths):
      if level == self._min_level:
        self.resamples[str(level)] = nn_blocks.PathAggregationBlock(
            filters=depth // 2,
            inverted=True,
            upsample=True,
            drop_final=self._csp_stack == 0,
            upsample_size=2,
            **self._base_config)
        self.preprocessors[str(level)] = _IdentityRoute()
      elif level != self._max_level:
        self.resamples[str(level)] = nn_blocks.PathAggregationBlock(
            filters=depth // 2,
            inverted=True,
            upsample=True,
            drop_final=False,
            upsample_size=2,
            **self._base_config)
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=depth,
            repetitions=self._fpn_depth - int(level == self._min_level),
            block_invert=True,
            insert_spp=False,
            csp_stack=self._csp_stack,
            **self._base_config)
      else:
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=depth,
            repetitions=self._fpn_depth + 1 * int(self._csp_stack == 0),
            insert_spp=True,
            block_invert=False,
            csp_stack=self._csp_stack,
            **self._base_config)

  def call(self, inputs):
    outputs = dict()
    layer_in = inputs[str(self._max_level)]
    for level in reversed(range(self._min_level, self._max_level + 1)):
      _, x = self.preprocessors[str(level)](layer_in)
      outputs[str(level)] = x
      if level > self._min_level:
        x_next = inputs[str(level - 1)]
        _, layer_in = self.resamples[str(level - 1)]([x_next, x])
    return outputs


@tf.keras.utils.register_keras_serializable(package='yolo')
class YoloPAN(tf.keras.layers.Layer):
  """YOLO Path Aggregation Network."""

  def __init__(self,
               path_process_len=6,
               max_level_process_len=None,
               embed_spp=False,
               use_spatial_attention=False,
               csp_stack=False,
               activation='leaky',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='glorot_uniform',
               kernel_regularizer=None,
               bias_regularizer=None,
               fpn_input=True,
               fpn_filter_scale=1.0,
               **kwargs):
"""Yolo Path Aggregation Network initialization function (Yolo V3 and V4).
Args:
path_process_len: `int`, number of layers ot use in each Decoder path.
max_level_process_len: `int`, number of layers ot use in the largest
processing path, or the backbones largest output if it is different.
embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model.
use_spatial_attention: `bool`, use the spatial attention module.
csp_stack: `bool`, CSPize the FPN.
activation: `str`, the activation function to use typically leaky or mish.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float`, normalization omentum for the moving average.
norm_epsilon: `float`, small float added to variance to avoid dividing
by zero.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
fpn_input: `bool`, for whether the input into this fucntion is an FPN or
a backbone.
fpn_filter_scale: `int`, scaling factor for the FPN filters.
**kwargs: keyword arguments to be passed.
"""
    super().__init__(**kwargs)

    self._path_process_len = path_process_len
    self._embed_spp = embed_spp
    self._use_spatial_attention = use_spatial_attention

    self._activation = activation
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._fpn_input = fpn_input
    self._max_level_process_len = max_level_process_len
    self._csp_stack = csp_stack
    self._fpn_filter_scale = fpn_filter_scale

    if max_level_process_len is None:
      self._max_level_process_len = path_process_len

    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
        norm_epsilon=self._norm_epsilon,
        norm_momentum=self._norm_momentum)

  def build(self, inputs):
    """Use config dictionary to generate all important attributes for head.

    Args:
      inputs: dictionary of the shape of input args as a dictionary of lists.
    """
    # define the key order
    keys = [int(key) for key in inputs.keys()]
    self._min_level = min(keys)
    self._max_level = max(keys)
    self._min_depth = inputs[str(self._min_level)][-1]
    self._depths = self.get_raw_depths(self._min_depth, inputs)

    # directly connect to an input path and process it
    self.preprocessors = dict()
    # resample an input and merge it with the output of another path
    # in order to aggregate backbone outputs
    self.resamples = dict()

    # An FPN reverses the key process order relative to the backbone, so the
    # order in which objects are created and processed has to be adjusted for
    # this. Without an FPN the decoder connects directly to the backbone, so
    # objects need to be created from the largest to the smallest level.
    if self._fpn_input:
      # process order {... 3, 4, 5}
      self._iterator = range(self._min_level, self._max_level + 1)
      self._check = lambda x: x < self._max_level
      self._key_shift = lambda x: x + 1
      self._input = self._min_level
      downsample = True
      upsample = False
    else:
      # process order {5, 4, 3, ...}
      self._iterator = list(
          reversed(range(self._min_level, self._max_level + 1)))
      self._check = lambda x: x > self._min_level
      self._key_shift = lambda x: x - 1
      self._input = self._max_level
      downsample = False
      upsample = True

    if self._csp_stack == 0:
      proc_filters = lambda x: x
      resample_filters = lambda x: x // 2
    else:
      proc_filters = lambda x: x * 2
      resample_filters = lambda x: x

    for level, depth in zip(self._iterator, self._depths):
      if level == self._input:
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=proc_filters(depth),
            repetitions=self._max_level_process_len,
            insert_spp=self._embed_spp,
            block_invert=False,
            insert_sam=self._use_spatial_attention,
            csp_stack=self._csp_stack,
            **self._base_config)
      else:
        self.resamples[str(level)] = nn_blocks.PathAggregationBlock(
            filters=resample_filters(depth),
            upsample=upsample,
            downsample=downsample,
            inverted=False,
            drop_final=self._csp_stack == 0,
            **self._base_config)
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=proc_filters(depth),
            repetitions=self._path_process_len,
            insert_spp=False,
            insert_sam=self._use_spatial_attention,
            csp_stack=self._csp_stack,
            **self._base_config)

  def get_raw_depths(self, minimum_depth, inputs):
    """Calculates the unscaled depths of the FPN branches.

    Args:
      minimum_depth: `int` depth of the smallest branch of the FPN.
      inputs: `dict[str, tf.InputSpec]` of the shape of input args as a
        dictionary of lists.

    Returns:
      The unscaled depths of the FPN branches.
    """
    depths = []
    if len(inputs.keys()) > 3 or self._fpn_filter_scale > 1:
      for i in range(self._min_level, self._max_level + 1):
        depths.append(inputs[str(i)][-1] * 2)
    else:
      for _ in range(self._min_level, self._max_level + 1):
        depths.append(minimum_depth)
        minimum_depth *= 2
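
    # For example, a three-level input with minimum_depth = 256 and
    # fpn_filter_scale <= 1 (illustrative values) yields [256, 512, 1024];
    # with fpn_input=True this order (smallest level first) is kept below,
    # otherwise the list is reversed so iteration starts at the largest level.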
    if self._fpn_input:
      return depths
    return list(reversed(depths))

  def call(self, inputs):
    outputs = dict()
    layer_in = inputs[str(self._input)]

    for level in self._iterator:
      x_route, x = self.preprocessors[str(level)](layer_in)
      outputs[str(level)] = x
      if self._check(level):
        x_next = inputs[str(self._key_shift(level))]
        _, layer_in = self.resamples[str(
            self._key_shift(level))]([x_route, x_next])
    return outputs


@tf.keras.utils.register_keras_serializable(package='yolo')
class YoloDecoder(tf.keras.Model):
  """Darknet Backbone Decoder."""

  def __init__(self,
               input_specs,
               use_fpn=False,
               use_spatial_attention=False,
               csp_stack=False,
               fpn_depth=4,
               fpn_filter_scale=1,
               path_process_len=6,
               max_level_process_len=None,
               embed_spp=False,
               activation='leaky',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='glorot_uniform',
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
"""Yolo Decoder initialization function.
A unified model that ties all decoder components into a conditionally build
YOLO decoder.
Args:
input_specs: `dict[str, tf.InputSpec]`: input specs of each of the inputs
to the heads.
use_fpn: `bool`, use the FPN found in the YoloV4 model.
use_spatial_attention: `bool`, use the spatial attention module.
csp_stack: `bool`, CSPize the FPN.
fpn_depth: `int`, number of layers ot use in each FPN path
if you choose to use an FPN.
fpn_filter_scale: `int`, scaling factor for the FPN filters.
path_process_len: `int`, number of layers ot use in each Decoder path.
max_level_process_len: `int`, number of layers ot use in the largest
processing path, or the backbones largest output if it is different.
embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model.
activation: `str`, the activation function to use typically leaky or mish.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float`, normalization omentum for the moving average.
norm_epsilon: `float`, small float added to variance to avoid dividing by
zero.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
**kwargs: keyword arguments to be passed.
"""
    self._input_specs = input_specs
    self._use_fpn = use_fpn
    self._fpn_depth = fpn_depth
    self._path_process_len = path_process_len
    self._max_level_process_len = max_level_process_len
    self._embed_spp = embed_spp

    self._activation = activation
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    self._base_config = dict(
        use_spatial_attention=use_spatial_attention,
        csp_stack=csp_stack,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        fpn_filter_scale=fpn_filter_scale,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    self._decoder_config = dict(
        path_process_len=self._path_process_len,
        max_level_process_len=self._max_level_process_len,
        embed_spp=self._embed_spp,
        fpn_input=self._use_fpn,
        **self._base_config)

    inputs = {
        key: tf.keras.layers.Input(shape=value[1:])
        for key, value in input_specs.items()
    }
    if self._use_fpn:
      inter_outs = YoloFPN(
          fpn_depth=self._fpn_depth, **self._base_config)(inputs)
      outputs = YoloPAN(**self._decoder_config)(inter_outs)
    else:
      inter_outs = None
      outputs = YoloPAN(**self._decoder_config)(inputs)

    self._output_specs = {key: value.shape for key, value in outputs.items()}
    super().__init__(inputs=inputs, outputs=outputs, name='YoloDecoder')

  @property
  def use_fpn(self):
    return self._use_fpn

  @property
  def output_specs(self):
    return self._output_specs

  def get_config(self):
    config = dict(
        input_specs=self._input_specs,
        use_fpn=self._use_fpn,
        fpn_depth=self._fpn_depth,
        **self._decoder_config)
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
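
A minimal usage sketch for the decoder above; the level keys and channel counts are illustrative assumptions (roughly a darknet53-style backbone at 416x416), not values taken from this commit:

    import tensorflow as tf

    from official.vision.beta.projects.yolo.modeling.decoders import yolo_decoder

    # Hypothetical backbone output specs, keyed by level.
    input_specs = {
        '3': [None, 52, 52, 256],
        '4': [None, 26, 26, 512],
        '5': [None, 13, 13, 1024],
    }
    decoder = yolo_decoder.YoloDecoder(input_specs, use_fpn=True, fpn_depth=4)

    # Feed dummy feature maps matching the specs; the result is a dict of
    # decoded features with the same level keys.
    features = {k: tf.ones([1] + v[1:]) for k, v in input_specs.items()}
    outputs = decoder(features)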