ModelZoo / ResNet50_tensorflow · Commits

Commit 2b676a9b, authored Jun 16, 2021 by Gunho Park

    Merge remote-tracking branch 'upstream/master'

Parents: 6ddd627a, bcbce005

Showing 20 of 28 changed files, with 1269 additions and 275 deletions.
Changed files (additions / deletions):

official/core/base_trainer.py (+3 / -2)
official/nlp/data/classifier_data_lib.py (+16 / -18)
official/nlp/data/sentence_prediction_dataloader.py (+7 / -5)
official/nlp/data/sentence_prediction_dataloader_test.py (+9 / -6)
official/nlp/projects/mobilebert/README.md (+1 / -1)
official/nlp/tasks/sentence_prediction.py (+10 / -6)
official/projects/README.md (+2 / -0)
official/vision/beta/data/create_coco_tf_record.py (+27 / -10)
official/vision/beta/data/process_coco_few_shot.sh (+48 / -0)
official/vision/beta/data/process_coco_few_shot_json_files.py (+124 / -0)
official/vision/beta/projects/movinet/modeling/movinet.py (+11 / -2)
official/vision/beta/projects/movinet/modeling/movinet_layers.py (+21 / -2)
official/vision/beta/projects/movinet/modeling/movinet_model.py (+22 / -6)
official/vision/beta/projects/yolo/README.md (+5 / -0)
official/vision/beta/projects/yolo/configs/backbones.py (+7 / -4)
official/vision/beta/projects/yolo/configs/darknet_classification.py (+1 / -1)
official/vision/beta/projects/yolo/modeling/backbones/darknet.py (+422 / -184)
official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py (+41 / -28)
official/vision/beta/projects/yolo/modeling/decoders/__init__.py (+14 / -0)
official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py (+478 / -0)
official/core/base_trainer.py

@@ -246,10 +246,11 @@ class Trainer(_AsyncTrainer):
     self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
     self._validation_loss = tf.keras.metrics.Mean(
         "validation_loss", dtype=tf.float32)
+    model_metrics = model.metrics if hasattr(model, "metrics") else []
     self._train_metrics = self.task.build_metrics(
-        training=True) + self.model.metrics
+        training=True) + model_metrics
     self._validation_metrics = self.task.build_metrics(
-        training=False) + self.model.metrics
+        training=False) + model_metrics
     self.init_async()
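Note on the change above: not every object assigned as `model` here is a full Keras model with a `metrics` property, so the new `model_metrics` fallback avoids an `AttributeError`. A minimal, self-contained sketch of the pattern (hypothetical stand-in classes, plain Python):

# Minimal sketch of the hasattr guard used above (hypothetical classes).
class BareModule:
    """A model-like object with no `metrics` attribute."""

class KerasLikeModel:
    metrics = ["accuracy_metric"]  # stand-in for tf.keras metric objects

def collect_metrics(model, task_metrics):
    # Fall back to an empty list when the model exposes no metrics,
    # instead of raising AttributeError on `model.metrics`.
    model_metrics = model.metrics if hasattr(model, "metrics") else []
    return task_metrics + model_metrics

print(collect_metrics(BareModule(), ["task_loss"]))      # ['task_loss']
print(collect_metrics(KerasLikeModel(), ["task_loss"]))  # ['task_loss', 'accuracy_metric']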
official/nlp/data/classifier_data_lib.py

@@ -181,20 +181,21 @@ class AxProcessor(DataProcessor):
 class ColaProcessor(DataProcessor):
   """Processor for the CoLA data set (GLUE version)."""

   def __init__(self, process_text_fn=tokenization.convert_to_unicode):
     super(ColaProcessor, self).__init__(process_text_fn)
+    self.dataset = tfds.load("glue/cola", try_gcs=True)

   def get_train_examples(self, data_dir):
     """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+    return self._create_examples_tfds("train")

   def get_dev_examples(self, data_dir):
     """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+    return self._create_examples_tfds("validation")

   def get_test_examples(self, data_dir):
     """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+    return self._create_examples_tfds("test")

   def get_labels(self):
     """See base class."""

@@ -205,22 +206,19 @@ class ColaProcessor(DataProcessor):
     """See base class."""
     return "COLA"

-  def _create_examples(self, lines, set_type):
+  def _create_examples_tfds(self, set_type):
     """Creates examples for the training/dev/test sets."""
+    dataset = self.dataset[set_type].as_numpy_iterator()
     examples = []
-    for i, line in enumerate(lines):
-      # Only the test set has a header.
-      if set_type == "test" and i == 0:
-        continue
+    for i, example in enumerate(dataset):
       guid = "%s-%s" % (set_type, i)
-      if set_type == "test":
-        text_a = self.process_text_fn(line[1])
-        label = "0"
-      else:
-        text_a = self.process_text_fn(line[3])
-        label = self.process_text_fn(line[1])
+      label = "0"
+      text_a = self.process_text_fn(example["sentence"])
+      if set_type != "test":
+        label = str(example["label"])
       examples.append(
-          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+          InputExample(
+              guid=guid, text_a=text_a, text_b=None, label=label, weight=None))
     return examples
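For context, the TFDS pipeline that the new `_create_examples_tfds` builds on can be exercised directly. A minimal sketch (requires `tensorflow-datasets`; downloads GLUE/CoLA on first run):

# Minimal sketch of reading GLUE/CoLA through TFDS, as the new
# _create_examples_tfds does.
import tensorflow_datasets as tfds

# Load GLUE/CoLA the same way the new ColaProcessor.__init__ does.
dataset = tfds.load("glue/cola", try_gcs=True)

# Iterate the validation split as numpy, as _create_examples_tfds does.
for i, example in enumerate(dataset["validation"].as_numpy_iterator()):
    text_a = example["sentence"].decode("utf-8")
    label = str(example["label"])
    print("validation-%s: label=%s text=%r" % (i, label, text_a))
    if i >= 2:  # show only a few rows
        break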
official/nlp/data/sentence_prediction_dataloader.py

@@ -40,6 +40,7 @@ class SentencePredictionDataConfig(cfg.DataConfig):
   label_type: str = 'int'
   # Whether to include the example id number.
   include_example_id: bool = False
+  label_field: str = 'label_ids'
   # Maps the key in TfExample to feature name.
   # E.g 'label_ids' to 'next_sentence_labels'
   label_name: Optional[Tuple[str, str]] = None

@@ -53,6 +54,7 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
     self._params = params
     self._seq_length = params.seq_length
     self._include_example_id = params.include_example_id
+    self._label_field = params.label_field
     if params.label_name:
       self._label_name_mapping = dict([params.label_name])
     else:

@@ -65,7 +67,7 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
         'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
         'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
         'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'label_ids': tf.io.FixedLenFeature([], label_type),
+        self._label_field: tf.io.FixedLenFeature([], label_type),
     }
     if self._include_example_id:
       name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)

@@ -92,10 +94,10 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
     if self._include_example_id:
       x['example_id'] = record['example_id']
-    x['label_ids'] = record['label_ids']
+    x[self._label_field] = record[self._label_field]

-    if 'label_ids' in self._label_name_mapping:
-      x[self._label_name_mapping['label_ids']] = record['label_ids']
+    if self._label_field in self._label_name_mapping:
+      x[self._label_name_mapping[self._label_field]] = record[self._label_field]

     return x

@@ -215,7 +217,7 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
     model_inputs = self._text_processor(segments)
     if self._include_example_id:
       model_inputs['example_id'] = record['example_id']
-    model_inputs['label_ids'] = record[self._label_field]
+    model_inputs[self._label_field] = record[self._label_field]
     return model_inputs

   def _decode(self, record: tf.Tensor):
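The point of the new `label_field` option is that the tf.Example parse spec is keyed by a configurable name instead of a hard-coded 'label_ids'. A standalone sketch of the same pattern (TensorFlow only; the feature names are hypothetical):

# Minimal sketch: a parse spec keyed by a configurable label field,
# mirroring the change above (toy data, TensorFlow required).
import tensorflow as tf

def make_parse_fn(seq_length, label_field="label_ids", label_type=tf.int64):
    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
        label_field: tf.io.FixedLenFeature([], label_type),
    }
    def parse(serialized):
        return tf.io.parse_single_example(serialized, name_to_features)
    return parse

# Serialize a toy example under the non-default key 'next_sentence_labels'.
example = tf.train.Example(features=tf.train.Features(feature={
    "input_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 2, 3])),
    "next_sentence_labels": tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
}))
parse = make_parse_fn(seq_length=3, label_field="next_sentence_labels")
print(parse(example.SerializeToString()))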
official/nlp/data/sentence_prediction_dataloader_test.py

@@ -197,13 +197,14 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
         vocab_file=vocab_file_path)
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
+    label_field = data_config.label_field
     self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', 'label_ids'],
+        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
         features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
-    self.assertEqual(features['label_ids'].shape, (batch_size,))
+    self.assertEqual(features[label_field].shape, (batch_size,))

   @parameterized.parameters(True, False)
   def test_python_sentencepiece_preprocessing(self, use_tfds):

@@ -231,13 +232,14 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
     )
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
+    label_field = data_config.label_field
     self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', 'label_ids'],
+        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
         features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
-    self.assertEqual(features['label_ids'].shape, (batch_size,))
+    self.assertEqual(features[label_field].shape, (batch_size,))

   @parameterized.parameters(True, False)
   def test_saved_model_preprocessing(self, use_tfds):

@@ -265,13 +267,14 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
     )
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
+    label_field = data_config.label_field
     self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', 'label_ids'],
+        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
         features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
-    self.assertEqual(features['label_ids'].shape, (batch_size,))
+    self.assertEqual(features[label_field].shape, (batch_size,))

 if __name__ == '__main__':
official/nlp/projects/mobilebert/README.md

@@ -22,7 +22,7 @@ modeling library:
 *   [mobile_bert_encoder.py](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/mobile_bert_encoder.py)
     contains `MobileBERTEncoder` implementation.
 *   [mobile_bert_layers.py](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/mobile_bert_layers.py)
-    contains `MobileBertEmbedding`, `MobileBertMaskedLM` and `MobileBertMaskedLM`
+    contains `MobileBertEmbedding`, `MobileBertTransformer` and `MobileBertMaskedLM`
     implementation.

 ## Pre-trained Models
official/nlp/tasks/sentence_prediction.py

@@ -69,6 +69,10 @@ class SentencePredictionTask(base_task.Task):
     if params.metric_type not in METRIC_TYPES:
       raise ValueError('Invalid metric_type: {}'.format(params.metric_type))
     self.metric_type = params.metric_type
+    if hasattr(params.train_data, 'label_field'):
+      self.label_field = params.train_data.label_field
+    else:
+      self.label_field = 'label_ids'

   def build_model(self):
     if self.task_config.hub_module_url and self.task_config.init_checkpoint:

@@ -95,7 +99,7 @@ class SentencePredictionTask(base_task.Task):
         use_encoder_pooler=self.task_config.model.use_encoder_pooler)

   def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
-    label_ids = labels['label_ids']
+    label_ids = labels[self.label_field]
     if self.task_config.model.num_classes == 1:
       loss = tf.keras.losses.mean_squared_error(label_ids, model_outputs)
     else:

@@ -121,7 +125,7 @@ class SentencePredictionTask(base_task.Task):
         y = tf.zeros((1,), dtype=tf.float32)
       else:
         y = tf.zeros((1, 1), dtype=tf.int32)
-      x['label_ids'] = y
+      x[self.label_field] = y
       return x

     dataset = tf.data.Dataset.range(1)

@@ -144,10 +148,10 @@ class SentencePredictionTask(base_task.Task):
   def process_metrics(self, metrics, labels, model_outputs):
     for metric in metrics:
-      metric.update_state(labels['label_ids'], model_outputs)
+      metric.update_state(labels[self.label_field], model_outputs)

   def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    compiled_metrics.update_state(labels, model_outputs)
+    compiled_metrics.update_state(labels[self.label_field], model_outputs)

   def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
     if self.metric_type == 'accuracy':

@@ -163,12 +167,12 @@ class SentencePredictionTask(base_task.Task):
           'sentence_prediction':
               # Ensure one prediction along batch dimension.
               tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=1),
-          'labels': labels['label_ids'],
+          'labels': labels[self.label_field],
       })
     if self.metric_type == 'pearson_spearman_corr':
       logs.update({
           'sentence_prediction': outputs,
-          'labels': labels['label_ids'],
+          'labels': labels[self.label_field],
       })
     return logs
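The `hasattr` check above keeps older configs, written before `label_field` existed, working unchanged. A minimal sketch of the fallback (hypothetical config classes; `getattr` with a default is the equivalent one-liner):

# Minimal sketch of the backward-compatible config read above
# (hypothetical config classes; the real ones are dataclass-style Configs).
class OldDataConfig:
    """Config written before label_field existed."""
    seq_length = 128

class NewDataConfig:
    seq_length = 128
    label_field = 'next_sentence_labels'

def resolve_label_field(train_data):
    # Same effect as: getattr(train_data, 'label_field', 'label_ids')
    if hasattr(train_data, 'label_field'):
        return train_data.label_field
    return 'label_ids'

print(resolve_label_field(OldDataConfig()))  # label_ids
print(resolve_label_field(NewDataConfig()))  # next_sentence_labels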
official/projects/README.md (new file)

This directory contains projects using TensorFlow Model Garden Modeling
libraries.
official/vision/beta/data/create_coco_tf_record.py

@@ -46,7 +46,7 @@ from official.vision.beta.data import tfrecord_lib
 flags.DEFINE_boolean(
     'include_masks', False, 'Whether to include instance segmentations masks '
     '(PNG encoded) in the result. default: False.')
-flags.DEFINE_string('image_dir', '', 'Directory containing images.')
+flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
 flags.DEFINE_string(
     'image_info_file', '', 'File containing image information. '
     'Tf Examples in the output files correspond to the image '

@@ -159,7 +159,7 @@ def encode_caption_annotations(caption_annotations):
 def create_tf_example(image,
-                      image_dir,
+                      image_dirs,
                       bbox_annotations=None,
                       id_to_name_map=None,
                       caption_annotations=None,

@@ -169,7 +169,7 @@ def create_tf_example(image,
   Args:
     image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
       u'width', u'date_captured', u'flickr_url', u'id']
-    image_dir: directory containing the image files.
+    image_dirs: list of directories containing the image files.
     bbox_annotations:
       list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
       u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box

@@ -190,14 +190,31 @@ def create_tf_example(image,
     num_annotations_skipped: Number of (invalid) annotations that were ignored.

   Raises:
-    ValueError: if the image pointed to by data['filename'] is not a valid JPEG
+    ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
+      does not exist, or is not unique across image directories.
   """
   image_height = image['height']
   image_width = image['width']
   filename = image['file_name']
   image_id = image['id']

-  full_path = os.path.join(image_dir, filename)
+  if len(image_dirs) > 1:
+    full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs]
+    full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
+    if not full_existing_paths:
+      raise ValueError(
+          '{} does not exist across image directories.'.format(filename))
+    if len(full_existing_paths) > 1:
+      raise ValueError(
+          '{} is not unique across image directories'.format(filename))
+    full_path, = full_existing_paths
+  # If there is only one image directory, it's not worth checking for existence,
+  # since trying to open the file will raise an informative error message if it
+  # does not exist.
+  else:
+    image_dir, = image_dirs
+    full_path = os.path.join(image_dir, filename)
+
   with tf.io.gfile.GFile(full_path, 'rb') as fid:
     encoded_jpg = fid.read()

@@ -276,7 +293,7 @@ def _load_images_info(images_info_file):
   return info_dict['images']

-def generate_annotations(images, image_dir,
+def generate_annotations(images, image_dirs,
                          img_to_obj_annotation=None,
                          img_to_caption_annotation=None,
                          id_to_name_map=None,
                          include_masks=False):

@@ -289,12 +306,12 @@ def generate_annotations(images, image_dir,
     caption_annotaion = (img_to_caption_annotation.get(image['id'], None)
                          if img_to_caption_annotation else None)

-    yield (image, image_dir, object_annotation, id_to_name_map,
+    yield (image, image_dirs, object_annotation, id_to_name_map,
            caption_annotaion, include_masks)

 def _create_tf_record_from_coco_annotations(images_info_file,
-                                            image_dir,
+                                            image_dirs,
                                             output_path,
                                             num_shards,
                                             object_annotations_file=None,

@@ -309,7 +326,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
       files Eg. 'image_info_test-dev2017.json',
       'instance_annotations_train2017.json',
       'caption_annotations_train2017.json', etc.
-    image_dir: Directory containing the image files.
+    image_dirs: List of directories containing the image files.
     output_path: Path to output tf.Record file.
     num_shards: Number of output files to create.
     object_annotations_file: JSON file containing bounding box annotations.

@@ -333,7 +350,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
       _load_caption_annotations(caption_annotations_file))

   coco_annotations_iter = generate_annotations(
-      images, image_dir, img_to_obj_annotation, img_to_caption_annotation,
+      images, image_dirs, img_to_obj_annotation, img_to_caption_annotation,
       id_to_name_map=id_to_name_map, include_masks=include_masks)

   num_skipped = tfrecord_lib.write_tf_record_dataset(
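The uniqueness check added above can be exercised on its own. A minimal sketch using plain `os.path` in place of `tf.io.gfile` (directory names are hypothetical):

# Minimal sketch of resolving a filename across several image directories,
# mirroring the logic above (os.path.exists stands in for tf.io.gfile.exists).
import os

def resolve_image_path(image_dirs, filename):
    if len(image_dirs) > 1:
        full_paths = [os.path.join(d, filename) for d in image_dirs]
        existing = [p for p in full_paths if os.path.exists(p)]
        if not existing:
            raise ValueError(f'{filename} does not exist across image directories.')
        if len(existing) > 1:
            raise ValueError(f'{filename} is not unique across image directories')
        return existing[0]
    # With a single directory, opening the file later yields an informative
    # error on its own, so the existence check is skipped.
    image_dir, = image_dirs
    return os.path.join(image_dir, filename)

# Example (hypothetical paths): picks whichever split contains the file.
# resolve_image_path(['/data/train2014', '/data/val2014'], 'COCO_0001.jpg')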
official/vision/beta/data/process_coco_few_shot.sh (new file)

#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.

tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
output_dir="/tmp/coco_few_shot"
while getopts "o:" o; do
  case "${o}" in
    o)
      output_dir=${OPTARG}
      ;;
    *)
      echo "Usage: ${0} [-o <output_dir>]" 1>&2; exit 1
      ;;
  esac
done

cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
    -P "${tmp_dir}" -A "5k.json,*10shot*.json,*30shot*.json" \
    "http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

python process_coco_few_shot_json_files.py \
    --logtostderr --workdir="${tmp_dir}"

for seed in {0..9}; do
  for shots in 10 30; do
    python create_coco_tf_record.py \
        --logtostderr \
        --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
        --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
        --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --caption_annotations_file="" \
        --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
        --num_shards=4
  done
done

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
    --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
    --image_info_file="${tmp_dir}/datasplit/5k.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k" \
    --num_shards=10

rm -rf "${tmp_dir}"
official/vision/beta/data/process_coco_few_shot_json_files.py (new file)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.

We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""

import collections
import itertools
import json
import logging
import os

from absl import app
from absl import flags
import tensorflow as tf

logger = tf.get_logger()
logger.setLevel(logging.INFO)

flags.DEFINE_string('workdir', None, 'Working directory.')

FLAGS = flags.FLAGS

CATEGORIES = [
    'airplane', 'apple', 'backpack', 'banana', 'baseball bat',
    'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird', 'boat',
    'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake', 'car', 'carrot',
    'cat', 'cell phone', 'chair', 'clock', 'couch', 'cow', 'cup',
    'dining table', 'dog', 'donut', 'elephant', 'fire hydrant', 'fork',
    'frisbee', 'giraffe', 'hair drier', 'handbag', 'horse', 'hot dog',
    'keyboard', 'kite', 'knife', 'laptop', 'microwave', 'motorcycle',
    'mouse', 'orange', 'oven', 'parking meter', 'person', 'pizza',
    'potted plant', 'refrigerator', 'remote', 'sandwich', 'scissors',
    'sheep', 'sink', 'skateboard', 'skis', 'snowboard', 'spoon',
    'sports ball', 'stop sign', 'suitcase', 'surfboard', 'teddy bear',
    'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush',
    'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
    'wine glass', 'zebra'
]
SEEDS = list(range(10))
SHOTS = [10, 30]

FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  for _category in CATEGORIES:
    FILE_SUFFIXES[(_seed, _shots)].append(
        '{}full_box_{}shot_{}_trainval.json'.format(
            # http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
            #
            #   datasplit/
            #     trainvalno5k.json
            #     5k.json
            #   full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #   seed{1-9}/
            #     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #
            # This means that the JSON files for seed0 are located in the root
            # directory rather than in a `seed?/` subdirectory, hence the
            # conditional expression below.
            '' if _seed == 0 else 'seed{}/'.format(_seed),
            _shots,
            _category))


def main(unused_argv):
  workdir = FLAGS.workdir
  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [
        os.path.join(workdir, suffix) for suffix in FILE_SUFFIXES[(seed, shots)]
    ]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))

    # Make sure that all JSON files for a given seed and shots setting have the
    # same metadata. We count on this to fuse them later on.
    metadata_dicts = [{
        'info': d['info'],
        'licenses': d['licenses'],
        'categories': d['categories']
    } for d in json_dicts]
    if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licences, or categories fields')

    # Retrieve images across all JSON files.
    images = sum((d['images'] for d in json_dicts), [])
    # Remove duplicate image entries.
    images = list({image['id']: image for image in images}.values())

    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': sum((d['annotations'] for d in json_dicts), [])
    }

    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s',
                shots, seed, output_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('workdir')
  app.run(main)
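The duplicate-removal idiom in `main` is compact enough to miss: a dict comprehension keyed by `'id'` keeps one entry per image id (the last one seen), and `.values()` recovers the list. A minimal sketch with toy data:

# Minimal sketch of the dedup idiom used above (toy data).
images = [
    {'id': 1, 'file_name': 'a.jpg'},
    {'id': 2, 'file_name': 'b.jpg'},
    {'id': 1, 'file_name': 'a.jpg'},  # duplicate entry across JSON files
]
deduped = list({image['id']: image for image in images}.values())
print(deduped)  # one entry each for ids 1 and 2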
official/vision/beta/projects/movinet/modeling/movinet.py

@@ -525,7 +525,6 @@ class Movinet(tf.keras.Model):
     Returns:
       A dict mapping state names to state shapes.
     """
-
     def divide_resolution(shape, num_downsamples):
       """Downsamples the dimension to calculate strided convolution shape."""
       if shape is None:

@@ -564,6 +563,12 @@ class Movinet(tf.keras.Model):
       for layer_idx, layer in enumerate(params):
         expand_filters, kernel_size, strides = layer

+        # If we use a 2D kernel, we apply spatial downsampling
+        # before the buffer.
+        if (tuple(strides[1:3]) != (1, 1) and
+            self._conv_type in ['2plus1d', '3d_2plus1d']):
+          num_downsamples += 1
+
         if kernel_size[0] > 1:
           states[f'state/b{block_idx}/l{layer_idx}/stream_buffer'] = (
               input_shape[0],

@@ -585,7 +590,11 @@ class Movinet(tf.keras.Model):
         if strides[1] != strides[2]:
           raise ValueError('Strides must match in the spatial dimensions, '
                            'got {}'.format(strides))
-        if strides[1] != 1 or strides[2] != 1:
+
+        # If we use a 3D kernel, we apply spatial downsampling
+        # after the buffer.
+        if (tuple(strides[1:3]) != (1, 1) and
+            self._conv_type not in ['2plus1d', '3d_2plus1d']):
           num_downsamples += 1

     elif isinstance(block, HeadSpec):
       states['state/head/pool_buffer'] = (
official/vision/beta/projects/movinet/modeling/movinet_layers.py

@@ -633,9 +633,28 @@ class StreamConvBlock(ConvBlock):
     states = dict(states) if states is not None else {}
     x = inputs

-    if self._stream_buffer is not None:
+    # If we have no separate temporal conv, use the buffer before the 3D conv.
+    if self._conv_temporal is None and self._stream_buffer is not None:
       x, states = self._stream_buffer(x, states=states)

-    x = super(StreamConvBlock, self).call(x)
+    x = self._conv(x)
+    if self._batch_norm is not None:
+      x = self._batch_norm(x)
+    if self._activation_layer is not None:
+      x = self._activation_layer(x)
+
+    if self._conv_temporal is not None:
+      if self._stream_buffer is not None:
+        # If we have a separate temporal conv, use the buffer before the
+        # 1D conv instead (otherwise, we may waste computation on the 2D conv).
+        x, states = self._stream_buffer(x, states=states)
+
+      x = self._conv_temporal(x)
+      if self._batch_norm_temporal is not None:
+        x = self._batch_norm_temporal(x)
+      if self._activation_layer is not None:
+        x = self._activation_layer(x)

     return x, states
official/vision/beta/projects/movinet/modeling/movinet_model.py

@@ -115,15 +115,31 @@ class MovinetClassifier(tf.keras.Model):
     inputs = {**states, 'image': image}

     if backbone.use_external_states:
-      before_states = set(states)
+      before_states = states
       endpoints, states = backbone(inputs)
-      after_states = set(states)
+      after_states = states

-      new_states = after_states - before_states
+      new_states = set(after_states) - set(before_states)
       if new_states:
-        raise AttributeError('Expected input and output states to be the same. '
-                             'Got extra states {}, expected {}'.format(
-                                 new_states, before_states))
+        raise ValueError(
+            'Expected input and output states to be the same. Got extra states '
+            '{}, expected {}'.format(new_states, set(before_states)))
+
+      mismatched_shapes = {}
+      for name in after_states:
+        before_shape = before_states[name].shape
+        after_shape = after_states[name].shape
+        if len(before_shape) != len(after_shape):
+          mismatched_shapes[name] = (before_shape, after_shape)
+          continue
+        for before, after in zip(before_shape, after_shape):
+          if before is not None and after is not None and before != after:
+            mismatched_shapes[name] = (before_shape, after_shape)
+            break
+      if mismatched_shapes:
+        raise ValueError(
+            'Got mismatched input and output state shapes: {}'.format(
+                mismatched_shapes))
     else:
       endpoints, states = backbone(inputs)
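The shape check added above treats `None` dimensions (unknown at graph-build time) as wildcards that match anything. A minimal standalone sketch of that comparison, with plain tuples standing in for `tf.TensorShape`:

# Minimal sketch of the state-shape comparison above: None dimensions
# are treated as compatible with any concrete size.
def shapes_mismatch(before_shape, after_shape):
    if len(before_shape) != len(after_shape):
        return True
    return any(
        before is not None and after is not None and before != after
        for before, after in zip(before_shape, after_shape))

print(shapes_mismatch((1, None, 8, 8, 3), (1, 4, 8, 8, 3)))  # False: None matches 4
print(shapes_mismatch((1, 4, 8, 8, 3), (1, 4, 16, 16, 3)))   # True: 8 != 16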
official/vision/beta/projects/yolo/README.md
View file @
2b676a9b
DISCLAIMER: this YOLO implementation is still under development. No support will
be provided during the development phase.
# YOLO Object Detectors, You Only Look Once
[

](https://arxiv.org/abs/1804.02767)
...
...
@@ -74,3 +77,5 @@ head could be connected to a new, more powerful backbone if a person chose to.
[

](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[

](https://www.python.org/downloads/release/python-380/)
official/vision/beta/projects/yolo/configs/backbones.py

@@ -24,11 +24,14 @@ from official.vision.beta.configs import backbones
 @dataclasses.dataclass
-class DarkNet(hyperparams.Config):
-  """DarkNet config."""
-  model_id: str = "darknet53"
+class Darknet(hyperparams.Config):
+  """Darknet config."""
+  model_id: str = 'darknet53'
+  width_scale: float = 1.0
+  depth_scale: float = 1.0
+  dilate: bool = False

 @dataclasses.dataclass
 class Backbone(backbones.Backbone):
-  darknet: DarkNet = DarkNet()
+  darknet: Darknet = Darknet()
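Since `hyperparams.Config` is dataclass-based, the renamed config with its new scaling fields can be sketched standalone, using `dataclasses` directly in place of `hyperparams.Config`:

# Minimal sketch of the renamed Darknet config (plain dataclass stands in
# for hyperparams.Config, which is dataclass-based).
import dataclasses

@dataclasses.dataclass
class Darknet:
    """Darknet config."""
    model_id: str = 'darknet53'
    width_scale: float = 1.0  # new: scales filter counts
    depth_scale: float = 1.0  # new: scales layer repetitions
    dilate: bool = False      # new: keep spatial resolution at deeper levels
                              # (see the dilate branch in darknet_test below)

# Example: derive a half-width backbone config.
cfg = dataclasses.replace(Darknet(), width_scale=0.5)
print(cfg)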
official/vision/beta/projects/yolo/configs/darknet_classification.py

@@ -32,7 +32,7 @@ class ImageClassificationModel(hyperparams.Config):
   num_classes: int = 0
   input_size: List[int] = dataclasses.field(default_factory=list)
   backbone: backbones.Backbone = backbones.Backbone(
-      type='darknet', resnet=backbones.DarkNet())
+      type='darknet', darknet=backbones.Darknet())
   dropout_rate: float = 0.0
   norm_activation: common.NormActivation = common.NormActivation()
   # Adds a BatchNormalization layer pre-GlobalAveragePooling in classification
official/vision/beta/projects/yolo/modeling/backbones/darknet.py

(Diff collapsed on the original page; +422 / -184, not shown here.)
official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py

@@ -13,7 +13,7 @@
 # limitations under the License.

 # Lint as: python3
-"""Tests for resnet."""
+"""Tests for yolo."""

 from absl.testing import parameterized
 import numpy as np

@@ -24,35 +24,48 @@ from tensorflow.python.distribute import strategy_combinations
 from official.vision.beta.projects.yolo.modeling.backbones import darknet

-class DarkNetTest(parameterized.TestCase, tf.test.TestCase):
+class DarknetTest(parameterized.TestCase, tf.test.TestCase):

   @parameterized.parameters(
-      (224, "darknet53", 2, 1),
-      (224, "darknettiny", 1, 2),
-      (224, "cspdarknettiny", 1, 1),
-      (224, "cspdarknet53", 2, 1),
+      (224, 'darknet53', 2, 1, True),
+      (224, 'darknettiny', 1, 2, False),
+      (224, 'cspdarknettiny', 1, 1, False),
+      (224, 'cspdarknet53', 2, 1, True),
   )
-  def test_network_creation(self, input_size, model_id,
-                            endpoint_filter_scale, scale_final):
+  def test_network_creation(self, input_size, model_id,
+                            endpoint_filter_scale, scale_final, dilate):
     """Test creation of ResNet family models."""
-    tf.keras.backend.set_image_data_format("channels_last")
-    network = darknet.Darknet(model_id=model_id, min_level=3, max_level=5)
+    tf.keras.backend.set_image_data_format('channels_last')
+    network = darknet.Darknet(
+        model_id=model_id, min_level=3, max_level=5, dilate=dilate)
     self.assertEqual(network.model_id, model_id)
     inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
     endpoints = network(inputs)

-    self.assertAllEqual([
-        1, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale
-    ], endpoints["3"].shape.as_list())
-    self.assertAllEqual([
-        1, input_size / 2**4, input_size / 2**4, 256 * endpoint_filter_scale
-    ], endpoints["4"].shape.as_list())
-    self.assertAllEqual([
-        1, input_size / 2**5, input_size / 2**5,
-        512 * endpoint_filter_scale * scale_final
-    ], endpoints["5"].shape.as_list())
+    if dilate:
+      self.assertAllEqual([
+          1, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale
+      ], endpoints['3'].shape.as_list())
+      self.assertAllEqual([
+          1, input_size / 2**3, input_size / 2**3, 256 * endpoint_filter_scale
+      ], endpoints['4'].shape.as_list())
+      self.assertAllEqual([
+          1, input_size / 2**3, input_size / 2**3,
+          512 * endpoint_filter_scale * scale_final
+      ], endpoints['5'].shape.as_list())
+    else:
+      self.assertAllEqual([
+          1, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale
+      ], endpoints['3'].shape.as_list())
+      self.assertAllEqual([
+          1, input_size / 2**4, input_size / 2**4, 256 * endpoint_filter_scale
+      ], endpoints['4'].shape.as_list())
+      self.assertAllEqual([
+          1, input_size / 2**5, input_size / 2**5,
+          512 * endpoint_filter_scale * scale_final
+      ], endpoints['5'].shape.as_list())

   @combinations.generate(
       combinations.combine(

@@ -66,20 +79,20 @@ class DarkNetTest(parameterized.TestCase, tf.test.TestCase):
     """Test for sync bn on TPU and GPU devices."""
     inputs = np.random.rand(1, 224, 224, 3)

-    tf.keras.backend.set_image_data_format("channels_last")
+    tf.keras.backend.set_image_data_format('channels_last')

     with strategy.scope():
-      network = darknet.Darknet(model_id="darknet53", min_size=3, max_size=5)
+      network = darknet.Darknet(model_id='darknet53', min_size=3, max_size=5)
       _ = network(inputs)

   @parameterized.parameters(1, 3, 4)
   def test_input_specs(self, input_dim):
     """Test different input feature dimensions."""
-    tf.keras.backend.set_image_data_format("channels_last")
+    tf.keras.backend.set_image_data_format('channels_last')

     input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim])
     network = darknet.Darknet(
-        model_id="darknet53", min_level=3, max_level=5, input_specs=input_specs)
+        model_id='darknet53', min_level=3, max_level=5, input_specs=input_specs)

     inputs = tf.keras.Input(shape=(224, 224, input_dim), batch_size=1)
     _ = network(inputs)

@@ -87,14 +100,14 @@ class DarkNetTest(parameterized.TestCase, tf.test.TestCase):
   def test_serialize_deserialize(self):
     # Create a network object that sets all of its config options.
     kwargs = dict(
-        model_id="darknet53",
+        model_id='darknet53',
         min_level=3,
         max_level=5,
         use_sync_bn=False,
-        activation="relu",
+        activation='relu',
         norm_momentum=0.99,
         norm_epsilon=0.001,
-        kernel_initializer="VarianceScaling",
+        kernel_initializer='VarianceScaling',
         kernel_regularizer=None,
         bias_regularizer=None,
     )

@@ -113,5 +126,5 @@ class DarkNetTest(parameterized.TestCase, tf.test.TestCase):
     self.assertAllEqual(network.get_config(), new_network.get_config())

-if __name__ == "__main__":
+if __name__ == '__main__':
   tf.test.main()
official/vision/beta/projects/yolo/modeling/decoders/__init__.py (new file)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py (new file)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Feature Pyramid Network and Path Aggregation variants used in YOLO."""

import tensorflow as tf

from official.vision.beta.projects.yolo.modeling.layers import nn_blocks


@tf.keras.utils.register_keras_serializable(package='yolo')
class _IdentityRoute(tf.keras.layers.Layer):

  def call(self, inputs):
    return None, inputs


@tf.keras.utils.register_keras_serializable(package='yolo')
class YoloFPN(tf.keras.layers.Layer):
  """YOLO Feature pyramid network."""

  def __init__(self,
               fpn_depth=4,
               use_spatial_attention=False,
               csp_stack=False,
               activation='leaky',
               fpn_filter_scale=1,
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='glorot_uniform',
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
    """Yolo FPN initialization function (Yolo V4).

    Args:
      fpn_depth: `int`, number of layers to use in each FPN path if you choose
        to use an FPN.
      use_spatial_attention: `bool`, use the spatial attention module.
      csp_stack: `bool`, CSPize the FPN.
      activation: `str`, the activation function to use, typically leaky or
        mish.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
      use_sync_bn: if True, use synchronized batch normalization.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing by
        zero.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      **kwargs: keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    self._fpn_depth = fpn_depth

    self._activation = activation
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._use_spatial_attention = use_spatial_attention
    self._filter_scale = fpn_filter_scale
    self._csp_stack = csp_stack

    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
        norm_epsilon=self._norm_epsilon,
        norm_momentum=self._norm_momentum)

  def get_raw_depths(self, minimum_depth, inputs):
    """Calculates the unscaled depths of the FPN branches.

    Args:
      minimum_depth (int): depth of the smallest branch of the FPN.
      inputs (dict): dictionary of the shape of input args as a dictionary of
        lists.

    Returns:
      The unscaled depths of the FPN branches.
    """
    depths = []
    for i in range(self._min_level, self._max_level + 1):
      depths.append(inputs[str(i)][-1] / self._filter_scale)
    return list(reversed(depths))

  def build(self, inputs):
    """Use config dictionary to generate all important attributes for head.

    Args:
      inputs: dictionary of the shape of input args as a dictionary of lists.
    """
    keys = [int(key) for key in inputs.keys()]
    self._min_level = min(keys)
    self._max_level = max(keys)
    self._min_depth = inputs[str(self._min_level)][-1]
    self._depths = self.get_raw_depths(self._min_depth, inputs)

    # directly connect to an input path and process it
    self.preprocessors = dict()
    # resample an input and merge it with the output of another path
    # in order to aggregate backbone outputs
    self.resamples = dict()
    # set of convolution layers and upsample layers that are used to
    # prepare the FPN processors for output

    for level, depth in zip(
        reversed(range(self._min_level, self._max_level + 1)), self._depths):
      if level == self._min_level:
        self.resamples[str(level)] = nn_blocks.PathAggregationBlock(
            filters=depth // 2,
            inverted=True,
            upsample=True,
            drop_final=self._csp_stack == 0,
            upsample_size=2,
            **self._base_config)
        self.preprocessors[str(level)] = _IdentityRoute()
      elif level != self._max_level:
        self.resamples[str(level)] = nn_blocks.PathAggregationBlock(
            filters=depth // 2,
            inverted=True,
            upsample=True,
            drop_final=False,
            upsample_size=2,
            **self._base_config)
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=depth,
            repetitions=self._fpn_depth - int(level == self._min_level),
            block_invert=True,
            insert_spp=False,
            csp_stack=self._csp_stack,
            **self._base_config)
      else:
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=depth,
            repetitions=self._fpn_depth + 1 * int(self._csp_stack == 0),
            insert_spp=True,
            block_invert=False,
            csp_stack=self._csp_stack,
            **self._base_config)

  def call(self, inputs):
    outputs = dict()
    layer_in = inputs[str(self._max_level)]
    for level in reversed(range(self._min_level, self._max_level + 1)):
      _, x = self.preprocessors[str(level)](layer_in)
      outputs[str(level)] = x
      if level > self._min_level:
        x_next = inputs[str(level - 1)]
        _, layer_in = self.resamples[str(level - 1)]([x_next, x])
    return outputs


@tf.keras.utils.register_keras_serializable(package='yolo')
class YoloPAN(tf.keras.layers.Layer):
  """YOLO Path Aggregation Network."""

  def __init__(self,
               path_process_len=6,
               max_level_process_len=None,
               embed_spp=False,
               use_spatial_attention=False,
               csp_stack=False,
               activation='leaky',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='glorot_uniform',
               kernel_regularizer=None,
               bias_regularizer=None,
               fpn_input=True,
               fpn_filter_scale=1.0,
               **kwargs):
    """Yolo Path Aggregation Network initialization function (Yolo V3 and V4).

    Args:
      path_process_len: `int`, number of layers to use in each Decoder path.
      max_level_process_len: `int`, number of layers to use in the largest
        processing path, or the backbone's largest output if it is different.
      embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model.
      use_spatial_attention: `bool`, use the spatial attention module.
      csp_stack: `bool`, CSPize the FPN.
      activation: `str`, the activation function to use, typically leaky or
        mish.
      use_sync_bn: if True, use synchronized batch normalization.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing
        by zero.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      fpn_input: `bool`, whether the input into this function is an FPN or a
        backbone.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
      **kwargs: keyword arguments to be passed.
    """
    super().__init__(**kwargs)

    self._path_process_len = path_process_len
    self._embed_spp = embed_spp
    self._use_spatial_attention = use_spatial_attention

    self._activation = activation
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._fpn_input = fpn_input
    self._max_level_process_len = max_level_process_len
    self._csp_stack = csp_stack
    self._fpn_filter_scale = fpn_filter_scale

    if max_level_process_len is None:
      self._max_level_process_len = path_process_len

    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
        norm_epsilon=self._norm_epsilon,
        norm_momentum=self._norm_momentum)

  def build(self, inputs):
    """Use config dictionary to generate all important attributes for head.

    Args:
      inputs: dictionary of the shape of input args as a dictionary of lists.
    """
    # define the key order
    keys = [int(key) for key in inputs.keys()]
    self._min_level = min(keys)
    self._max_level = max(keys)
    self._min_depth = inputs[str(self._min_level)][-1]
    self._depths = self.get_raw_depths(self._min_depth, inputs)

    # directly connect to an input path and process it
    self.preprocessors = dict()
    # resample an input and merge it with the output of another path
    # in order to aggregate backbone outputs
    self.resamples = dict()

    # FPN will reverse the key process order for the backbone, so we need to
    # adjust the order that objects are created and processed to adjust for
    # this. Not using an FPN will directly connect the decoder to the backbone,
    # therefore the object creation order needs to be done from the largest
    # to smallest level.
    if self._fpn_input:
      # process order {... 3, 4, 5}
      self._iterator = range(self._min_level, self._max_level + 1)
      self._check = lambda x: x < self._max_level
      self._key_shift = lambda x: x + 1
      self._input = self._min_level
      downsample = True
      upsample = False
    else:
      # process order {5, 4, 3, ...}
      self._iterator = list(
          reversed(range(self._min_level, self._max_level + 1)))
      self._check = lambda x: x > self._min_level
      self._key_shift = lambda x: x - 1
      self._input = self._max_level
      downsample = False
      upsample = True

    if self._csp_stack == 0:
      proc_filters = lambda x: x
      resample_filters = lambda x: x // 2
    else:
      proc_filters = lambda x: x * 2
      resample_filters = lambda x: x

    for level, depth in zip(self._iterator, self._depths):
      if level == self._input:
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=proc_filters(depth),
            repetitions=self._max_level_process_len,
            insert_spp=self._embed_spp,
            block_invert=False,
            insert_sam=self._use_spatial_attention,
            csp_stack=self._csp_stack,
            **self._base_config)
      else:
        self.resamples[str(level)] = nn_blocks.PathAggregationBlock(
            filters=resample_filters(depth),
            upsample=upsample,
            downsample=downsample,
            inverted=False,
            drop_final=self._csp_stack == 0,
            **self._base_config)
        self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess(
            filters=proc_filters(depth),
            repetitions=self._path_process_len,
            insert_spp=False,
            insert_sam=self._use_spatial_attention,
            csp_stack=self._csp_stack,
            **self._base_config)

  def get_raw_depths(self, minimum_depth, inputs):
    """Calculates the unscaled depths of the FPN branches.

    Args:
      minimum_depth: `int` depth of the smallest branch of the FPN.
      inputs: `dict[str, tf.InputSpec]` of the shape of input args as a
        dictionary of lists.

    Returns:
      The unscaled depths of the FPN branches.
    """
    depths = []
    if len(inputs.keys()) > 3 or self._fpn_filter_scale > 1:
      for i in range(self._min_level, self._max_level + 1):
        depths.append(inputs[str(i)][-1] * 2)
    else:
      for _ in range(self._min_level, self._max_level + 1):
        depths.append(minimum_depth)
        minimum_depth *= 2
    if self._fpn_input:
      return depths
    return list(reversed(depths))

  def call(self, inputs):
    outputs = dict()
    layer_in = inputs[str(self._input)]

    for level in self._iterator:
      x_route, x = self.preprocessors[str(level)](layer_in)
      outputs[str(level)] = x
      if self._check(level):
        x_next = inputs[str(self._key_shift(level))]
        _, layer_in = self.resamples[str(
            self._key_shift(level))]([x_route, x_next])
    return outputs


@tf.keras.utils.register_keras_serializable(package='yolo')
class YoloDecoder(tf.keras.Model):
  """Darknet Backbone Decoder."""

  def __init__(self,
               input_specs,
               use_fpn=False,
               use_spatial_attention=False,
               csp_stack=False,
               fpn_depth=4,
               fpn_filter_scale=1,
               path_process_len=6,
               max_level_process_len=None,
               embed_spp=False,
               activation='leaky',
               use_sync_bn=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='glorot_uniform',
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
    """Yolo Decoder initialization function.

    A unified model that ties all decoder components into a conditionally built
    YOLO decoder.

    Args:
      input_specs: `dict[str, tf.InputSpec]`: input specs of each of the inputs
        to the heads.
      use_fpn: `bool`, use the FPN found in the YoloV4 model.
      use_spatial_attention: `bool`, use the spatial attention module.
      csp_stack: `bool`, CSPize the FPN.
      fpn_depth: `int`, number of layers to use in each FPN path if you choose
        to use an FPN.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
      path_process_len: `int`, number of layers to use in each Decoder path.
      max_level_process_len: `int`, number of layers to use in the largest
        processing path, or the backbone's largest output if it is different.
      embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model.
      activation: `str`, the activation function to use, typically leaky or
        mish.
      use_sync_bn: if True, use synchronized batch normalization.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing by
        zero.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      **kwargs: keyword arguments to be passed.
    """
    self._input_specs = input_specs
    self._use_fpn = use_fpn
    self._fpn_depth = fpn_depth
    self._path_process_len = path_process_len
    self._max_level_process_len = max_level_process_len
    self._embed_spp = embed_spp

    self._activation = activation
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer

    self._base_config = dict(
        use_spatial_attention=use_spatial_attention,
        csp_stack=csp_stack,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        fpn_filter_scale=fpn_filter_scale,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    self._decoder_config = dict(
        path_process_len=self._path_process_len,
        max_level_process_len=self._max_level_process_len,
        embed_spp=self._embed_spp,
        fpn_input=self._use_fpn,
        **self._base_config)

    inputs = {
        key: tf.keras.layers.Input(shape=value[1:])
        for key, value in input_specs.items()
    }
    if self._use_fpn:
      inter_outs = YoloFPN(
          fpn_depth=self._fpn_depth, **self._base_config)(inputs)
      outputs = YoloPAN(**self._decoder_config)(inter_outs)
    else:
      inter_outs = None
      outputs = YoloPAN(**self._decoder_config)(inputs)

    self._output_specs = {key: value.shape for key, value in outputs.items()}
    super().__init__(inputs=inputs, outputs=outputs, name='YoloDecoder')

  @property
  def use_fpn(self):
    return self._use_fpn

  @property
  def output_specs(self):
    return self._output_specs

  def get_config(self):
    config = dict(
        input_specs=self._input_specs,
        use_fpn=self._use_fpn,
        fpn_depth=self._fpn_depth,
        **self._decoder_config)
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
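A usage sketch of the new decoder, assuming the Model Garden repo is importable (on PYTHONPATH); the `input_specs` shapes below are hypothetical backbone output shapes for a 416x416 input, not values from this commit:

# Hypothetical usage sketch of YoloDecoder (requires the Model Garden
# repo; shapes are illustrative per-level backbone outputs for 416x416).
from official.vision.beta.projects.yolo.modeling.decoders import yolo_decoder

input_specs = {
    '3': [None, 52, 52, 256],
    '4': [None, 26, 26, 512],
    '5': [None, 13, 13, 1024],
}
decoder = yolo_decoder.YoloDecoder(
    input_specs, use_fpn=True, fpn_depth=4, path_process_len=6)
print(decoder.output_specs)  # dict of per-level output shapes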