ModelZoo / ResNet50_tensorflow · Commits

Commit 482823c8, authored Sep 28, 2021 by A. Unique TensorFlower

Merge pull request #10263 from PurdueDualityLab:dataload_pr

PiperOrigin-RevId: 399483092
Parents: 61f8185d 77aa3ea9
Changes: 25. Showing 20 changed files with 1800 additions and 623 deletions (+1800 -623).
official/vision/beta/projects/yolo/configs/backbones.py (+4 -7)
official/vision/beta/projects/yolo/configs/darknet_classification.py (+3 -2)
official/vision/beta/projects/yolo/dataloaders/classification_input.py (+92 -0)
official/vision/beta/projects/yolo/dataloaders/classification_tfds_decoder.py (+0 -34)
official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py (+119 -0)
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input.py (+0 -319)
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input_test.py (+0 -103)
official/vision/beta/projects/yolo/dataloaders/yolo_input.py (+363 -0)
official/vision/beta/projects/yolo/losses/yolo_loss.py (+50 -45)
official/vision/beta/projects/yolo/losses/yolo_loss_test.py (+6 -6)
official/vision/beta/projects/yolo/modeling/backbones/darknet.py (+20 -11)
official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py (+13 -1)
official/vision/beta/projects/yolo/modeling/heads/yolo_head.py (+5 -1)
official/vision/beta/projects/yolo/modeling/layers/detection_generator.py (+13 -23)
official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py (+9 -9)
official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py (+191 -25)
official/vision/beta/projects/yolo/modeling/yolo_model.py (+4 -5)
official/vision/beta/projects/yolo/ops/anchor.py (+481 -0)
official/vision/beta/projects/yolo/ops/loss_utils.py (+21 -32)
official/vision/beta/projects/yolo/ops/mosaic.py (+406 -0)
official/vision/beta/projects/yolo/configs/backbones.py (view file @ 482823c8)

@@ -12,26 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Backbones configurations."""
 import dataclasses
 from official.modeling import hyperparams
 from official.vision.beta.configs import backbones


 @dataclasses.dataclass
 class Darknet(hyperparams.Config):
-  """Darknet config."""
-  model_id: str = 'darknet53'
+  """DarkNet config."""
+  model_id: str = 'cspdarknet53'
   width_scale: float = 1.0
   depth_scale: float = 1.0
   dilate: bool = False
   min_level: int = 3
   max_level: int = 5
+  use_separable_conv: bool = False
+  use_reorg_input: bool = False


 @dataclasses.dataclass
...
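For context, a minimal sketch of how the updated config defaults could be exercised; the import path follows the file above, but the snippet itself is not part of the diff:

from official.vision.beta.projects.yolo.configs import backbones

config = backbones.Darknet()
print(config.model_id)                    # 'cspdarknet53' after this change (was 'darknet53')
print(config.min_level, config.max_level)  # 3 5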
official/vision/beta/projects/yolo/configs/darknet_classification.py (view file @ 482823c8)

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Image classification with darknet configs."""
 import dataclasses
...
@@ -28,14 +27,16 @@ from official.vision.beta.projects.yolo.configs import backbones
 @dataclasses.dataclass
 class ImageClassificationModel(hyperparams.Config):
   """Image classification model config."""
   num_classes: int = 0
-  input_size: List[int] = dataclasses.field(default_factory=list)
+  input_size: List[int] = dataclasses.field(
+      default_factory=lambda: [224, 224])
   backbone: backbones.Backbone = backbones.Backbone(
       type='darknet', darknet=backbones.Darknet())
   dropout_rate: float = 0.0
   norm_activation: common.NormActivation = common.NormActivation()
   # Adds a Batch Normalization layer pre-GlobalAveragePooling in classification.
   add_head_batch_norm: bool = False
+  kernel_initializer: str = 'VarianceScaling'


 @dataclasses.dataclass
...
official/vision/beta/projects/yolo/dataloaders/classification_input.py — new file (0 → 100755), view file @ 482823c8

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
import tensorflow as tf

from official.vision.beta.dataloaders import classification_input
from official.vision.beta.ops import preprocess_ops


class Parser(classification_input.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def _parse_train_image(self, decoded_tensors):
    """Parses image data for training."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image_v2(
          image_bytes, image_shape)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
          lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
          lambda: cropped_image)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image(image)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
          lambda: preprocess_ops.center_crop_image(image),
          lambda: cropped_image)

    if self._aug_rand_hflip:
      image = tf.image.random_flip_left_right(image)

    # Resizes image.
    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)
    image = image / 255.0

    return image

  def _parse_eval_image(self, decoded_tensors):
    """Parses image data for evaluation."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Center crops.
      image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Center crops.
      image = preprocess_ops.center_crop_image(image)

    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)
    image = image / 255.0

    return image
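As a side note, the training path above relies on a crop-with-fallback pattern: when the random crop is a no-op (its output shape equals the input shape), the parser falls back to a center crop. A standalone sketch of the same pattern, with hypothetical crop callables standing in for the preprocess_ops helpers:

import tensorflow as tf

def crop_with_fallback(image, random_crop_fn, center_crop_fn):
  """Randomly crop `image`; fall back to a center crop when the random
  crop returned the image unchanged (same shape as the input)."""
  cropped = random_crop_fn(image)
  return tf.cond(
      tf.reduce_all(tf.equal(tf.shape(cropped), tf.shape(image))),
      lambda: center_crop_fn(image),
      lambda: cropped)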
official/vision/beta/projects/yolo/dataloaders/classification_tfds_decoder.py — deleted (100644 → 0), view file @ 61f8185d

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""TFDS Classification decoder."""
import tensorflow as tf

from official.vision.beta.dataloaders import decoder


class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self):
    return

  def decode(self, serialized_example):
    sample_dict = {
        'image/encoded':
            tf.io.encode_jpeg(serialized_example['image'], quality=100),
        'image/class/label':
            serialized_example['label'],
    }
    return sample_dict
official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py — new file (0 → 100644), view file @ 482823c8

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Tensorflow Example proto decoder for object detection.

A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import tensorflow as tf

from official.vision.beta.dataloaders import tf_example_decoder


def _coco91_to_80(classif, box, areas, iscrowds):
  """Function used to reduce COCO 91 to COCO 80 (2017 to 2014 format)."""
  # Vector where index i correlates to the class at index[i].
  class_ids = [
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
      22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
      43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
      62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
      85, 86, 87, 88, 89, 90
  ]
  new_classes = tf.expand_dims(tf.convert_to_tensor(class_ids), axis=0)

  # Reshape the classes in order to build a class mask.
  classes = tf.expand_dims(classif, axis=-1)

  # One hot the classifications to match the 80 class format.
  ind = classes == tf.cast(new_classes, classes.dtype)

  # Select the max values.
  selected_class = tf.reshape(
      tf.math.argmax(tf.cast(ind, tf.float32), axis=-1), [-1])
  ind = tf.where(tf.reduce_any(ind, axis=-1))

  # Gather the valuable instances.
  classif = tf.gather_nd(selected_class, ind)
  box = tf.gather_nd(box, ind)
  areas = tf.gather_nd(areas, ind)
  iscrowds = tf.gather_nd(iscrowds, ind)

  # Restate the number of viable detections; ideally it should be the same.
  num_detections = tf.shape(classif)[0]
  return classif, box, areas, iscrowds, num_detections


class TfExampleDecoder(tf_example_decoder.TfExampleDecoder):
  """Tensorflow Example proto decoder."""

  def __init__(self,
               coco91_to_80=None,
               include_mask=False,
               regenerate_source_id=False,
               mask_binarize_threshold=None):
    """Initialize the example decoder.

    Args:
      coco91_to_80: `bool` indicating whether to convert coco from its 91 class
        format to the 80 class format.
      include_mask: `bool` indicating if the decoder should also decode
        instance masks for instance segmentation.
      regenerate_source_id: `bool` indicating if the source id needs to be
        recreated for each image sample.
      mask_binarize_threshold: `float` for binarizing mask values.
    """
    if coco91_to_80 and include_mask:
      raise ValueError('If masks are included you cannot convert coco from the'
                       '91 class format to the 80 class format.')

    self._coco91_to_80 = coco91_to_80
    super().__init__(
        include_mask=include_mask,
        regenerate_source_id=regenerate_source_id,
        mask_binarize_threshold=mask_binarize_threshold)

  def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
      serialized_example: a single serialized tf.Example string.

    Returns:
      decoded_tensors: a dictionary of tensors with the following fields:
        - source_id: a string scalar tensor.
        - image: a uint8 tensor of shape [None, None, 3].
        - height: an integer scalar tensor.
        - width: an integer scalar tensor.
        - groundtruth_classes: a int64 tensor of shape [None].
        - groundtruth_is_crowd: a bool tensor of shape [None].
        - groundtruth_area: a float32 tensor of shape [None].
        - groundtruth_boxes: a float32 tensor of shape [None, 4].
        - groundtruth_instance_masks: a float32 tensor of shape
            [None, None, None].
        - groundtruth_instance_masks_png: a string tensor of shape [None].
    """
    decoded_tensors = super().decode(serialized_example)
    if self._coco91_to_80:
      (decoded_tensors['groundtruth_classes'],
       decoded_tensors['groundtruth_boxes'],
       decoded_tensors['groundtruth_area'],
       decoded_tensors['groundtruth_is_crowd'], _) = _coco91_to_80(
           decoded_tensors['groundtruth_classes'],
           decoded_tensors['groundtruth_boxes'],
           decoded_tensors['groundtruth_area'],
           decoded_tensors['groundtruth_is_crowd'])
    return decoded_tensors
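The remapping trick in _coco91_to_80 can be seen in isolation with toy data; the four-entry id table below is illustrative only (the real table has 80 entries):

import tensorflow as tf

class_ids = tf.constant([1, 2, 3, 13])           # toy retained-id table
labels = tf.constant([[2], [12], [13]])          # toy labels, shape [n, 1]
mask = labels == tf.cast(class_ids[tf.newaxis, :], labels.dtype)
new_ids = tf.argmax(tf.cast(mask, tf.float32), axis=-1)  # position in table
keep = tf.where(tf.reduce_any(mask, axis=-1))    # rows with a match: 0 and 2
remapped = tf.gather_nd(new_ids, keep)           # [1, 3]; label 12 is dropped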
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input.py — deleted (100644 → 0), view file @ 61f8185d

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Detection Data parser and processing for YOLO.

Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops
from official.vision.beta.projects.yolo.ops import box_ops as yolo_box_ops
from official.vision.beta.projects.yolo.ops import preprocess_ops as yolo_preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               num_classes,
               fixed_size=True,
               jitter_im=0.1,
               jitter_boxes=0.005,
               use_tie_breaker=True,
               min_level=3,
               max_level=5,
               masks=None,
               max_process_size=608,
               min_process_size=320,
               max_num_instances=200,
               random_flip=True,
               aug_rand_saturation=True,
               aug_rand_brightness=True,
               aug_rand_zoom=True,
               aug_rand_hue=True,
               anchors=None,
               seed=10,
               dtype=tf.float32):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: a `Tuple` for (width, height) of input image.
      num_classes: a `Tensor` or `int` for the number of classes.
      fixed_size: a `bool`; if True all output images have the same size.
      jitter_im: a `float` representing a pixel value that is the maximum
        jitter applied to the image for data augmentation during training.
      jitter_boxes: a `float` representing a pixel value that is the maximum
        jitter applied to the bounding box for data augmentation during
        training.
      use_tie_breaker: boolean value for whether or not to use the tie_breaker.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      masks: a `Tensor`, `List` or `numpy.ndarray` for anchor masks.
      max_process_size: an `int` for maximum image width and height.
      min_process_size: an `int` for minimum image width and height.
      max_num_instances: an `int` for the maximum number of instances in an
        image.
      random_flip: a `bool`; if True, augment training with random horizontal
        flip.
      aug_rand_saturation: `bool`; if True, augment training with random
        saturation.
      aug_rand_brightness: `bool`; if True, augment training with random
        brightness.
      aug_rand_zoom: `bool`; if True, augment training with random zoom.
      aug_rand_hue: `bool`; if True, augment training with random hue.
      anchors: a `Tensor`, `List` or `numpy.ndarray` for bounding box priors.
      seed: an `int` for the seed used by tf.random.
      dtype: a `tf.dtypes.DType` object that represents the dtype the outputs
        will be casted to. The available types are tf.float32, tf.float16, or
        tf.bfloat16.
    """
    self._net_down_scale = 2**max_level
    self._num_classes = num_classes
    self._image_w = (output_size[0] // self._net_down_scale
                    ) * self._net_down_scale
    self._image_h = (output_size[1] // self._net_down_scale
                    ) * self._net_down_scale
    self._max_process_size = max_process_size
    self._min_process_size = min_process_size
    self._fixed_size = fixed_size
    self._anchors = anchors
    self._masks = {
        key: tf.convert_to_tensor(value) for key, value in masks.items()
    }
    self._use_tie_breaker = use_tie_breaker
    self._jitter_im = 0.0 if jitter_im is None else jitter_im
    self._jitter_boxes = 0.0 if jitter_boxes is None else jitter_boxes
    self._max_num_instances = max_num_instances
    self._random_flip = random_flip
    self._aug_rand_saturation = aug_rand_saturation
    self._aug_rand_brightness = aug_rand_brightness
    self._aug_rand_zoom = aug_rand_zoom
    self._aug_rand_hue = aug_rand_hue
    self._seed = seed
    self._dtype = dtype

  def _build_grid(self, raw_true, width, batch=False, use_tie_breaker=False):
    mask = self._masks
    for key in self._masks.keys():
      if not batch:
        mask[key] = yolo_preprocess_ops.build_grided_gt(
            raw_true, self._masks[key], width // 2**int(key),
            raw_true['bbox'].dtype, use_tie_breaker)
      else:
        mask[key] = yolo_preprocess_ops.build_batch_grided_gt(
            raw_true, self._masks[key], width // 2**int(key),
            raw_true['bbox'].dtype, use_tie_breaker)
    return mask

  def _parse_train_data(self, data):
    """Generates images and labels that are usable for model training.

    Args:
      data: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    shape = tf.shape(data['image'])
    image = data['image'] / 255
    boxes = data['groundtruth_boxes']
    width = shape[0]
    height = shape[1]

    image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
        image, boxes, width=width, height=height,
        target_dim=self._max_process_size)
    image_shape = tf.shape(image)[:2]

    if self._random_flip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(
          image, boxes, seed=self._seed)

    randscale = self._image_w // self._net_down_scale
    if not self._fixed_size:
      do_scale = tf.greater(
          tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 0.5)
      if do_scale:
        # This scales the image to a random multiple of net_down_scale
        # between 320 and 608.
        randscale = tf.random.uniform(
            [],
            minval=self._min_process_size // self._net_down_scale,
            maxval=self._max_process_size // self._net_down_scale,
            seed=self._seed,
            dtype=tf.int32) * self._net_down_scale

    if self._jitter_boxes != 0.0:
      boxes = box_ops.denormalize_boxes(boxes, image_shape)
      boxes = box_ops.jitter_boxes(boxes, 0.025)
      boxes = box_ops.normalize_boxes(boxes, image_shape)

    # The YOLO loss function uses x-center, y-center format.
    boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

    if self._jitter_im != 0.0:
      image, boxes = yolo_preprocess_ops.random_translate(
          image, boxes, self._jitter_im, seed=self._seed)

    if self._aug_rand_zoom:
      image, boxes = yolo_preprocess_ops.resize_crop_filter(
          image, boxes,
          default_width=self._image_w,
          default_height=self._image_h,
          target_width=randscale,
          target_height=randscale)
    image = tf.image.resize(image, (416, 416), preserve_aspect_ratio=False)

    if self._aug_rand_brightness:
      image = tf.image.random_brightness(image=image, max_delta=.1)  # Brightness
    if self._aug_rand_saturation:
      image = tf.image.random_saturation(
          image=image, lower=0.75, upper=1.25)  # Saturation
    if self._aug_rand_hue:
      image = tf.image.random_hue(image=image, max_delta=.3)  # Hue
    image = tf.clip_by_value(image, 0.0, 1.0)

    # Find the best anchor for the ground truth labels to maximize the iou.
    best_anchors = yolo_preprocess_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)

    # Padding.
    boxes = preprocess_ops.clip_or_pad_to_fixed_size(
        boxes, self._max_num_instances, 0)
    classes = preprocess_ops.clip_or_pad_to_fixed_size(
        data['groundtruth_classes'], self._max_num_instances, -1)
    best_anchors = preprocess_ops.clip_or_pad_to_fixed_size(
        best_anchors, self._max_num_instances, 0)
    area = preprocess_ops.clip_or_pad_to_fixed_size(
        data['groundtruth_area'], self._max_num_instances, 0)
    is_crowd = preprocess_ops.clip_or_pad_to_fixed_size(
        tf.cast(data['groundtruth_is_crowd'], tf.int32),
        self._max_num_instances, 0)

    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'area': tf.cast(area, self._dtype),
        'is_crowd': is_crowd,
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
    }

    if self._fixed_size:
      grid = self._build_grid(
          labels, self._image_w, use_tie_breaker=self._use_tie_breaker)
      labels.update({'grid_form': grid})
    return image, labels

  def _parse_eval_data(self, data):
    """Generates images and labels that are usable for model evaluation.

    Args:
      data: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    shape = tf.shape(data['image'])
    image = data['image'] / 255
    boxes = data['groundtruth_boxes']
    width = shape[0]
    height = shape[1]

    image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
        image, boxes, width=width, height=height, target_dim=self._image_w)
    boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

    # Find the best anchor for the ground truth labels to maximize the iou.
    best_anchors = yolo_preprocess_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)
    boxes = yolo_preprocess_ops.pad_max_instances(
        boxes, self._max_num_instances, 0)
    classes = yolo_preprocess_ops.pad_max_instances(
        data['groundtruth_classes'], self._max_num_instances, 0)
    best_anchors = yolo_preprocess_ops.pad_max_instances(
        best_anchors, self._max_num_instances, 0)
    area = yolo_preprocess_ops.pad_max_instances(
        data['groundtruth_area'], self._max_num_instances, 0)
    is_crowd = yolo_preprocess_ops.pad_max_instances(
        tf.cast(data['groundtruth_is_crowd'], tf.int32),
        self._max_num_instances, 0)

    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'area': tf.cast(area, self._dtype),
        'is_crowd': is_crowd,
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
    }
    grid = self._build_grid(
        labels, self._image_w, batch=False,
        use_tie_breaker=self._use_tie_breaker)
    labels.update({'grid_form': grid})
    return image, labels

  def _postprocess_fn(self, image, label):
    randscale = self._image_w // self._net_down_scale
    if not self._fixed_size:
      do_scale = tf.greater(
          tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 0.5)
      if do_scale:
        # This scales the image to a random multiple of net_down_scale
        # between 320 and 608.
        randscale = tf.random.uniform(
            [],
            minval=self._min_process_size // self._net_down_scale,
            maxval=self._max_process_size // self._net_down_scale,
            seed=self._seed,
            dtype=tf.int32) * self._net_down_scale
    width = randscale
    image = tf.image.resize(image, (width, width))
    grid = self._build_grid(
        label, width, batch=True, use_tie_breaker=self._use_tie_breaker)
    label.update({'grid_form': grid})
    return image, label

  def postprocess_fn(self, is_training=True):
    return self._postprocess_fn if not self._fixed_size and is_training else None
official/vision/beta/projects/yolo/dataloaders/yolo_detection_input_test.py — deleted (100644 → 0), view file @ 61f8185d

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Test case for YOLO detection dataloader configuration definition."""
from absl.testing import parameterized
import dataclasses
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import input_reader
from official.modeling import hyperparams
from official.vision.beta.dataloaders import tfds_detection_decoders
from official.vision.beta.projects.yolo.dataloaders import yolo_detection_input


@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Dummy configuration for parser."""
  output_size: int = (416, 416)
  num_classes: int = 80
  fixed_size: bool = True
  jitter_im: float = 0.1
  jitter_boxes: float = 0.005
  min_process_size: int = 320
  max_process_size: int = 608
  max_num_instances: int = 200
  random_flip: bool = True
  seed: int = 10
  shuffle_buffer_size: int = 10000


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  input_path: str = ''
  tfds_name: str = 'coco/2017'
  tfds_split: str = 'train'
  global_batch_size: int = 10
  is_training: bool = True
  dtype: str = 'float16'
  decoder = None
  parser: Parser = Parser()
  shuffle_buffer_size: int = 10


class YoloDetectionInputTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(('training', True), ('testing', False))
  def test_yolo_input(self, is_training):
    params = DataConfig(is_training=is_training)
    decoder = tfds_detection_decoders.MSCOCODecoder()
    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
               [348.0, 340.0]]
    masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}

    parser = yolo_detection_input.Parser(
        output_size=params.parser.output_size,
        num_classes=params.parser.num_classes,
        fixed_size=params.parser.fixed_size,
        jitter_im=params.parser.jitter_im,
        jitter_boxes=params.parser.jitter_boxes,
        min_process_size=params.parser.min_process_size,
        max_process_size=params.parser.max_process_size,
        max_num_instances=params.parser.max_num_instances,
        random_flip=params.parser.random_flip,
        seed=params.parser.seed,
        anchors=anchors,
        masks=masks)
    postprocess_fn = parser.postprocess_fn(is_training=is_training)

    reader = input_reader.InputReader(
        params,
        dataset_fn=tf.data.TFRecordDataset,
        decoder_fn=decoder.decode,
        parser_fn=parser.parse_fn(params.is_training))
    dataset = reader.read(input_context=None).batch(10).take(1)

    if postprocess_fn:
      image, _ = postprocess_fn(
          *tf.data.experimental.get_single_element(dataset))
    else:
      image, _ = tf.data.experimental.get_single_element(dataset)
    print(image.shape)
    self.assertAllEqual(image.shape, (10, 10, 416, 416, 3))
    self.assertTrue(
        tf.reduce_all(tf.math.logical_and(image >= 0, image <= 1)))


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/yolo/dataloaders/yolo_input.py — new file (0 → 100755), view file @ 482823c8

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (standard header).
"""Detection Data parser and processing for YOLO."""
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import box_ops as bbox_ops
from official.vision.beta.ops import preprocess_ops
from official.vision.beta.projects.yolo.ops import anchor
from official.vision.beta.projects.yolo.ops import preprocessing_ops


class Parser(parser.Parser):
  """Parse the dataset in to the YOLO model format."""

  def __init__(self,
               output_size,
               anchors,
               expanded_strides,
               level_limits=None,
               max_num_instances=200,
               area_thresh=0.1,
               aug_rand_hue=1.0,
               aug_rand_saturation=1.0,
               aug_rand_brightness=1.0,
               letter_box=False,
               random_pad=True,
               random_flip=True,
               jitter=0.0,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               aug_rand_translate=0.0,
               aug_rand_perspective=0.0,
               aug_rand_angle=0.0,
               anchor_t=4.0,
               scale_xy=None,
               best_match_only=False,
               darknet=False,
               use_tie_breaker=True,
               dtype='float32',
               seed=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `List` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      anchors: `Dict[List[Union[int, float]]]` of anchor boxes to be used in
        each level.
      expanded_strides: `Dict[int]` for how much the model scales down the
        images at the largest level. For example, if level 3 down-samples the
        image by a factor of 16, we pass along {3: 16} in the expanded strides
        dictionary, indicating that relative to the original image the shapes
        must be reduced by a factor of 16 to compute the loss.
      level_limits: `List` of the box sizes that will be allowed at each FPN
        level, as is done in the FCOS and YOLOX papers for anchor free box
        assignment.
      max_num_instances: `int` for the number of boxes to compute loss on.
      area_thresh: `float` for the minimum area of a box to allow to pass
        through for optimization.
      aug_rand_hue: `float` indicating the maximum scaling value for hue.
        hue will be scaled between 1 - value and 1 + value.
      aug_rand_saturation: `float` indicating the maximum scaling value for
        saturation. saturation will be scaled between 1/value and value.
      aug_rand_brightness: `float` indicating the maximum scaling value for
        brightness. brightness will be scaled between 1/value and value.
      letter_box: `boolean` indicating whether, at the start of the data
        pipeline and regardless of which preprocessing ops are used, the
        aspect ratio of the images should be preserved.
      random_pad: `bool` indicating whether to use padding to apply random
        translation; true for darknet yolo, false for scaled yolo.
      random_flip: `boolean` indicating whether or not to randomly flip the
        image horizontally.
      jitter: `float` for the maximum change in aspect ratio expected in each
        preprocessing step.
      aug_scale_min: `float` indicating the minimum scaling value for image
        scale jitter.
      aug_scale_max: `float` indicating the maximum scaling value for image
        scale jitter.
      aug_rand_translate: `float` ranging from 0 to 1 indicating the maximum
        amount to randomly translate an image.
      aug_rand_perspective: `float` ranging from 0.000 to 0.001 indicating how
        much to perspective warp the image.
      aug_rand_angle: `float` indicating the maximum angle value for angle.
        angle will be changed between 0 and value.
      anchor_t: `float` indicating the threshold over which an anchor will be
        considered for prediction; at zero all the anchors will be used, and
        at 1.0 only the best will be used. For anchor thresholds larger than
        1.0 we stop using the IOU for anchor comparison and resort directly to
        comparing the width and height; this is used for the scaled models.
      scale_xy: dictionary of `float` values indicating how far each pixel can
        see outside of its containment of 1.0. A value of 1.2 indicates there
        is a 20% extended radius around each pixel within which this specific
        pixel can predict a center. The center can range from 0 - value/2 to
        1 + value/2; this value is set in the yolo filter and reused here.
        There should be one value of scale_xy for each level from min_level to
        max_level.
      best_match_only: `boolean` indicating how boxes are selected for
        optimization.
      darknet: `boolean` indicating which data pipeline to use. Setting to
        True swaps the pipeline to output images relative to Yolov4 and older.
      use_tie_breaker: `boolean` indicating whether to use the anchor
        threshold value.
      dtype: `str` indicating the output datatype of the data pipeline,
        selected from {"float32", "float16", "bfloat16"}.
      seed: `int` seed for random number generation.
    """
    for key in anchors:
      # Assert that the width and height are viable.
      assert output_size[1] % expanded_strides[str(key)] == 0
      assert output_size[0] % expanded_strides[str(key)] == 0

    # Set the width and height properly and base init:
    self._image_w = output_size[1]
    self._image_h = output_size[0]
    self._max_num_instances = max_num_instances

    # Image scaling params.
    self._jitter = 0.0 if jitter is None else jitter
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    self._aug_rand_translate = aug_rand_translate
    self._aug_rand_perspective = aug_rand_perspective

    # Image spatial distortion.
    self._random_flip = random_flip
    self._letter_box = letter_box
    self._random_pad = random_pad
    self._aug_rand_angle = aug_rand_angle

    # Color space distortion of the image.
    self._aug_rand_saturation = aug_rand_saturation
    self._aug_rand_brightness = aug_rand_brightness
    self._aug_rand_hue = aug_rand_hue

    # Set the per-level values needed for operation.
    self._darknet = darknet
    self._area_thresh = area_thresh
    self._seed = seed
    self._dtype = dtype

    self._label_builder = anchor.YoloAnchorLabeler(
        anchors=anchors,
        anchor_free_level_limits=level_limits,
        level_strides=expanded_strides,
        center_radius=scale_xy,
        max_num_instances=max_num_instances,
        match_threshold=anchor_t,
        best_matches_only=best_match_only,
        use_tie_breaker=use_tie_breaker,
        darknet=darknet,
        dtype=dtype)

  def _pad_infos_object(self, image):
    """Get a Tensor to pad the info object list."""
    shape_ = tf.shape(image)
    val = tf.stack([
        tf.cast(shape_[:2], tf.float32),
        tf.cast(shape_[:2], tf.float32),
        tf.ones_like(tf.cast(shape_[:2], tf.float32)),
        tf.zeros_like(tf.cast(shape_[:2], tf.float32)),
    ])
    return val

  def _jitter_scale(self, image, shape, letter_box, jitter, random_pad,
                    aug_scale_min, aug_scale_max, translate, angle,
                    perspective):
    """Distort and scale each input image."""
    infos = []
    if (aug_scale_min != 1.0 or aug_scale_max != 1.0):
      crop_only = True
      # jitter gives you only one info object, resize and crop gives you one;
      # if crop only then there can be 1 from jitter and 1 from crop.
      infos.append(self._pad_infos_object(image))
    else:
      crop_only = False
    image, crop_info, _ = preprocessing_ops.resize_and_jitter_image(
        image,
        shape,
        letter_box=letter_box,
        jitter=jitter,
        crop_only=crop_only,
        random_pad=random_pad,
        seed=self._seed,
    )
    infos.extend(crop_info)
    image, _, affine = preprocessing_ops.affine_warp_image(
        image,
        shape,
        scale_min=aug_scale_min,
        scale_max=aug_scale_max,
        translate=translate,
        degrees=angle,
        perspective=perspective,
        random_pad=random_pad,
        seed=self._seed,
    )
    return image, infos, affine

  def _parse_train_data(self, data):
    """Parses data for training."""
    # Initialize the shape constants.
    image = data['image']
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']

    if self._random_flip:
      # Randomly flip the image horizontally.
      image, boxes, _ = preprocess_ops.random_horizontal_flip(
          image, boxes, seed=self._seed)

    if not data['is_mosaic']:
      image, infos, affine = self._jitter_scale(
          image, [self._image_h, self._image_w], self._letter_box,
          self._jitter, self._random_pad, self._aug_scale_min,
          self._aug_scale_max, self._aug_rand_translate, self._aug_rand_angle,
          self._aug_rand_perspective)

      # Clip and clean boxes.
      boxes, inds = preprocessing_ops.transform_and_clip_boxes(
          boxes,
          infos,
          affine=affine,
          shuffle_boxes=False,
          area_thresh=self._area_thresh,
          augment=True,
          seed=self._seed)
      classes = tf.gather(classes, inds)
      info = infos[-1]
    else:
      image = tf.image.resize(
          image, (self._image_h, self._image_w), method='nearest')
      output_size = tf.cast([640, 640], tf.float32)
      boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
      inds = bbox_ops.get_non_empty_box_indices(boxes_)
      boxes = tf.gather(boxes, inds)
      classes = tf.gather(classes, inds)
      info = self._pad_infos_object(image)

    # Apply scaling to the hue saturation and brightness of an image.
    image = tf.cast(image, dtype=self._dtype)
    image = image / 255.0
    image = preprocessing_ops.image_rand_hsv(
        image,
        self._aug_rand_hue,
        self._aug_rand_saturation,
        self._aug_rand_brightness,
        seed=self._seed,
        darknet=self._darknet)

    # Cast the image to the selected datatype.
    image, labels = self._build_label(
        image, boxes, classes, info, inds, data, is_training=True)
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    # Get the image shape constants and cast the image to the selected datatype.
    image = tf.cast(data['image'], dtype=self._dtype)
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']

    image, infos, _ = preprocessing_ops.resize_and_jitter_image(
        image, [self._image_h, self._image_w],
        letter_box=self._letter_box,
        random_pad=False,
        shiftx=0.5,
        shifty=0.5,
        jitter=0.0)

    # Clip and clean boxes.
    image = image / 255.0
    boxes, inds = preprocessing_ops.transform_and_clip_boxes(
        boxes, infos, shuffle_boxes=False, area_thresh=0.0, augment=True)
    classes = tf.gather(classes, inds)
    info = infos[-1]

    image, labels = self._build_label(
        image, boxes, classes, info, inds, data, is_training=False)
    return image, labels

  def set_shape(self, values, pad_axis=0, pad_value=0, inds=None):
    """Calls set shape for all input objects."""
    if inds is not None:
      values = tf.gather(values, inds)
    vshape = values.get_shape().as_list()

    values = preprocessing_ops.pad_max_instances(
        values, self._max_num_instances, pad_axis=pad_axis,
        pad_value=pad_value)

    vshape[pad_axis] = self._max_num_instances
    values.set_shape(vshape)
    return values

  def _build_label(self, image, gt_boxes, gt_classes, info, inds, data,
                   is_training=True):
    """Label construction for both the train and eval data."""
    width = self._image_w
    height = self._image_h

    # Set the image shape.
    imshape = image.get_shape().as_list()
    imshape[-1] = 3
    image.set_shape(imshape)

    labels = dict()
    (labels['inds'], labels['upds'],
     labels['true_conf']) = self._label_builder(gt_boxes, gt_classes, width,
                                                height)

    # Set/fix the boxes shape.
    boxes = self.set_shape(gt_boxes, pad_axis=0, pad_value=0)
    classes = self.set_shape(gt_classes, pad_axis=0, pad_value=-1)

    # Build the dictionary set.
    labels.update({
        'source_id': utils.process_source_id(data['source_id']),
        'bbox': tf.cast(boxes, dtype=self._dtype),
        'classes': tf.cast(classes, dtype=self._dtype),
    })

    # Update the labels dictionary.
    if not is_training:
      # Sets up groundtruth data for evaluation.
      groundtruths = {
          'source_id': labels['source_id'],
          'height': height,
          'width': width,
          'num_detections': tf.shape(gt_boxes)[0],
          'image_info': info,
          'boxes': gt_boxes,
          'classes': gt_classes,
          'areas': tf.gather(data['groundtruth_area'], inds),
          'is_crowds': tf.cast(
              tf.gather(data['groundtruth_is_crowd'], inds), tf.int32),
      }
      groundtruths['source_id'] = utils.process_source_id(
          groundtruths['source_id'])
      groundtruths = utils.pad_groundtruths_to_fixed_size(
          groundtruths, self._max_num_instances)
      labels['groundtruths'] = groundtruths
    return image, labels
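A hedged sketch of constructing the new Parser: the anchor boxes are the per-level ones used in yolo_loss_test.py below, while the output size and per-level stride values are illustrative assumptions, not taken from this diff:

parser = Parser(
    output_size=[640, 640],  # must be divisible by each level's stride
    anchors={'3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
             '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
             '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]},
    expanded_strides={'3': 8, '4': 16, '5': 32},  # assumed strides per level
    max_num_instances=200)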
official/vision/beta/projects/yolo/losses/yolo_loss.py (view file @ 482823c8)

@@ -33,7 +33,6 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):

   def __init__(self,
                classes,
-               mask,
                anchors,
                path_stride=1,
                ignore_thresh=0.7,

@@ -52,8 +51,6 @@
     Args:
       classes: `int` for the number of classes
-      mask: `List[int]` for the output level that this specific model output
-        level
       anchors: `List[List[int]]` for the anchor boxes that are used in the model
         at all levels. For anchor free prediction set the anchor list to be the
         same as the image resolution.

@@ -85,11 +82,10 @@
       max_delta: gradient clipping to apply to the box loss.
     """
     self._loss_type = loss_type
-    self._classes = tf.constant(tf.cast(classes, dtype=tf.int32))
-    self._num = tf.cast(len(mask), dtype=tf.int32)
+    self._classes = classes
+    self._num = tf.cast(len(anchors), dtype=tf.int32)
     self._truth_thresh = truth_thresh
     self._ignore_thresh = ignore_thresh
-    self._masks = mask
     self._anchors = anchors
     self._iou_normalizer = iou_normalizer

@@ -111,8 +107,8 @@
         max_delta=self._max_delta)
     self._decode_boxes = functools.partial(
         loss_utils.get_predicted_box, **box_kwargs)
-    self._search_pairs = lambda pred_boxes, pred_classes, boxes, classes, scale, yxyx: (None, None, None, None)  # pylint:disable=line-too-long
-    self._build_per_path_attributes()
+    self._search_pairs = lambda *args: (None, None, None, None)
+    self._build_per_path_attributes()

   def box_loss(self, true_box, pred_box, darknet=False):

@@ -136,10 +132,15 @@
                               scale=None):
     """Search of all groundtruths to associate groundtruths to predictions."""
     boxes = box_ops.yxyx_to_xcycwh(boxes)
+    if scale is not None:
+      boxes = boxes * tf.cast(tf.stop_gradient(scale), boxes.dtype)
+
+    # Search all predictions against ground truths to find matching boxes for
+    # each pixel.
-    _, _, iou_max, _ = self._search_pairs(
-        pred_boxes, pred_classes, boxes, classes, scale=scale, yxyx=True)
+    _, _, iou_max, _ = self._search_pairs(pred_boxes, pred_classes, boxes,
+                                          classes)
     if iou_max is None:
       return true_conf, tf.ones_like(true_conf)

@@ -199,9 +200,6 @@
         grid_mask) = self._compute_loss(true_counts, inds, y_true, boxes,
                                         classes, y_pred)

-    # Temporary metrics
-    box_loss = tf.stop_gradient(0.05 * box_loss / self._iou_normalizer)

     # Metric compute is done here to save time and resources.
     sigmoid_conf = tf.stop_gradient(tf.sigmoid(pred_conf))
     iou = tf.stop_gradient(iou)

@@ -222,21 +220,28 @@
     """The actual logic to apply to the raw model for optimization."""
     ...

-  def post_path_aggregation(self, loss, ground_truths, predictions):  # pylint:disable=unused-argument
+  def post_path_aggregation(self, loss, box_loss, conf_loss, class_loss,
+                            ground_truths, predictions):  # pylint:disable=unused-argument
     """This method allows for post processing of a loss value.

     After the loss has been aggregated across all the FPN levels some post
     processing may need to occur to properly scale the loss. The default
-    behavior is to pass the loss through with no alterations.
+    behavior is to pass the loss through with no alterations. Passing the
+    individual losses for each mask will allow for aggregation of loss across
+    paths for some losses.

     Args:
       loss: `tf.float` scalar for the actual loss.
+      box_loss: `tf.float` for the loss on the boxes only.
+      conf_loss: `tf.float` for the loss on the confidences only.
+      class_loss: `tf.float` for the loss on the classes only.
       ground_truths: `Dict` holding all the ground truth tensors.
       predictions: `Dict` holding all the predicted values.

     Returns:
       loss: `tf.float` scalar for the scaled loss.
     """
+    del box_loss, conf_loss, class_loss, ground_truths, predictions
     return loss

   @abc.abstractmethod

@@ -280,7 +285,6 @@ class DarknetLoss(YoloLossBase):
       association.
     """
     self._anchor_generator = loss_utils.GridGenerator(
-        masks=self._masks,
         anchors=self._anchors,
         scale_anchors=self._path_stride)

@@ -314,8 +318,7 @@
     anchor_grid = tf.stop_gradient(anchor_grid)

     # Split all the ground truths to use as separate items in loss computation.
-    (true_box, ind_mask, true_class, _, _) = tf.split(
-        y_true, [4, 1, 1, 1, 1], axis=-1)
+    (true_box, ind_mask, true_class) = tf.split(y_true, [4, 1, 1], axis=-1)
     true_conf = tf.squeeze(true_conf, axis=-1)
     true_class = tf.squeeze(true_class, axis=-1)
     grid_mask = true_conf

@@ -432,13 +435,14 @@ class ScaledLoss(YoloLossBase):
       association.
     """
     self._anchor_generator = loss_utils.GridGenerator(
-        masks=self._masks,
         anchors=self._anchors,
         scale_anchors=self._path_stride)

     if self._ignore_thresh > 0.0:
       self._search_pairs = loss_utils.PairWiseSearch(
           iou_type=self._loss_type, any_match=False, min_conf=0.25)
+    self._cls_normalizer = self._cls_normalizer * self._classes / 80
     return

@@ -457,8 +461,7 @@
         width, height, batch_size, dtype=tf.float32)

     # Split the y_true list.
-    (true_box, ind_mask, true_class, _, _) = tf.split(
-        y_true, [4, 1, 1, 1, 1], axis=-1)
+    (true_box, ind_mask, true_class) = tf.split(y_true, [4, 1, 1], axis=-1)
     grid_mask = true_conf = tf.squeeze(true_conf, axis=-1)
     true_class = tf.squeeze(true_class, axis=-1)
     num_objs = tf.cast(tf.reduce_sum(ind_mask), dtype=y_pred.dtype)

@@ -469,7 +472,7 @@
     pred_box, pred_conf, pred_class = tf.split(y_pred, [4, 1, -1], axis=-1)

     # Decode the boxes for loss compute.
-    scale, pred_box, _ = self._decode_boxes(
+    scale, pred_box, pbg = self._decode_boxes(
         fwidth, fheight, pred_box, anchor_grid, grid_points, darknet=False)

     # If the ignore threshold is enabled, search all boxes ignore all

@@ -477,20 +480,24 @@
     # noted ground truth list.
     if self._ignore_thresh != 0.0:
       (_, obj_mask) = self._tiled_global_box_search(
-          pred_box,
+          pbg,
           tf.stop_gradient(tf.sigmoid(pred_class)),
           boxes,
           classes,
           true_conf,
           smoothed=False,
-          scale=scale)
+          scale=None)

     # Scale and shift and select the ground truth boxes
     # and predictions to the prediction domain.
-    offset = tf.cast(
-        tf.gather_nd(grid_points, inds, batch_dims=1), true_box.dtype)
-    offset = tf.concat([offset, tf.zeros_like(offset)], axis=-1)
-    true_box = loss_utils.apply_mask(ind_mask, (scale * true_box) - offset)
+    if self._box_type == 'anchor_free':
+      true_box = loss_utils.apply_mask(
+          ind_mask, (scale * self._path_stride * true_box))
+    else:
+      offset = tf.cast(
+          tf.gather_nd(grid_points, inds, batch_dims=1), true_box.dtype)
+      offset = tf.concat([offset, tf.zeros_like(offset)], axis=-1)
+      true_box = loss_utils.apply_mask(ind_mask, (scale * true_box) - offset)
     pred_box = loss_utils.apply_mask(
         ind_mask, tf.gather_nd(pred_box, inds, batch_dims=1))

@@ -523,7 +530,9 @@
         tf.expand_dims(true_conf, axis=-1), pred_conf, from_logits=True)
     if self._ignore_thresh != 0.0:
       bce = loss_utils.apply_mask(obj_mask, bce)
-    conf_loss = tf.reduce_mean(bce)
+      conf_loss = tf.reduce_sum(bce) / tf.reduce_sum(obj_mask)
+    else:
+      conf_loss = tf.reduce_mean(bce)

     # Compute the cross entropy loss for the class maps.
     class_loss = tf.keras.losses.binary_crossentropy(

@@ -547,7 +556,8 @@
     return (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf,
             ind_mask, grid_mask)

-  def post_path_aggregation(self, loss, ground_truths, predictions):
+  def post_path_aggregation(self, loss, box_loss, conf_loss, class_loss,
+                            ground_truths, predictions):
     """This method allows for post processing of a loss value.

     By default the model will have about 3 FPN levels {3, 4, 5}, on

@@ -558,9 +568,11 @@
     Args:
       loss: `tf.float` scalar for the actual loss.
+      box_loss: `tf.float` for the loss on the boxes only.
+      conf_loss: `tf.float` for the loss on the confidences only.
+      class_loss: `tf.float` for the loss on the classes only.
       ground_truths: `Dict` holding all the ground truth tensors.
       predictions: `Dict` holding all the predicted values.

     Returns:
       loss: `tf.float` scalar for the scaled loss.
     """

@@ -568,7 +580,7 @@
     return loss * scale

   def cross_replica_aggregation(self, loss, num_replicas_in_sync):
-    """In the scaled loss, take the sum of the loss across replicas."""
+    """This method is not specific to each loss path, but each loss type."""
     return loss

@@ -579,7 +591,6 @@ class YoloLoss:
                keys,
                classes,
                anchors,
-               masks=None,
                path_strides=None,
                truth_thresholds=None,
                ignore_thresholds=None,

@@ -603,8 +614,6 @@
       anchors: `List[List[int]]` for the anchor boxes that are used in the model
         at all levels. For anchor free prediction set the anchor list to be the
         same as the image resolution.
-      masks: `List[int]` for the output level that this specific model output
-        level
       path_strides: `Dict[int]` for how much to scale this level to get the
         original input shape for each FPN path.
       truth_thresholds: `Dict[float]` for the IOU value over which the loss is

@@ -651,8 +660,7 @@
     for key in keys:
       self._loss_dict[key] = losses[loss_type](
           classes=classes,
-          anchors=anchors,
-          mask=masks[key],
+          anchors=anchors[key],
           truth_thresh=truth_thresholds[key],
           ignore_thresh=ignore_thresholds[key],
           loss_type=loss_types[key],

@@ -667,7 +675,7 @@
           update_on_repeat=update_on_repeat,
           label_smoothing=label_smoothing)

-  def __call__(self, ground_truth, predictions, use_reduced_logs=True):
+  def __call__(self, ground_truth, predictions):
     metric_dict = collections.defaultdict(dict)
     metric_dict['net']['box'] = 0
     metric_dict['net']['class'] = 0

@@ -687,8 +695,10 @@
       # after computing the loss, scale loss as needed for aggregation
       # across FPN levels
-      loss = self._loss_dict[key].post_path_aggregation(
-          loss, ground_truth, predictions)
+      loss = self._loss_dict[key].post_path_aggregation(
+          loss, loss_box, loss_conf, loss_class, ground_truth, predictions)

       # after completing the scaling of the loss on each replica, handle
       # scaling the loss for merging the loss across replicas

@@ -703,11 +713,6 @@
       metric_dict[key]['avg_iou'] = tf.stop_gradient(avg_iou)
       metric_dict[key]['avg_obj'] = tf.stop_gradient(avg_obj)
-      if not use_reduced_logs:
-        metric_dict[key]['conf_loss'] = tf.stop_gradient(loss_conf)
-        metric_dict[key]['box_loss'] = tf.stop_gradient(loss_box)
-        metric_dict[key]['class_loss'] = tf.stop_gradient(loss_class)
       metric_dict['net']['box'] += tf.stop_gradient(loss_box)
       metric_dict['net']['class'] += tf.stop_gradient(loss_class)
       metric_dict['net']['conf'] += tf.stop_gradient(loss_conf)
...
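To illustrate the widened hook, a minimal sketch of a subclass overriding post_path_aggregation with the new six-argument signature; the 0.5 rescale is a made-up example, not from the diff:

class HalvedLoss(ScaledLoss):

  def post_path_aggregation(self, loss, box_loss, conf_loss, class_loss,
                            ground_truths, predictions):
    # The per-component losses are available here for cross-path weighting;
    # this sketch ignores them and just rescales the aggregate.
    del box_loss, conf_loss, class_loss, ground_truths, predictions
    return loss * 0.5  # hypothetical rescaling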
official/vision/beta/projects/yolo/losses/yolo_loss_test.py (view file @ 482823c8)

@@ -42,10 +42,11 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
         '5': [1, 13, 13, 255]
     }
     classes = 80
-    masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
-    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
-               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
-               [348.0, 340.0]]
+    anchors = {
+        '3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
+        '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
+        '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]
+    }
     keys = ['3', '4', '5']
     path_strides = {key: 2**int(key) for key in keys}

@@ -53,7 +54,6 @@
         keys,
         classes,
         anchors,
-        masks=masks,
         path_strides=path_strides,
         truth_thresholds={key: 1.0 for key in keys},
         ignore_thresholds={key: 0.7 for key in keys},

@@ -79,7 +79,7 @@
         '4': [1, 300, 3],
         '5': [1, 300, 3]
     }, tf.int32)
-    truths = inpdict({'3': [1, 300, 8], '4': [1, 300, 8], '5': [1, 300, 8]})
+    truths = inpdict({'3': [1, 300, 6], '4': [1, 300, 6], '5': [1, 300, 6]})
     boxes = tf.ones([1, 300, 4], dtype=tf.float32)
     classes = tf.ones([1, 300], dtype=tf.float32)
...
official/vision/beta/projects/yolo/modeling/backbones/darknet.py
View file @
482823c8
...
...
@@ -383,9 +383,11 @@ class Darknet(tf.keras.Model):
               max_level=5,
               width_scale=1.0,
               depth_scale=1.0,
+              use_reorg_input=False,
               csp_level_mod=(),
               activation=None,
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               dilate=False,
...
...
@@ -412,11 +414,13 @@ class Darknet(tf.keras.Model):
    self._norm_momentum = norm_momentum
    self._norm_epislon = norm_epsilon
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._activation = activation
    self._kernel_regularizer = kernel_regularizer
    self._dilate = dilate
    self._width_scale = width_scale
    self._depth_scale = depth_scale
+   self._use_reorg_input = use_reorg_input

    self._default_dict = {
        'kernel_initializer': self._kernel_initializer,
...
...
@@ -426,6 +430,7 @@ class Darknet(tf.keras.Model):
        'norm_epsilon': self._norm_epislon,
        'use_sync_bn': self._use_sync_bn,
        'activation': self._activation,
+       'use_separable_conv': self._use_separable_conv,
        'dilation_rate': 1,
        'name': None
    }
...
...
@@ -447,6 +452,9 @@ class Darknet(tf.keras.Model):
    return self._splits

  def _build_struct(self, net, inputs):
+   if self._use_reorg_input:
+     inputs = nn_blocks.Reorg()(inputs)
+
    endpoints = collections.OrderedDict()
    stack_outputs = [inputs]
    for i, config in enumerate(net):
...
...
@@ -662,25 +670,26 @@
@factory.register_backbone_builder('darknet')
def build_darknet(
    input_specs: tf.keras.layers.InputSpec,
-   backbone_cfg: hyperparams.Config,
+   backbone_config: hyperparams.Config,
    norm_activation_config: hyperparams.Config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None
) -> tf.keras.Model:  # pytype: disable=annotation-type-mismatch  # typed-keras
  """Builds darknet."""

- backbone_cfg = backbone_cfg.get()
+ backbone_config = backbone_config.get()
  model = Darknet(
-     model_id=backbone_cfg.model_id,
-     min_level=backbone_cfg.min_level,
-     max_level=backbone_cfg.max_level,
+     model_id=backbone_config.model_id,
+     min_level=backbone_config.min_level,
+     max_level=backbone_config.max_level,
      input_specs=input_specs,
-     dilate=backbone_cfg.dilate,
-     width_scale=backbone_cfg.width_scale,
-     depth_scale=backbone_cfg.depth_scale,
+     dilate=backbone_config.dilate,
+     width_scale=backbone_config.width_scale,
+     depth_scale=backbone_config.depth_scale,
+     use_reorg_input=backbone_config.use_reorg_input,
      activation=norm_activation_config.activation,
      use_sync_bn=norm_activation_config.use_sync_bn,
+     use_separable_conv=backbone_config.use_separable_conv,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)
  model.summary()
  return model
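The new width_scale and depth_scale arguments follow the compound-scaling idea used by the scaled YOLO family: the width multiplier scales filter counts and the depth multiplier scales block repeats. The helpers below are purely illustrative (round_filters and round_repeats are hypothetical names, not functions from darknet.py) and show how such multipliers are typically applied.

    def round_filters(filters: int, width_scale: float) -> int:
      # Scale the channel count of a layer, keeping at least one filter.
      return max(1, int(filters * width_scale))

    def round_repeats(repeats: int, depth_scale: float) -> int:
      # Scale the number of repeated blocks in a stage.
      return max(1, int(round(repeats * depth_scale)))

    print(round_filters(64, 0.5), round_repeats(3, 1.33))  # 32 4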
official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py
View file @
482823c8
...
...
@@ -21,7 +21,7 @@ from official.vision.beta.projects.yolo.modeling.layers import nn_blocks
@tf.keras.utils.register_keras_serializable(package='yolo')
class _IdentityRoute(tf.keras.layers.Layer):

- def call(self, inputs):
+ def call(self, inputs):  # pylint: disable=arguments-differ
    return None, inputs
...
...
@@ -36,6 +36,7 @@ class YoloFPN(tf.keras.layers.Layer):
               activation='leaky',
               fpn_filter_scale=1,
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='VarianceScaling',
...
...
@@ -52,6 +53,7 @@ class YoloFPN(tf.keras.layers.Layer):
      activation: `str`, the activation function to use, typically leaky or mish.
      fpn_filter_scale: `int`, scaling factor for the FPN filters.
      use_sync_bn: if True, use synchronized batch normalization.
+     use_separable_conv: `bool`, whether to use separable convs.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing by
        zero.
...
...
@@ -66,6 +68,7 @@ class YoloFPN(tf.keras.layers.Layer):
    self._activation = activation
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
...
...
@@ -78,6 +81,7 @@ class YoloFPN(tf.keras.layers.Layer):
    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
+       use_separable_conv=self._use_separable_conv,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
...
...
@@ -181,6 +185,7 @@ class YoloPAN(tf.keras.layers.Layer):
               csp_stack=False,
               activation='leaky',
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='VarianceScaling',
...
...
@@ -200,6 +205,7 @@ class YoloPAN(tf.keras.layers.Layer):
      csp_stack: `bool`, CSPize the FPN.
      activation: `str`, the activation function to use, typically leaky or mish.
      use_sync_bn: if True, use synchronized batch normalization.
+     use_separable_conv: `bool`, whether to use separable convs.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing
        by zero.
...
...
@@ -220,6 +226,7 @@ class YoloPAN(tf.keras.layers.Layer):
    self._activation = activation
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
...
...
@@ -236,6 +243,7 @@ class YoloPAN(tf.keras.layers.Layer):
    self._base_config = dict(
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
+       use_separable_conv=self._use_separable_conv,
        kernel_regularizer=self._kernel_regularizer,
        kernel_initializer=self._kernel_initializer,
        bias_regularizer=self._bias_regularizer,
...
...
@@ -371,6 +379,7 @@ class YoloDecoder(tf.keras.Model):
               embed_spp=False,
               activation='leaky',
               use_sync_bn=False,
+              use_separable_conv=False,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='VarianceScaling',
...
...
@@ -397,6 +406,7 @@ class YoloDecoder(tf.keras.Model):
      embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model.
      activation: `str`, the activation function to use, typically leaky or mish.
      use_sync_bn: if True, use synchronized batch normalization.
+     use_separable_conv: `bool`, whether to use separable convs.
      norm_momentum: `float`, normalization momentum for the moving average.
      norm_epsilon: `float`, small float added to variance to avoid dividing by
        zero.
...
...
@@ -415,6 +425,7 @@ class YoloDecoder(tf.keras.Model):
    self._activation = activation
    self._use_sync_bn = use_sync_bn
+   self._use_separable_conv = use_separable_conv
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_initializer = kernel_initializer
...
...
@@ -426,6 +437,7 @@ class YoloDecoder(tf.keras.Model):
        csp_stack=csp_stack,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
+       use_separable_conv=self._use_separable_conv,
        fpn_filter_scale=fpn_filter_scale,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon,
...
...
official/vision/beta/projects/yolo/modeling/heads/yolo_head.py
View file @
482823c8
...
...
@@ -34,6 +34,7 @@ class YoloHead(tf.keras.layers.Layer):
               bias_regularizer=None,
               activation=None,
               smart_bias=False,
+              use_separable_conv=False,
               **kwargs):
    """Yolo Prediction Head initialization function.
...
...
@@ -52,7 +53,8 @@ class YoloHead(tf.keras.layers.Layer):
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      activation: `str`, the activation function to use, typically leaky or mish.
-     smart_bias: `bool` whether or not use smart bias.
+     smart_bias: `bool`, whether to use smart bias.
+     use_separable_conv: `bool`, whether to use separable convs.
      **kwargs: keyword arguments to be passed.
    """
...
...
@@ -70,6 +72,7 @@ class YoloHead(tf.keras.layers.Layer):
    self._output_conv = (classes + output_extras + 5) * boxes_per_level
    self._smart_bias = smart_bias
+   self._use_separable_conv = use_separable_conv

    self._base_config = dict(
        activation=activation,
...
...
@@ -85,6 +88,7 @@ class YoloHead(tf.keras.layers.Layer):
        strides=(1, 1),
        padding='same',
        use_bn=False,
+       use_separable_conv=self._use_separable_conv,
        **self._base_config)

  def bias_init(self, scale, inshape, isize=640, no_per_conf=8):
...
...
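The bias_init(scale, inshape, isize=640, no_per_conf=8) signature above suggests a prior-probability bias seeding for the objectness output, in the spirit of the YOLOv5 heuristic of expecting roughly eight objects per 640px image at each stride. The sketch below is only an illustration of that heuristic under those assumptions, not the actual bias_init implementation.

    import math

    def objectness_prior_bias(stride: float, isize: int = 640,
                              objects_per_image: float = 8.0) -> float:
      # Expected objects per grid cell at this output stride.
      p = objects_per_image / (isize / stride) ** 2
      # log(p) approximates the inverse sigmoid for small probabilities,
      # so the head starts out predicting mostly background.
      return math.log(p)

    print(objectness_prior_bias(32.0))  # approx -3.91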
official/vision/beta/projects/yolo/modeling/layers/detection_generator.py
View file @
482823c8
...
...
@@ -26,7 +26,6 @@ class YoloLayer(tf.keras.Model):
  """Yolo layer (detection generator)."""

  def __init__(self,
-              masks,
               anchors,
               classes,
               iou_thresh=0.0,
...
...
@@ -52,8 +51,6 @@ class YoloLayer(tf.keras.Model):
    """Parameters for the loss functions used at each detection head output.

    Args:
-     masks: `List[int]` for the output level that this specific model output
-       level.
      anchors: `List[List[int]]` for the anchor boxes that are used in the
        model.
      classes: `int` for the number of classes.
...
...
@@ -107,7 +104,6 @@ class YoloLayer(tf.keras.Model):
      **kwargs: Additional keyword arguments.
    """
    super().__init__(**kwargs)
-   self._masks = masks
    self._anchors = anchors
    self._thresh = iou_thresh
    self._ignore_thresh = ignore_thresh
...
...
@@ -127,30 +123,24 @@ class YoloLayer(tf.keras.Model):
    self._pre_nms_points = pre_nms_points
    self._label_smoothing = label_smoothing
-   self._keys = list(masks.keys())
+   self._keys = list(anchors.keys())
    self._len_keys = len(self._keys)
    self._box_type = box_type
-   self._path_scale = path_scale or {
-       key: 2**int(key) for key, _ in masks.items()
-   }
+   self._path_scale = path_scale or {key: 2**int(key) for key in self._keys}
    self._nms_type = nms_type
-   self._scale_xy = scale_xy or {key: 1.0 for key, _ in masks.items()}
+   self._scale_xy = scale_xy or {key: 1.0 for key, _ in anchors.items()}

    self._generator = {}
    self._len_mask = {}
    for key in self._keys:
-     anchors = [self._anchors[mask] for mask in self._masks[key]]
-     self._generator[key] = self.get_generators(anchors,  # pylint: disable=assignment-from-none
-                                                self._path_scale[key], key)
-     self._len_mask[key] = len(self._masks[key])
+     anchors = self._anchors[key]
+     self._generator[key] = loss_utils.GridGenerator(
+         anchors, scale_anchors=self._path_scale[key])
+     self._len_mask[key] = len(anchors)
    return

- def get_generators(self, anchors, path_scale, path_key):
-   anchor_generator = loss_utils.GridGenerator(
-       anchors, scale_anchors=path_scale)
-   return anchor_generator
-
  def parse_prediction_path(self, key, inputs):
    shape_ = tf.shape(inputs)
    shape = inputs.get_shape().as_list()
...
...
@@ -280,18 +270,19 @@ class YoloLayer(tf.keras.Model):
        'num_detections': num_detections,
    }

- @property
- def losses(self):
+ def get_losses(self):
    """Generates a dictionary of losses to apply to each path.

    Done in the detection generator because all parameters are the same
-   across both loss and detection generator
+   across both loss and detection generator.

    Returns:
      Dict[str, tf.Tensor] of losses
    """
    loss = yolo_loss.YoloLoss(
        keys=self._keys,
        classes=self._classes,
        anchors=self._anchors,
-       masks=self._masks,
        path_strides=self._path_scale,
        truth_thresholds=self._truth_thresh,
        ignore_thresholds=self._ignore_thresh,
...
...
@@ -310,7 +301,6 @@ class YoloLayer(tf.keras.Model):
  def get_config(self):
    return {
-       'masks': dict(self._masks),
        'anchors': [list(a) for a in self._anchors],
        'thresh': self._thresh,
        'max_boxes': self._max_boxes,
...
...
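After this change, YoloLayer takes per-level anchor dictionaries directly (no masks argument) and exposes losses through an explicit get_losses() call rather than shadowing the Keras losses property. A usage sketch, with values mirroring the tests below and `dg` imported as in the test file:

    anchors = {
        '3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
        '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
        '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]
    }
    box_type = {key: 'scaled' for key in anchors}
    layer = dg.YoloLayer(anchors, 80, box_type=box_type, max_boxes=10)
    losses = layer.get_losses()  # one loss entry per FPN level key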
official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py
View file @
482823c8
...
...
@@ -13,7 +13,6 @@
# limitations under the License.

"""Tests for yolo detection generator."""

from absl.testing import parameterized
import tensorflow as tf
...
...
@@ -35,14 +34,15 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
        '5': [1, 13, 13, 255]
    }
    classes = 80
-   masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
-   anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
-              [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
-              [348.0, 340.0]]
-   box_type = {key: 'scaled' for key in masks.keys()}
-   layer = dg.YoloLayer(masks, anchors, classes, box_type=box_type,
-                        max_boxes=10)
+   anchors = {
+       '3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]],
+       '4': [[46.0, 114.0], [133.0, 127.0], [79.0, 225.0]],
+       '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]
+   }
+   box_type = {key: 'scaled' for key in anchors.keys()}
+   layer = dg.YoloLayer(anchors, classes, box_type=box_type, max_boxes=10)

    inputs = {}
    for key in input_shape:
...
...
official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py
View file @
482823c8
(diff collapsed; contents not shown)
official/vision/beta/projects/yolo/modeling/yolo_model.py
View file @
482823c8
...
...
@@ -16,7 +16,6 @@
import tensorflow as tf

# static base Yolo Models that do not require configuration
# similar to a backbone model id.
...
...
@@ -104,7 +103,7 @@ class Yolo(tf.keras.Model):
    self._backbone = backbone
    self._decoder = decoder
    self._head = head
-   self._filter = detection_generator
+   self._detection_generator = detection_generator
    return

  def call(self, inputs, training=False):
...
...
@@ -115,7 +114,7 @@ class Yolo(tf.keras.Model):
      return {"raw_output": raw_predictions}
    else:
      # Post-processing.
-     predictions = self._filter(raw_predictions)
+     predictions = self._detection_generator(raw_predictions)
      predictions.update({"raw_output": raw_predictions})
      return predictions
...
...
@@ -132,8 +131,8 @@ class Yolo(tf.keras.Model):
    return self._head

  @property
- def filter(self):
-   return self._filter
+ def detection_generator(self):
+   return self._detection_generator

  def get_config(self):
    return self._config_dict
...
...
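The rename from _filter to _detection_generator makes the two call paths easier to read: training returns only the raw head outputs, while inference runs them through the detection generator. A behavior sketch, assuming `backbone`, `decoder`, `head`, and `generator` are built elsewhere and that the constructor accepts these keyword names:

    model = Yolo(backbone=backbone, decoder=decoder, head=head,
                 detection_generator=generator)
    raw = model(images, training=True)    # {'raw_output': ...} only
    dets = model(images, training=False)  # post-processed boxes plus 'raw_output'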
official/vision/beta/projects/yolo/ops/anchor.py
0 → 100644
View file @
482823c8
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Yolo Anchor labler."""
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.beta.projects.yolo.ops
import
box_ops
from
official.vision.beta.projects.yolo.ops
import
loss_utils
from
official.vision.beta.projects.yolo.ops
import
preprocessing_ops
INF
=
10000000
def
get_best_anchor
(
y_true
,
anchors
,
stride
,
width
=
1
,
height
=
1
,
iou_thresh
=
0.25
,
best_match_only
=
False
,
use_tie_breaker
=
True
):
"""Get the correct anchor that is assoiciated with each box using IOU.
Args:
y_true: tf.Tensor[] for the list of bounding boxes in the yolo format.
anchors: list or tensor for the anchor boxes to be used in prediction found
via Kmeans.
stride: `int` stride for the anchors.
width: int for the image width.
height: int for the image height.
iou_thresh: `float` the minimum iou threshold to use for selecting boxes for
each level.
best_match_only: `bool` if the box only has one match and it is less than
the iou threshold, when set to True, this match will be dropped as no
anchors can be linked to it.
use_tie_breaker: `bool` if there is many anchors for a given box, then
attempt to use all of them, if False, only the first matching box will be
used.
Returns:
tf.Tensor: y_true with the anchor associated with each ground truth box
known
"""
with
tf
.
name_scope
(
'get_best_anchor'
):
width
=
tf
.
cast
(
width
,
dtype
=
tf
.
float32
)
height
=
tf
.
cast
(
height
,
dtype
=
tf
.
float32
)
scaler
=
tf
.
convert_to_tensor
([
width
,
height
])
# scale to levels houts width and height
true_wh
=
tf
.
cast
(
y_true
[...,
2
:
4
],
dtype
=
tf
.
float32
)
*
scaler
# scale down from large anchor to small anchor type
anchors
=
tf
.
cast
(
anchors
,
dtype
=
tf
.
float32
)
/
stride
k
=
tf
.
shape
(
anchors
)[
0
]
anchors
=
tf
.
concat
([
tf
.
zeros_like
(
anchors
),
anchors
],
axis
=-
1
)
truth_comp
=
tf
.
concat
([
tf
.
zeros_like
(
true_wh
),
true_wh
],
axis
=-
1
)
if
iou_thresh
>=
1.0
:
anchors
=
tf
.
expand_dims
(
anchors
,
axis
=-
2
)
truth_comp
=
tf
.
expand_dims
(
truth_comp
,
axis
=-
3
)
aspect
=
truth_comp
[...,
2
:
4
]
/
anchors
[...,
2
:
4
]
aspect
=
tf
.
where
(
tf
.
math
.
is_nan
(
aspect
),
tf
.
zeros_like
(
aspect
),
aspect
)
aspect
=
tf
.
maximum
(
aspect
,
1
/
aspect
)
aspect
=
tf
.
where
(
tf
.
math
.
is_nan
(
aspect
),
tf
.
zeros_like
(
aspect
),
aspect
)
aspect
=
tf
.
reduce_max
(
aspect
,
axis
=-
1
)
values
,
indexes
=
tf
.
math
.
top_k
(
tf
.
transpose
(
-
aspect
,
perm
=
[
1
,
0
]),
k
=
tf
.
cast
(
k
,
dtype
=
tf
.
int32
),
sorted
=
True
)
values
=
-
values
ind_mask
=
tf
.
cast
(
values
<
iou_thresh
,
dtype
=
indexes
.
dtype
)
else
:
truth_comp
=
box_ops
.
xcycwh_to_yxyx
(
truth_comp
)
anchors
=
box_ops
.
xcycwh_to_yxyx
(
anchors
)
iou_raw
=
box_ops
.
aggregated_comparitive_iou
(
truth_comp
,
anchors
,
iou_type
=
3
,
)
values
,
indexes
=
tf
.
math
.
top_k
(
iou_raw
,
k
=
tf
.
cast
(
k
,
dtype
=
tf
.
int32
),
sorted
=
True
)
ind_mask
=
tf
.
cast
(
values
>=
iou_thresh
,
dtype
=
indexes
.
dtype
)
# pad the indexs such that all values less than the thresh are -1
# add one, multiply the mask to zeros all the bad locations
# subtract 1 makeing all the bad locations 0.
if
best_match_only
:
iou_index
=
((
indexes
[...,
0
:]
+
1
)
*
ind_mask
[...,
0
:])
-
1
elif
use_tie_breaker
:
iou_index
=
tf
.
concat
([
tf
.
expand_dims
(
indexes
[...,
0
],
axis
=-
1
),
((
indexes
[...,
1
:]
+
1
)
*
ind_mask
[...,
1
:])
-
1
],
axis
=-
1
)
else
:
iou_index
=
tf
.
concat
([
tf
.
expand_dims
(
indexes
[...,
0
],
axis
=-
1
),
tf
.
zeros_like
(
indexes
[...,
1
:])
-
1
],
axis
=-
1
)
return
tf
.
cast
(
iou_index
,
dtype
=
tf
.
float32
),
tf
.
cast
(
values
,
dtype
=
tf
.
float32
)
class
YoloAnchorLabeler
:
"""Anchor labeler for the Yolo Models."""
def
__init__
(
self
,
anchors
=
None
,
anchor_free_level_limits
=
None
,
level_strides
=
None
,
center_radius
=
None
,
max_num_instances
=
200
,
match_threshold
=
0.25
,
best_matches_only
=
False
,
use_tie_breaker
=
True
,
darknet
=
False
,
dtype
=
'float32'
):
"""Initialization for anchor labler.
Args:
anchors: `Dict[List[Union[int, float]]]` values for each anchor box.
anchor_free_level_limits: `List` the box sizes that will be allowed at
each FPN level as is done in the FCOS and YOLOX paper for anchor free
box assignment.
level_strides: `Dict[int]` for how much the model scales down the images
at the each level.
center_radius: `Dict[float]` for radius around each box center to search
for extra centers in each level.
max_num_instances: `int` for the number of boxes to compute loss on.
match_threshold: `float` indicating the threshold over which an anchor
will be considered for prediction, at zero, all the anchors will be used
and at 1.0 only the best will be used. for anchor thresholds larger than
1.0 we stop using the IOU for anchor comparison and resort directly to
comparing the width and height, this is used for the scaled models.
best_matches_only: `boolean` indicating how boxes are selected for
optimization.
use_tie_breaker: `boolean` indicating whether to use the anchor threshold
value.
darknet: `boolean` indicating which data pipeline to use. Setting to True
swaps the pipeline to output images realtive to Yolov4 and older.
dtype: `str` indicating the output datatype of the datapipeline selecting
from {"float32", "float16", "bfloat16"}.
"""
self
.
anchors
=
anchors
self
.
masks
=
self
.
_get_mask
()
self
.
anchor_free_level_limits
=
self
.
_get_level_limits
(
anchor_free_level_limits
)
if
darknet
and
self
.
anchor_free_level_limits
is
None
:
center_radius
=
None
self
.
keys
=
self
.
anchors
.
keys
()
if
self
.
anchor_free_level_limits
is
not
None
:
maxim
=
2000
match_threshold
=
-
0.01
self
.
num_instances
=
{
key
:
maxim
for
key
in
self
.
keys
}
elif
not
darknet
:
self
.
num_instances
=
{
key
:
(
6
-
i
)
*
max_num_instances
for
i
,
key
in
enumerate
(
self
.
keys
)
}
else
:
self
.
num_instances
=
{
key
:
max_num_instances
for
key
in
self
.
keys
}
self
.
center_radius
=
center_radius
self
.
level_strides
=
level_strides
self
.
match_threshold
=
match_threshold
self
.
best_matches_only
=
best_matches_only
self
.
use_tie_breaker
=
use_tie_breaker
self
.
dtype
=
dtype
def
_get_mask
(
self
):
"""For each level get indexs of each anchor for box search across levels."""
masks
=
{}
start
=
0
minimum
=
int
(
min
(
self
.
anchors
.
keys
()))
maximum
=
int
(
max
(
self
.
anchors
.
keys
()))
for
i
in
range
(
minimum
,
maximum
+
1
):
per_scale
=
len
(
self
.
anchors
[
str
(
i
)])
masks
[
str
(
i
)]
=
list
(
range
(
start
,
per_scale
+
start
))
start
+=
per_scale
return
masks
def
_get_level_limits
(
self
,
level_limits
):
"""For each level receptive feild range for anchor free box placement."""
if
level_limits
is
not
None
:
level_limits_dict
=
{}
level_limits
=
[
0.0
]
+
level_limits
+
[
np
.
inf
]
for
i
,
key
in
enumerate
(
self
.
anchors
.
keys
()):
level_limits_dict
[
key
]
=
level_limits
[
i
:
i
+
2
]
else
:
level_limits_dict
=
None
return
level_limits_dict
def
_tie_breaking_search
(
self
,
anchors
,
mask
,
boxes
,
classes
):
"""After search, link each anchor ind to the correct map in ground truth."""
mask
=
tf
.
cast
(
tf
.
reshape
(
mask
,
[
1
,
1
,
1
,
-
1
]),
anchors
.
dtype
)
anchors
=
tf
.
expand_dims
(
anchors
,
axis
=-
1
)
viable
=
tf
.
where
(
tf
.
squeeze
(
anchors
==
mask
,
axis
=
0
))
gather_id
,
_
,
anchor_id
=
tf
.
split
(
viable
,
3
,
axis
=-
1
)
boxes
=
tf
.
gather_nd
(
boxes
,
gather_id
)
classes
=
tf
.
gather_nd
(
classes
,
gather_id
)
classes
=
tf
.
expand_dims
(
classes
,
axis
=-
1
)
classes
=
tf
.
cast
(
classes
,
boxes
.
dtype
)
anchor_id
=
tf
.
cast
(
anchor_id
,
boxes
.
dtype
)
return
boxes
,
classes
,
anchor_id
def
_get_anchor_id
(
self
,
key
,
boxes
,
classes
,
width
,
height
,
stride
,
iou_index
=
None
):
"""Find the object anchor assignments in an anchor based paradigm."""
# find the best anchor
anchors
=
self
.
anchors
[
key
]
num_anchors
=
len
(
anchors
)
if
self
.
best_matches_only
:
# get the best anchor for each box
iou_index
,
_
=
get_best_anchor
(
boxes
,
anchors
,
stride
,
width
=
width
,
height
=
height
,
best_match_only
=
True
,
iou_thresh
=
self
.
match_threshold
)
mask
=
range
(
num_anchors
)
else
:
# search is done across FPN levels, get the mask of anchor indexes
# corralated to this level.
mask
=
self
.
masks
[
key
]
# search for the correct box to use
(
boxes
,
classes
,
anchors
)
=
self
.
_tie_breaking_search
(
iou_index
,
mask
,
boxes
,
classes
)
return
boxes
,
classes
,
anchors
,
num_anchors
def
_get_centers
(
self
,
boxes
,
classes
,
anchors
,
width
,
height
,
scale_xy
):
"""Find the object center assignments in an anchor based paradigm."""
offset
=
tf
.
cast
(
0.5
*
(
scale_xy
-
1
),
boxes
.
dtype
)
grid_xy
,
_
=
tf
.
split
(
boxes
,
2
,
axis
=-
1
)
wh_scale
=
tf
.
cast
(
tf
.
convert_to_tensor
([
width
,
height
]),
boxes
.
dtype
)
grid_xy
=
grid_xy
*
wh_scale
centers
=
tf
.
math
.
floor
(
grid_xy
)
if
offset
!=
0.0
:
clamp
=
lambda
x
,
ma
:
tf
.
maximum
(
# pylint:disable=g-long-lambda
tf
.
minimum
(
x
,
tf
.
cast
(
ma
,
x
.
dtype
)),
tf
.
zeros_like
(
x
))
grid_xy_index
=
grid_xy
-
centers
positive_shift
=
((
grid_xy_index
<
offset
)
&
(
grid_xy
>
1.
))
negative_shift
=
((
grid_xy_index
>
(
1
-
offset
))
&
(
grid_xy
<
(
wh_scale
-
1.
)))
zero
,
_
=
tf
.
split
(
tf
.
ones_like
(
positive_shift
),
2
,
axis
=-
1
)
shift_mask
=
tf
.
concat
([
zero
,
positive_shift
,
negative_shift
],
axis
=-
1
)
offset
=
tf
.
cast
([[
0
,
0
],
[
1
,
0
],
[
0
,
1
],
[
-
1
,
0
],
[
0
,
-
1
]],
offset
.
dtype
)
*
offset
num_shifts
=
tf
.
shape
(
shift_mask
)
num_shifts
=
num_shifts
[
-
1
]
boxes
=
tf
.
tile
(
tf
.
expand_dims
(
boxes
,
axis
=-
2
),
[
1
,
num_shifts
,
1
])
classes
=
tf
.
tile
(
tf
.
expand_dims
(
classes
,
axis
=-
2
),
[
1
,
num_shifts
,
1
])
anchors
=
tf
.
tile
(
tf
.
expand_dims
(
anchors
,
axis
=-
2
),
[
1
,
num_shifts
,
1
])
shift_mask
=
tf
.
cast
(
shift_mask
,
boxes
.
dtype
)
shift_ind
=
shift_mask
*
tf
.
range
(
0
,
num_shifts
,
dtype
=
boxes
.
dtype
)
shift_ind
=
shift_ind
-
(
1
-
shift_mask
)
shift_ind
=
tf
.
expand_dims
(
shift_ind
,
axis
=-
1
)
boxes_and_centers
=
tf
.
concat
([
boxes
,
classes
,
anchors
,
shift_ind
],
axis
=-
1
)
boxes_and_centers
=
tf
.
reshape
(
boxes_and_centers
,
[
-
1
,
7
])
_
,
center_ids
=
tf
.
split
(
boxes_and_centers
,
[
6
,
1
],
axis
=-
1
)
select
=
tf
.
where
(
center_ids
>=
0
)
select
,
_
=
tf
.
split
(
select
,
2
,
axis
=-
1
)
boxes_and_centers
=
tf
.
gather_nd
(
boxes_and_centers
,
select
)
center_ids
=
tf
.
gather_nd
(
center_ids
,
select
)
center_ids
=
tf
.
cast
(
center_ids
,
tf
.
int32
)
shifts
=
tf
.
gather_nd
(
offset
,
center_ids
)
boxes
,
classes
,
anchors
,
_
=
tf
.
split
(
boxes_and_centers
,
[
4
,
1
,
1
,
1
],
axis
=-
1
)
grid_xy
,
_
=
tf
.
split
(
boxes
,
2
,
axis
=-
1
)
centers
=
tf
.
math
.
floor
(
grid_xy
*
wh_scale
-
shifts
)
centers
=
clamp
(
centers
,
wh_scale
-
1
)
x
,
y
=
tf
.
split
(
centers
,
2
,
axis
=-
1
)
centers
=
tf
.
cast
(
tf
.
concat
([
y
,
x
,
anchors
],
axis
=-
1
),
tf
.
int32
)
return
boxes
,
classes
,
centers
def
_get_anchor_free
(
self
,
key
,
boxes
,
classes
,
height
,
width
,
stride
,
center_radius
):
"""Find the box assignements in an anchor free paradigm."""
level_limits
=
self
.
anchor_free_level_limits
[
key
]
gen
=
loss_utils
.
GridGenerator
(
anchors
=
[[
1
,
1
]],
scale_anchors
=
stride
)
grid_points
=
gen
(
width
,
height
,
1
,
boxes
.
dtype
)[
0
]
grid_points
=
tf
.
squeeze
(
grid_points
,
axis
=
0
)
box_list
=
boxes
class_list
=
classes
grid_points
=
(
grid_points
+
0.5
)
*
stride
x_centers
,
y_centers
=
grid_points
[...,
0
],
grid_points
[...,
1
]
boxes
*=
(
tf
.
convert_to_tensor
([
width
,
height
,
width
,
height
])
*
stride
)
tlbr_boxes
=
box_ops
.
xcycwh_to_yxyx
(
boxes
)
boxes
=
tf
.
reshape
(
boxes
,
[
1
,
1
,
-
1
,
4
])
tlbr_boxes
=
tf
.
reshape
(
tlbr_boxes
,
[
1
,
1
,
-
1
,
4
])
if
self
.
use_tie_breaker
:
area
=
tf
.
reduce_prod
(
boxes
[...,
2
:],
axis
=-
1
)
# check if the box is in the receptive feild of the this fpn level
b_t
=
y_centers
-
tlbr_boxes
[...,
0
]
b_l
=
x_centers
-
tlbr_boxes
[...,
1
]
b_b
=
tlbr_boxes
[...,
2
]
-
y_centers
b_r
=
tlbr_boxes
[...,
3
]
-
x_centers
box_delta
=
tf
.
stack
([
b_t
,
b_l
,
b_b
,
b_r
],
axis
=-
1
)
if
level_limits
is
not
None
:
max_reg_targets_per_im
=
tf
.
reduce_max
(
box_delta
,
axis
=-
1
)
gt_min
=
max_reg_targets_per_im
>=
level_limits
[
0
]
gt_max
=
max_reg_targets_per_im
<=
level_limits
[
1
]
is_in_boxes
=
tf
.
logical_and
(
gt_min
,
gt_max
)
else
:
is_in_boxes
=
tf
.
reduce_min
(
box_delta
,
axis
=-
1
)
>
0.0
is_in_boxes_all
=
tf
.
reduce_any
(
is_in_boxes
,
axis
=
(
0
,
1
),
keepdims
=
True
)
# check if the center is in the receptive feild of the this fpn level
c_t
=
y_centers
-
(
boxes
[...,
1
]
-
center_radius
*
stride
)
c_l
=
x_centers
-
(
boxes
[...,
0
]
-
center_radius
*
stride
)
c_b
=
(
boxes
[...,
1
]
+
center_radius
*
stride
)
-
y_centers
c_r
=
(
boxes
[...,
0
]
+
center_radius
*
stride
)
-
x_centers
centers_delta
=
tf
.
stack
([
c_t
,
c_l
,
c_b
,
c_r
],
axis
=-
1
)
is_in_centers
=
tf
.
reduce_min
(
centers_delta
,
axis
=-
1
)
>
0.0
is_in_centers_all
=
tf
.
reduce_any
(
is_in_centers
,
axis
=
(
0
,
1
),
keepdims
=
True
)
# colate all masks to get the final locations
is_in_index
=
tf
.
logical_or
(
is_in_boxes_all
,
is_in_centers_all
)
is_in_boxes_and_center
=
tf
.
logical_and
(
is_in_boxes
,
is_in_centers
)
is_in_boxes_and_center
=
tf
.
logical_and
(
is_in_index
,
is_in_boxes_and_center
)
if
self
.
use_tie_breaker
:
boxes_all
=
tf
.
cast
(
is_in_boxes_and_center
,
area
.
dtype
)
boxes_all
=
((
boxes_all
*
area
)
+
((
1
-
boxes_all
)
*
INF
))
boxes_min
=
tf
.
reduce_min
(
boxes_all
,
axis
=-
1
,
keepdims
=
True
)
boxes_min
=
tf
.
where
(
boxes_min
==
INF
,
-
1.0
,
boxes_min
)
is_in_boxes_and_center
=
boxes_all
==
boxes_min
# construct the index update grid
reps
=
tf
.
reduce_sum
(
tf
.
cast
(
is_in_boxes_and_center
,
tf
.
int16
),
axis
=-
1
)
indexes
=
tf
.
cast
(
tf
.
where
(
is_in_boxes_and_center
),
tf
.
int32
)
y
,
x
,
t
=
tf
.
split
(
indexes
,
3
,
axis
=-
1
)
boxes
=
tf
.
gather_nd
(
box_list
,
t
)
classes
=
tf
.
cast
(
tf
.
gather_nd
(
class_list
,
t
),
boxes
.
dtype
)
reps
=
tf
.
gather_nd
(
reps
,
tf
.
concat
([
y
,
x
],
axis
=-
1
))
reps
=
tf
.
cast
(
tf
.
expand_dims
(
reps
,
axis
=-
1
),
boxes
.
dtype
)
classes
=
tf
.
cast
(
tf
.
expand_dims
(
classes
,
axis
=-
1
),
boxes
.
dtype
)
conf
=
tf
.
ones_like
(
classes
)
# return the samples and the indexes
samples
=
tf
.
concat
([
boxes
,
conf
,
classes
],
axis
=-
1
)
indexes
=
tf
.
concat
([
y
,
x
,
tf
.
zeros_like
(
t
)],
axis
=-
1
)
return
indexes
,
samples
def
build_label_per_path
(
self
,
key
,
boxes
,
classes
,
width
,
height
,
iou_index
=
None
):
"""Builds the labels for one path."""
stride
=
self
.
level_strides
[
key
]
scale_xy
=
self
.
center_radius
[
key
]
if
self
.
center_radius
is
not
None
else
1
width
=
tf
.
cast
(
width
//
stride
,
boxes
.
dtype
)
height
=
tf
.
cast
(
height
//
stride
,
boxes
.
dtype
)
if
self
.
anchor_free_level_limits
is
None
:
(
boxes
,
classes
,
anchors
,
num_anchors
)
=
self
.
_get_anchor_id
(
key
,
boxes
,
classes
,
width
,
height
,
stride
,
iou_index
=
iou_index
)
boxes
,
classes
,
centers
=
self
.
_get_centers
(
boxes
,
classes
,
anchors
,
width
,
height
,
scale_xy
)
ind_mask
=
tf
.
ones_like
(
classes
)
updates
=
tf
.
concat
([
boxes
,
ind_mask
,
classes
],
axis
=-
1
)
else
:
num_anchors
=
1
(
centers
,
updates
)
=
self
.
_get_anchor_free
(
key
,
boxes
,
classes
,
height
,
width
,
stride
,
scale_xy
)
boxes
,
ind_mask
,
classes
=
tf
.
split
(
updates
,
[
4
,
1
,
1
],
axis
=-
1
)
width
=
tf
.
cast
(
width
,
tf
.
int32
)
height
=
tf
.
cast
(
height
,
tf
.
int32
)
full
=
tf
.
zeros
([
height
,
width
,
num_anchors
,
1
],
dtype
=
classes
.
dtype
)
full
=
tf
.
tensor_scatter_nd_add
(
full
,
centers
,
ind_mask
)
num_instances
=
int
(
self
.
num_instances
[
key
])
centers
=
preprocessing_ops
.
pad_max_instances
(
centers
,
num_instances
,
pad_value
=
0
,
pad_axis
=
0
)
updates
=
preprocessing_ops
.
pad_max_instances
(
updates
,
num_instances
,
pad_value
=
0
,
pad_axis
=
0
)
updates
=
tf
.
cast
(
updates
,
self
.
dtype
)
full
=
tf
.
cast
(
full
,
self
.
dtype
)
return
centers
,
updates
,
full
def
__call__
(
self
,
boxes
,
classes
,
width
,
height
):
"""Builds the labels for a single image, not functional in batch mode.
Args:
boxes: `Tensor` of shape [None, 4] indicating the object locations in an
image.
classes: `Tensor` of shape [None] indicating the each objects classes.
width: `int` for the images width.
height: `int` for the images height.
Returns:
centers: `Tensor` of shape [None, 3] of indexes in the final grid where
boxes are located.
updates: `Tensor` of shape [None, 8] the value to place in the final grid.
full: `Tensor` of [width/stride, height/stride, num_anchors, 1] holding
a mask of where boxes are locates for confidence losses.
"""
indexes
=
{}
updates
=
{}
true_grids
=
{}
iou_index
=
None
boxes
=
box_ops
.
yxyx_to_xcycwh
(
boxes
)
if
not
self
.
best_matches_only
and
self
.
anchor_free_level_limits
is
None
:
# stitch and search boxes across fpn levels
anchorsvec
=
[]
for
stitch
in
self
.
anchors
:
anchorsvec
.
extend
(
self
.
anchors
[
stitch
])
stride
=
tf
.
cast
([
width
,
height
],
boxes
.
dtype
)
# get the best anchor for each box
iou_index
,
_
=
get_best_anchor
(
boxes
,
anchorsvec
,
stride
,
width
=
1.0
,
height
=
1.0
,
best_match_only
=
False
,
use_tie_breaker
=
self
.
use_tie_breaker
,
iou_thresh
=
self
.
match_threshold
)
for
key
in
self
.
keys
:
indexes
[
key
],
updates
[
key
],
true_grids
[
key
]
=
self
.
build_label_per_path
(
key
,
boxes
,
classes
,
width
,
height
,
iou_index
=
iou_index
)
return
indexes
,
updates
,
true_grids
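A minimal usage sketch of the labeler above, assuming three FPN levels; the anchors and strides are illustrative values, and boxes are normalized yxyx as expected by __call__:

    import tensorflow as tf

    labeler = YoloAnchorLabeler(
        anchors={'3': [[12, 19], [31, 46], [96, 54]],
                 '4': [[46, 114], [133, 127], [79, 225]],
                 '5': [[301, 150], [172, 286], [348, 340]]},
        level_strides={'3': 8, '4': 16, '5': 32},
        max_num_instances=200)

    boxes = tf.constant([[0.1, 0.1, 0.4, 0.5]], tf.float32)  # [ymin, xmin, ymax, xmax]
    classes = tf.constant([3.0], tf.float32)
    indexes, updates, true_grids = labeler(boxes, classes, width=640, height=640)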
official/vision/beta/projects/yolo/ops/loss_utils.py
View file @
482823c8
...
...
@@ -13,6 +13,7 @@
# limitations under the License.

"""Yolo loss utility functions."""

+import numpy as np
import tensorflow as tf
...
...
@@ -129,6 +130,10 @@ def build_grid(indexes, truths, preds, ind_mask, update=False, grid=None):
  indexes = apply_mask(tf.cast(ind_mask, indexes.dtype), indexes)
  indexes = (indexes + (ind_mask - 1))

+ # mask truths
+ truths = apply_mask(tf.cast(ind_mask, truths.dtype), truths)
+ truths = (truths + (tf.cast(ind_mask, truths.dtype) - 1))
+
  # reshape the indexes into the correct shape for the loss,
  # just flatten all indexes but the last
  indexes = tf.reshape(indexes, [-1, 4])
...
...
@@ -157,26 +162,16 @@
class GridGenerator:
  """Grid generator that generates anchor grids for box decoding."""

- def __init__(self, anchors, masks=None, scale_anchors=None):
+ def __init__(self, anchors, scale_anchors=None):
    """Initialize Grid Generator.

    Args:
      anchors: A `List[List[int]]` for the anchor boxes that are used in the
        model at all levels.
-     masks: A `List[int]` for the output level that this specific model output
-       level.
      scale_anchors: An `int` for how much to scale this level to get the
        original input shape.
    """
    self.dtype = tf.keras.backend.floatx()
-   if masks is not None:
-     self._num = len(masks)
-   else:
-     self._num = tf.shape(anchors)[0]
-
-   if masks is not None:
-     anchors = [anchors[mask] for mask in masks]
+   self._num = tf.shape(anchors)[0]

    self._scale_anchors = scale_anchors
    self._anchors = tf.convert_to_tensor(anchors)
    return
...
...
@@ -331,18 +326,10 @@ class PairWiseSearch:
           pred_classes,
           boxes,
           classes,
           scale=None,
           yxyx=True,
           clip_thresh=0.0):
-   num_boxes = tf.shape(boxes)[-2]
-   num_tiles = (num_boxes // TILE_SIZE) - 1

    if yxyx:
      boxes = box_ops.yxyx_to_xcycwh(boxes)

    if scale is not None:
      boxes = boxes * tf.stop_gradient(scale)

    if self._min_conf > 0.0:
      pred_classes = tf.cast(pred_classes > self._min_conf,
                             pred_classes.dtype)
...
...
@@ -535,32 +522,35 @@ def _darknet_new_coord_boxes(encoded_boxes, width, height, anchor_grid,
  return (scaler, scaled_box, pred_box), delta


- def _anchor_free_scale_boxes(encoded_boxes, width, height, stride,
-                              grid_points, scale_xy):
+ def _anchor_free_scale_boxes(encoded_boxes, width, height, stride,
+                              grid_points, darknet=False):
  """Decode models boxes using FPN stride under anchor free conditions."""
+ del darknet
  # split the boxes
  pred_xy = encoded_boxes[..., 0:2]
  pred_wh = encoded_boxes[..., 2:4]

  # build a scaling tensor to get the offset of the box relative to the image
  scaler = tf.convert_to_tensor([height, width, height, width])
- scale_xy = tf.cast(scale_xy, encoded_boxes.dtype)

- # scale the centers and find the offset of each box relative to
- # their center pixel
- pred_xy = pred_xy * scale_xy - 0.5 * (scale_xy - 1)

  # scale the offsets and add them to the grid points or a tensor that is
  # the relative location of each pixel
- box_xy = (grid_points + pred_xy) * stride
+ box_xy = grid_points + pred_xy

  # scale the width and height of the predictions and correlate them
  # to anchor boxes
- box_wh = tf.math.exp(pred_wh) * stride
+ box_wh = tf.math.exp(pred_wh)

  # build the final predicted box
  scaled_box = tf.concat([box_xy, box_wh], axis=-1)
- pred_box = scaled_box / scaler

+ # properly scaling boxes gradients
+ scaled_box = scaled_box * tf.cast(stride, scaled_box.dtype)
+ pred_box = scaled_box / tf.cast(scaler * stride, scaled_box.dtype)
  return (scaler, scaled_box, pred_box)
...
...
@@ -608,9 +598,8 @@ def get_predicted_box(width,
      range.
  """
  if box_type == 'anchor_free':
-   (scaler, scaled_box, pred_box) = _anchor_free_scale_boxes(
-       encoded_boxes, width, height, stride, grid_points, scale_xy)
+   (scaler, scaled_box, pred_box) = _anchor_free_scale_boxes(
+       encoded_boxes, width, height, stride, grid_points, darknet=darknet)
  elif darknet:
    # pylint:disable=unbalanced-tuple-unpacking
...
...
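A numeric sketch of the anchor-free decode above: the predicted center offsets are added to the grid points first, and the stride is folded in afterward so gradients flow through an unscaled box before the final multiply. The values here are arbitrary illustrations.

    import tensorflow as tf

    stride = 32.0
    grid_points = tf.constant([[4.0, 7.0]])  # cell indices at this level
    pred_xy = tf.constant([[0.3, 0.6]])      # predicted center offsets
    pred_wh = tf.constant([[0.2, -0.1]])     # predicted log-scale sizes

    box_xy = (grid_points + pred_xy) * stride  # -> [[137.6, 243.2]] pixels
    box_wh = tf.math.exp(pred_wh) * stride     # -> [[39.08, 28.95]] pixels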
official/vision/beta/projects/yolo/ops/mosaic.py
0 → 100755
View file @
482823c8
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mosaic op."""
import
random
import
tensorflow
as
tf
import
tensorflow_addons
as
tfa
from
official.vision.beta.ops
import
box_ops
from
official.vision.beta.ops
import
preprocess_ops
from
official.vision.beta.projects.yolo.ops
import
preprocessing_ops
class
Mosaic
:
"""Stitch together sets of 4 images to generate samples with more boxes."""
def
__init__
(
self
,
output_size
,
mosaic_frequency
=
1.0
,
mixup_frequency
=
0.0
,
letter_box
=
True
,
jitter
=
0.0
,
mosaic_crop_mode
=
'scale'
,
mosaic_center
=
0.25
,
aug_scale_min
=
1.0
,
aug_scale_max
=
1.0
,
aug_rand_angle
=
0.0
,
aug_rand_perspective
=
0.0
,
aug_rand_translate
=
0.0
,
random_pad
=
False
,
random_flip
=
False
,
area_thresh
=
0.1
,
pad_value
=
preprocessing_ops
.
PAD_VALUE
,
seed
=
None
):
"""Initializes parameters for mosaic.
Args:
output_size: `Tensor` or `List` for [height, width] of output image.
mosaic_frequency: `float` indicating how often to apply mosaic.
mixup_frequency: `float` indicating how often to apply mixup.
letter_box: `boolean` indicating whether upon start of the datapipeline
regardless of the preprocessing ops that are used, the aspect ratio of
the images should be preserved.
jitter: `float` for the maximum change in aspect ratio expected in each
preprocessing step.
mosaic_crop_mode: `str` they type of mosaic to apply. The options are
{crop, scale, None}, crop will construct a mosaic by slicing images
togther, scale will create a mosaic by concatnating and shifting the
image, and None will default to scale and apply no post processing to
the created mosaic.
mosaic_center: `float` indicating how much to randomly deviate from the
from the center of the image when creating a mosaic.
aug_scale_min: `float` indicating the minimum scaling value for image
scale jitter.
aug_scale_max: `float` indicating the maximum scaling value for image
scale jitter.
aug_rand_angle: `float` indicating the maximum angle value for angle.
angle will be changes between 0 and value.
aug_rand_perspective: `float` ranging from 0.000 to 0.001 indicating how
much to prespective warp the image.
aug_rand_translate: `float` ranging from 0 to 1 indicating the maximum
amount to randomly translate an image.
random_pad: `bool` indiccating wether to use padding to apply random
translation true for darknet yolo false for scaled yolo.
random_flip: `bool` whether or not to random flip the image.
area_thresh: `float` for the minimum area of a box to allow to pass
through for optimization.
pad_value: `int` padding value.
seed: `int` the seed for random number generation.
"""
self
.
_output_size
=
output_size
self
.
_area_thresh
=
area_thresh
self
.
_mosaic_frequency
=
mosaic_frequency
self
.
_mixup_frequency
=
mixup_frequency
self
.
_letter_box
=
letter_box
self
.
_random_crop
=
jitter
self
.
_mosaic_crop_mode
=
mosaic_crop_mode
self
.
_mosaic_center
=
mosaic_center
self
.
_aug_scale_min
=
aug_scale_min
self
.
_aug_scale_max
=
aug_scale_max
self
.
_random_pad
=
random_pad
self
.
_aug_rand_translate
=
aug_rand_translate
self
.
_aug_rand_angle
=
aug_rand_angle
self
.
_aug_rand_perspective
=
aug_rand_perspective
self
.
_random_flip
=
random_flip
self
.
_pad_value
=
pad_value
self
.
_deterministic
=
seed
is
not
None
self
.
_seed
=
seed
if
seed
is
not
None
else
random
.
randint
(
0
,
2
**
30
)
def
_generate_cut
(
self
):
"""Generate a random center to use for slicing and patching the images."""
if
self
.
_mosaic_crop_mode
==
'crop'
:
min_offset
=
self
.
_mosaic_center
cut_x
=
preprocessing_ops
.
random_uniform_strong
(
self
.
_output_size
[
1
]
*
min_offset
,
self
.
_output_size
[
1
]
*
(
1
-
min_offset
),
seed
=
self
.
_seed
)
cut_y
=
preprocessing_ops
.
random_uniform_strong
(
self
.
_output_size
[
0
]
*
min_offset
,
self
.
_output_size
[
0
]
*
(
1
-
min_offset
),
seed
=
self
.
_seed
)
cut
=
[
cut_y
,
cut_x
]
ishape
=
tf
.
convert_to_tensor
(
[
self
.
_output_size
[
0
],
self
.
_output_size
[
1
],
3
])
else
:
cut
=
None
ishape
=
tf
.
convert_to_tensor
(
[
self
.
_output_size
[
0
]
*
2
,
self
.
_output_size
[
1
]
*
2
,
3
])
return
cut
,
ishape
def
scale_boxes
(
self
,
patch
,
ishape
,
boxes
,
classes
,
xs
,
ys
):
"""Scale and translate the boxes for each image prior to patching."""
xs
=
tf
.
cast
(
xs
,
boxes
.
dtype
)
ys
=
tf
.
cast
(
ys
,
boxes
.
dtype
)
pshape
=
tf
.
cast
(
tf
.
shape
(
patch
),
boxes
.
dtype
)
ishape
=
tf
.
cast
(
ishape
,
boxes
.
dtype
)
translate
=
tf
.
cast
((
ishape
-
pshape
),
boxes
.
dtype
)
boxes
=
box_ops
.
denormalize_boxes
(
boxes
,
pshape
[:
2
])
boxes
=
boxes
+
tf
.
cast
([
translate
[
0
]
*
ys
,
translate
[
1
]
*
xs
,
translate
[
0
]
*
ys
,
translate
[
1
]
*
xs
],
boxes
.
dtype
)
boxes
=
box_ops
.
normalize_boxes
(
boxes
,
ishape
[:
2
])
return
boxes
,
classes
def
_select_ind
(
self
,
inds
,
*
args
):
items
=
[]
for
item
in
args
:
items
.
append
(
tf
.
gather
(
item
,
inds
))
return
items
def
_augment_image
(
self
,
image
,
boxes
,
classes
,
is_crowd
,
area
,
xs
=
0.0
,
ys
=
0.0
,
cut
=
None
):
"""Process a single image prior to the application of patching."""
if
self
.
_random_flip
:
# Randomly flip the image horizontally.
image
,
boxes
,
_
=
preprocess_ops
.
random_horizontal_flip
(
image
,
boxes
,
seed
=
self
.
_seed
)
# Augment the image without resizing
image
,
infos
,
crop_points
=
preprocessing_ops
.
resize_and_jitter_image
(
image
,
[
self
.
_output_size
[
0
],
self
.
_output_size
[
1
]],
random_pad
=
False
,
letter_box
=
self
.
_letter_box
,
jitter
=
self
.
_random_crop
,
shiftx
=
xs
,
shifty
=
ys
,
cut
=
cut
,
seed
=
self
.
_seed
)
# Clip and clean boxes.
boxes
,
inds
=
preprocessing_ops
.
transform_and_clip_boxes
(
boxes
,
infos
,
area_thresh
=
self
.
_area_thresh
,
shuffle_boxes
=
False
,
augment
=
True
,
seed
=
self
.
_seed
)
classes
,
is_crowd
,
area
=
self
.
_select_ind
(
inds
,
classes
,
is_crowd
,
area
)
# pylint:disable=unbalanced-tuple-unpacking
return
image
,
boxes
,
classes
,
is_crowd
,
area
,
crop_points
def
_mosaic_crop_image
(
self
,
image
,
boxes
,
classes
,
is_crowd
,
area
):
"""Process a patched image in preperation for final output."""
if
self
.
_mosaic_crop_mode
!=
'crop'
:
shape
=
tf
.
cast
(
preprocessing_ops
.
get_image_shape
(
image
),
tf
.
float32
)
center
=
shape
*
self
.
_mosaic_center
# shift the center of the image by applying a translation to the whole
# image
ch
=
tf
.
math
.
round
(
preprocessing_ops
.
random_uniform_strong
(
-
center
[
0
],
center
[
0
],
seed
=
self
.
_seed
))
cw
=
tf
.
math
.
round
(
preprocessing_ops
.
random_uniform_strong
(
-
center
[
1
],
center
[
1
],
seed
=
self
.
_seed
))
# clip the boxes to those with in the image
image
=
tfa
.
image
.
translate
(
image
,
[
cw
,
ch
],
fill_value
=
self
.
_pad_value
)
boxes
=
box_ops
.
denormalize_boxes
(
boxes
,
shape
[:
2
])
boxes
=
boxes
+
tf
.
cast
([
ch
,
cw
,
ch
,
cw
],
boxes
.
dtype
)
boxes
=
box_ops
.
clip_boxes
(
boxes
,
shape
[:
2
])
inds
=
box_ops
.
get_non_empty_box_indices
(
boxes
)
boxes
=
box_ops
.
normalize_boxes
(
boxes
,
shape
[:
2
])
boxes
,
classes
,
is_crowd
,
area
=
self
.
_select_ind
(
inds
,
boxes
,
classes
,
# pylint:disable=unbalanced-tuple-unpacking
is_crowd
,
area
)
# warp and scale the fully stitched sample
image
,
_
,
affine
=
preprocessing_ops
.
affine_warp_image
(
image
,
[
self
.
_output_size
[
0
],
self
.
_output_size
[
1
]],
scale_min
=
self
.
_aug_scale_min
,
scale_max
=
self
.
_aug_scale_max
,
translate
=
self
.
_aug_rand_translate
,
degrees
=
self
.
_aug_rand_angle
,
perspective
=
self
.
_aug_rand_perspective
,
random_pad
=
self
.
_random_pad
,
seed
=
self
.
_seed
)
height
,
width
=
self
.
_output_size
[
0
],
self
.
_output_size
[
1
]
image
=
tf
.
image
.
resize
(
image
,
(
height
,
width
))
# clip and clean boxes
boxes
,
inds
=
preprocessing_ops
.
transform_and_clip_boxes
(
boxes
,
None
,
affine
=
affine
,
area_thresh
=
self
.
_area_thresh
,
seed
=
self
.
_seed
)
classes
,
is_crowd
,
area
=
self
.
_select_ind
(
inds
,
classes
,
is_crowd
,
area
)
# pylint:disable=unbalanced-tuple-unpacking
return
image
,
boxes
,
classes
,
is_crowd
,
area
,
area
# mosaic full frequency doubles model speed
def
_process_image
(
self
,
sample
,
shiftx
,
shifty
,
cut
,
ishape
):
"""Process and augment each image."""
(
image
,
boxes
,
classes
,
is_crowd
,
area
,
crop_points
)
=
self
.
_augment_image
(
sample
[
'image'
],
sample
[
'groundtruth_boxes'
],
sample
[
'groundtruth_classes'
],
sample
[
'groundtruth_is_crowd'
],
sample
[
'groundtruth_area'
],
shiftx
,
shifty
,
cut
)
(
boxes
,
classes
)
=
self
.
scale_boxes
(
image
,
ishape
,
boxes
,
classes
,
1
-
shiftx
,
1
-
shifty
)
sample
[
'image'
]
=
image
sample
[
'groundtruth_boxes'
]
=
boxes
sample
[
'groundtruth_classes'
]
=
classes
sample
[
'groundtruth_is_crowd'
]
=
is_crowd
sample
[
'groundtruth_area'
]
=
area
sample
[
'shiftx'
]
=
shiftx
sample
[
'shifty'
]
=
shifty
sample
[
'crop_points'
]
=
crop_points
return
sample
def
_patch2
(
self
,
one
,
two
):
"""Stitch together 2 images in totality."""
sample
=
one
sample
[
'image'
]
=
tf
.
concat
([
one
[
'image'
],
two
[
'image'
]],
axis
=-
2
)
sample
[
'groundtruth_boxes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_boxes'
],
two
[
'groundtruth_boxes'
]],
axis
=
0
)
sample
[
'groundtruth_classes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_classes'
],
two
[
'groundtruth_classes'
]],
axis
=
0
)
sample
[
'groundtruth_is_crowd'
]
=
tf
.
concat
(
[
one
[
'groundtruth_is_crowd'
],
two
[
'groundtruth_is_crowd'
]],
axis
=
0
)
sample
[
'groundtruth_area'
]
=
tf
.
concat
(
[
one
[
'groundtruth_area'
],
two
[
'groundtruth_area'
]],
axis
=
0
)
return
sample
def
_patch
(
self
,
one
,
two
):
"""Build the full 4 patch of images from sets of 2 images."""
image
=
tf
.
concat
([
one
[
'image'
],
two
[
'image'
]],
axis
=-
3
)
boxes
=
tf
.
concat
([
one
[
'groundtruth_boxes'
],
two
[
'groundtruth_boxes'
]],
axis
=
0
)
classes
=
tf
.
concat
(
[
one
[
'groundtruth_classes'
],
two
[
'groundtruth_classes'
]],
axis
=
0
)
is_crowd
=
tf
.
concat
(
[
one
[
'groundtruth_is_crowd'
],
two
[
'groundtruth_is_crowd'
]],
axis
=
0
)
area
=
tf
.
concat
([
one
[
'groundtruth_area'
],
two
[
'groundtruth_area'
]],
axis
=
0
)
if
self
.
_mosaic_crop_mode
is
not
None
:
image
,
boxes
,
classes
,
is_crowd
,
area
,
_
=
self
.
_mosaic_crop_image
(
image
,
boxes
,
classes
,
is_crowd
,
area
)
sample
=
one
height
,
width
=
preprocessing_ops
.
get_image_shape
(
image
)
sample
[
'image'
]
=
tf
.
cast
(
image
,
tf
.
uint8
)
sample
[
'groundtruth_boxes'
]
=
boxes
sample
[
'groundtruth_area'
]
=
area
sample
[
'groundtruth_classes'
]
=
tf
.
cast
(
classes
,
sample
[
'groundtruth_classes'
].
dtype
)
sample
[
'groundtruth_is_crowd'
]
=
tf
.
cast
(
is_crowd
,
tf
.
bool
)
sample
[
'width'
]
=
tf
.
cast
(
width
,
sample
[
'width'
].
dtype
)
sample
[
'height'
]
=
tf
.
cast
(
height
,
sample
[
'height'
].
dtype
)
sample
[
'num_detections'
]
=
tf
.
shape
(
sample
[
'groundtruth_boxes'
])[
1
]
sample
[
'is_mosaic'
]
=
tf
.
cast
(
1.0
,
tf
.
bool
)
del
sample
[
'shiftx'
]
del
sample
[
'shifty'
]
del
sample
[
'crop_points'
]
return
sample
def
_mosaic
(
self
,
one
,
two
,
three
,
four
):
"""Stitch together 4 images to build a mosaic."""
if
self
.
_mosaic_frequency
>=
1.0
:
domo
=
1.0
else
:
domo
=
preprocessing_ops
.
random_uniform_strong
(
0.0
,
1.0
,
dtype
=
tf
.
float32
,
seed
=
self
.
_seed
)
noop
=
one
.
copy
()
if
domo
>=
(
1
-
self
.
_mosaic_frequency
):
cut
,
ishape
=
self
.
_generate_cut
()
one
=
self
.
_process_image
(
one
,
1.0
,
1.0
,
cut
,
ishape
)
two
=
self
.
_process_image
(
two
,
0.0
,
1.0
,
cut
,
ishape
)
three
=
self
.
_process_image
(
three
,
1.0
,
0.0
,
cut
,
ishape
)
four
=
self
.
_process_image
(
four
,
0.0
,
0.0
,
cut
,
ishape
)
patch1
=
self
.
_patch2
(
one
,
two
)
patch2
=
self
.
_patch2
(
three
,
four
)
stitched
=
self
.
_patch
(
patch1
,
patch2
)
return
stitched
else
:
return
self
.
_add_param
(
noop
)
def
_mixup
(
self
,
one
,
two
):
"""Blend together 2 images for the mixup data augmentation."""
if
self
.
_mixup_frequency
>=
1.0
:
domo
=
1.0
else
:
domo
=
preprocessing_ops
.
random_uniform_strong
(
0.0
,
1.0
,
dtype
=
tf
.
float32
,
seed
=
self
.
_seed
)
noop
=
one
.
copy
()
if
domo
>=
(
1
-
self
.
_mixup_frequency
):
sample
=
one
otype
=
one
[
'image'
].
dtype
r
=
preprocessing_ops
.
random_uniform_strong
(
0.4
,
0.6
,
tf
.
float32
,
seed
=
self
.
_seed
)
sample
[
'image'
]
=
(
r
*
tf
.
cast
(
one
[
'image'
],
tf
.
float32
)
+
(
1
-
r
)
*
tf
.
cast
(
two
[
'image'
],
tf
.
float32
))
sample
[
'image'
]
=
tf
.
cast
(
sample
[
'image'
],
otype
)
sample
[
'groundtruth_boxes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_boxes'
],
two
[
'groundtruth_boxes'
]],
axis
=
0
)
sample
[
'groundtruth_classes'
]
=
tf
.
concat
(
[
one
[
'groundtruth_classes'
],
two
[
'groundtruth_classes'
]],
axis
=
0
)
sample
[
'groundtruth_is_crowd'
]
=
tf
.
concat
(
[
one
[
'groundtruth_is_crowd'
],
two
[
'groundtruth_is_crowd'
]],
axis
=
0
)
sample
[
'groundtruth_area'
]
=
tf
.
concat
(
[
one
[
'groundtruth_area'
],
two
[
'groundtruth_area'
]],
axis
=
0
)
return
sample
else
:
return
self
.
_add_param
(
noop
)
def
_add_param
(
self
,
sample
):
"""Add parameters to handle skipped images."""
sample
[
'is_mosaic'
]
=
tf
.
cast
(
0.0
,
tf
.
bool
)
sample
[
'num_detections'
]
=
tf
.
shape
(
sample
[
'groundtruth_boxes'
])[
0
]
return
sample
def
_apply
(
self
,
dataset
):
"""Apply mosaic to an input dataset."""
determ
=
self
.
_deterministic
dataset
=
dataset
.
prefetch
(
tf
.
data
.
AUTOTUNE
)
one
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
,
reshuffle_each_iteration
=
True
)
two
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
1
,
reshuffle_each_iteration
=
True
)
three
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
2
,
reshuffle_each_iteration
=
True
)
four
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
3
,
reshuffle_each_iteration
=
True
)
dataset
=
tf
.
data
.
Dataset
.
zip
((
one
,
two
,
three
,
four
))
dataset
=
dataset
.
map
(
self
.
_mosaic
,
num_parallel_calls
=
tf
.
data
.
AUTOTUNE
,
deterministic
=
determ
)
if
self
.
_mixup_frequency
>
0
:
one
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
4
,
reshuffle_each_iteration
=
True
)
two
=
dataset
.
shuffle
(
100
,
seed
=
self
.
_seed
+
5
,
reshuffle_each_iteration
=
True
)
dataset
=
tf
.
data
.
Dataset
.
zip
((
one
,
two
))
dataset
=
dataset
.
map
(
self
.
_mixup
,
num_parallel_calls
=
tf
.
data
.
AUTOTUNE
,
deterministic
=
determ
)
return
dataset
def
_skip
(
self
,
dataset
):
"""Skip samples in a dataset."""
determ
=
self
.
_deterministic
return
dataset
.
map
(
self
.
_add_param
,
num_parallel_calls
=
tf
.
data
.
AUTOTUNE
,
deterministic
=
determ
)
def
mosaic_fn
(
self
,
is_training
=
True
):
"""Determine which function to apply based on whether model is training."""
if
is_training
and
self
.
_mosaic_frequency
>
0.0
:
return
self
.
_apply
else
:
return
self
.
_skip
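A usage sketch for wiring the op above into a tf.data pipeline; `train_ds` is an assumed dataset that yields COCO-style sample dicts with the 'image', 'groundtruth_*', 'width', and 'height' keys used by _process_image and _patch:

    mosaic = Mosaic(
        output_size=[640, 640],
        mosaic_frequency=0.75,
        mixup_frequency=0.2,
        mosaic_crop_mode='scale',
        random_flip=True,
        seed=42)
    # mosaic_fn returns _apply during training and _skip otherwise.
    train_ds = mosaic.mosaic_fn(is_training=True)(train_ds)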