comments addressed

9474c108 · Vishnu Banna · bcd5283d · 9474c108 · 9474c108 · 9474c108
Commit 9474c108 authored Sep 23, 2021 by Vishnu Banna
13 changed files
--- a/official/vision/beta/projects/yolo/configs/backbones.py
+++ b/official/vision/beta/projects/yolo/configs/backbones.py
@@ -22,8 +22,8 @@ from official.vision.beta.configs import backbones
 class Darknet(hyperparams.Config):
  """DarkNet config."""
  model_id: str = 'cspdarknet53'
-  width_scale: int = 1.0
-  depth_scale: int = 1.0
+  width_scale: float = 1.0
+  depth_scale: float = 1.0
  dilate: bool = False
  min_level: int = 3
  max_level: int = 5

--- a/official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py
+++ b/official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py
@@ -59,10 +59,20 @@ class TfExampleDecoder(tf_example_decoder.TfExampleDecoder):
  """Tensorflow Example proto decoder."""

  def __init__(self,
-               coco91_to_80, 
+               coco91_to_80=None, 
               include_mask=False,
               regenerate_source_id=False,
               mask_binarize_threshold=None):
+    """Initialize the example decoder. 
+
+    Args: 
+      coco91_to_80: `bool` indicating whether to convert coco from its 91 class 
+        format to the 80 class format.
+      include_mask: `bool` indicating if the decoder should also decode instance 
+        masks for instance segmentation.
+      regenerate_source_id: `bool` indicating if the source id needs to be 
+        recreated for each image sample. 
+    """
    if coco91_to_80 and include_mask:
      raise ValueError("If masks are included you cannot \
                        convert coco from the 91 class format \

--- a/official/vision/beta/projects/yolo/dataloaders/yolo_input.py
+++ b/official/vision/beta/projects/yolo/dataloaders/yolo_input.py
-""" Detection Data parser and processing for YOLO.
-Parse image and ground truths in a dataset to training targets and package them
-into (image, labels) tuple for RetinaNet.
-"""
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Detection Data parser and processing for YOLO."""
 import tensorflow as tf
 import numpy as np
 from official.vision.beta.projects.yolo.ops import preprocessing_ops
@@ -19,7 +30,7 @@ class Parser(parser.Parser):
      output_size,
      anchors,
      expanded_strides,
-      level_limit=None,
+      level_limits=None,
      max_num_instances=200,
      area_thresh=0.1,
      aug_rand_hue=1.0,
@@ -48,11 +59,13 @@ class Parser(parser.Parser):
        output_size should be divided by the largest feature stride 2^max_level.
      anchors: `Dict[List[Union[int, float]]]` values for each anchor box.
      expanded_strides: `Dict[int]` for how much the model scales down the 
-        images at the largest level.
-      level_limit: `List` the box sizes that will be allowed at each FPN 
+        images at the largest level. For example, level 3 down samples the image 
+        by a factor of 8 (2^3), in the expanded strides dictionary, we will pass 
+        along {3: 8} indicating that relative to the original image, the 
+        shapes must be reduced by a factor of 8 to compute the loss.
+      level_limits: `List` the box sizes that will be allowed at each FPN 
        level as is done in the FCOS and YOLOX paper for anchor free box 
-        assignment. Anchor free will perform worse than Anchor based, but only 
-        slightly.
+        assignment.
      max_num_instances: `int` for the number of boxes to compute loss on.
      area_thresh: `float` for the minimum area of a box to allow to pass 
        through for optimization.
@@ -108,20 +121,9 @@ class Parser(parser.Parser):
      assert output_size[1] % expanded_strides[str(key)] == 0
      assert output_size[0] % expanded_strides[str(key)] == 0

-    # scale of each FPN level
-    self._strides = expanded_strides
-
    # Set the width and height properly and base init:
    self._image_w = output_size[1]
    self._image_h = output_size[0]
-
-    # Set the anchor boxes for each scale
-    self._anchors = anchors
-    self._level_limit = level_limit
-
-    # anchor labeling paramters
-    self._use_tie_breaker = use_tie_breaker
-    self._best_match_only = best_match_only
    self._max_num_instances = max_num_instances

    # Image scaling params
@@ -143,33 +145,23 @@ class Parser(parser.Parser):
    self._aug_rand_hue = aug_rand_hue

    # Set the per level values needed for operation
-    self._scale_xy = scale_xy
-    self._anchor_t = anchor_t
    self._darknet = darknet
    self._area_thresh = area_thresh

-    keys = list(self._anchors.keys())
-
-    if self._level_limit is not None:
-      maxim = 2000
-      self._scale_up = {key: maxim // self._max_num_instances for key in keys}
-      self._anchor_t = -0.01
-    elif not self._darknet:
-      self._scale_up = {key: 6 - i for i, key in enumerate(keys)}
-    else:
-      self._scale_up = {key: 1 for key in keys}
-
    self._seed = seed
-
-    # Set the data type based on input string
    self._dtype = dtype

    self._label_builder = anchor.YoloAnchorLabeler(
-      anchors = self._anchors, 
-      match_threshold=self._anchor_t, 
-      best_matches_only=self._best_match_only,
-      use_tie_breaker=self._use_tie_breaker
-    )
+      anchors = anchors, 
+      anchor_free_level_limits = level_limits,
+      level_strides=expanded_strides, 
+      center_radius=scale_xy, 
+      max_num_instances=max_num_instances,
+      match_threshold=anchor_t, 
+      best_matches_only=best_match_only,
+      use_tie_breaker=use_tie_breaker, 
+      darknet=darknet, 
+      dtype=dtype)

  def _pad_infos_object(self, image):
    """Get a Tensor to pad the info object list."""
@@ -307,57 +299,22 @@ class Parser(parser.Parser):
        is_training=False)
    return image, labels

-  def set_shape(self, values, pad_axis=0, pad_value=0, inds=None, scale=1):
+  def set_shape(self, values, pad_axis=0, pad_value=0, inds=None):
    """Calls set shape for all input objects."""
    if inds is not None:
      values = tf.gather(values, inds)
    vshape = values.get_shape().as_list()

-    if pad_value is not None:
-      values = preprocessing_ops.pad_max_instances(
+    values = preprocessing_ops.pad_max_instances(
          values,
          self._max_num_instances,
          pad_axis=pad_axis,
          pad_value=pad_value)

-    vshape[pad_axis] = self._max_num_instances * scale
+    vshape[pad_axis] = self._max_num_instances 
    values.set_shape(vshape)
    return values

-  def _build_grid(self, boxes, classes, width, height):
-    """Private function for building the full scale object and class grid."""
-    indexes = {}
-    updates = {}
-    true_grids = {}
-
-    if self._level_limit is not None:
-      self._level_limit = [0.0] + self._level_limit + [np.inf]
-
-    # for each prediction path generate a properly scaled output prediction map
-    for i, key in enumerate(self._anchors.keys()):
-      if self._level_limit is not None:
-        fpn_limits = self._level_limit[i:i + 2]
-      else:
-        fpn_limits = None
-
-      scale_xy = self._scale_xy[key] if not self._darknet else 1
-
-      indexes[key], updates[key], true_grids[key] = self._label_builder(
-        key, boxes, classes, self._anchors[key], 
-        width, height, self._strides[str(key)],
-        scale_xy, self._max_num_instances * self._scale_up[key], 
-        fpn_limits = fpn_limits)
-
-      # set/fix the shapes
-      indexes[key] = self.set_shape(indexes[key], -2, None, None,
-                                    self._scale_up[key])
-      updates[key] = self.set_shape(updates[key], -2, None, None,
-                                    self._scale_up[key])
-
-      # add all the values to the final dictionary
-      updates[key] = tf.cast(updates[key], dtype=self._dtype)
-    return indexes, updates, true_grids
-
  def _build_label(self,
                   image,
                   gt_boxes,
@@ -376,16 +333,15 @@ class Parser(parser.Parser):
    image.set_shape(imshape)
    
    labels = dict()
-    labels['inds'], labels['upds'], labels['true_conf'] = self._build_grid(
-        gt_boxes, gt_classes, width, height)
+    (labels['inds'], 
+    labels['upds'], labels['true_conf']) = self._label_builder(gt_boxes, 
+                                                               gt_classes, 
+                                                               width, 
+                                                               height)

    # Set/fix the boxes shape.
    boxes = self.set_shape(gt_boxes, pad_axis=0, pad_value=0)
    classes = self.set_shape(gt_classes, pad_axis=0, pad_value=-1)
-    area = self.set_shape(
-        data['groundtruth_area'], pad_axis=0, pad_value=0, inds=inds)
-    is_crowd = self.set_shape(
-        data['groundtruth_is_crowd'], pad_axis=0, pad_value=0, inds=inds)

    # Build the dictionary set.
    labels.update({
@@ -396,6 +352,7 @@ class Parser(parser.Parser):

    # Update the labels dictionary.
    if not is_training:
+
      # Sets up groundtruth data for evaluation.
      groundtruths = {
          'source_id': labels['source_id'],
@@ -405,8 +362,9 @@ class Parser(parser.Parser):
          'image_info': info,
          'boxes': gt_boxes,
          'classes': gt_classes,
-          'areas': area,
-          'is_crowds': tf.cast(is_crowd, tf.int32),
+          'areas': tf.gather(data['groundtruth_area'], inds),
+          'is_crowds': tf.cast(
+            tf.gather(data['groundtruth_is_crowd'], inds), tf.int32),
      }
      groundtruths['source_id'] = utils.process_source_id(
          groundtruths['source_id'])

--- a/official/vision/beta/projects/yolo/losses/yolo_loss.py
+++ b/official/vision/beta/projects/yolo/losses/yolo_loss.py
@@ -14,13 +14,12 @@

 """Yolo Loss function."""
 import abc
-import collections
 import functools
+import collections

 import tensorflow as tf
-
-from official.vision.beta.projects.yolo.ops import box_ops
 from official.vision.beta.projects.yolo.ops import loss_utils
+from official.vision.beta.projects.yolo.ops import box_ops
 from official.vision.beta.projects.yolo.ops import math_ops


@@ -33,7 +32,6 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):

  def __init__(self,
               classes,
-               mask,
               anchors,
               path_stride=1,
               ignore_thresh=0.7,
@@ -52,8 +50,6 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):

    Args:
      classes: `int` for the number of classes
-      mask: `List[int]` for the output level that this specific model output
-        level
      anchors: `List[List[int]]` for the anchor boxes that are used in the model
        at all levels. For anchor free prediction set the anchor list to be the
        same as the image resolution.
@@ -86,10 +82,9 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
    """
    self._loss_type = loss_type
    self._classes = classes
-    self._num = tf.cast(len(mask), dtype=tf.int32)
+    self._num = tf.cast(len(anchors), dtype=tf.int32)
    self._truth_thresh = truth_thresh
    self._ignore_thresh = ignore_thresh
-    self._masks = mask
    self._anchors = anchors

    self._iou_normalizer = iou_normalizer
@@ -112,7 +107,7 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
    self._decode_boxes = functools.partial(
        loss_utils.get_predicted_box, **box_kwargs)

-    self._search_pairs = None
+    self._search_pairs = lambda *args: (None, None, None, None)
    self._build_per_path_attributes()

  def box_loss(self, true_box, pred_box, darknet=False):
@@ -136,13 +131,18 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
                               scale=None):
    """Search of all groundtruths to associate groundtruths to predictions."""

-    if self._search_pairs is None:
-      return true_conf, tf.ones_like(true_conf)
+    boxes = box_ops.yxyx_to_xcycwh(boxes)
+
+    if scale is not None:
+      boxes = boxes * tf.cast(tf.stop_gradient(scale), boxes.dtype)

    # Search all predictions against ground truths to find mathcing boxes for
    # each pixel.
-    _, _, iou_max, _ = self._search_pairs(
-        pred_boxes, pred_classes, boxes, classes, scale=scale, yxyx=True)
+    _, _, iou_max, _ = self._search_pairs(pred_boxes, pred_classes, 
+                                          boxes, classes)
+
+    if iou_max is None:
+      return true_conf, tf.ones_like(true_conf)

    # Find the exact indexes to ignore and keep.
    ignore_mask = tf.cast(iou_max < self._ignore_thresh, pred_boxes.dtype)
@@ -196,7 +196,7 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
        predictions.
    """
    (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf, ind_mask,
-     grid_mask) = self._compute_loss(true_counts, inds, y_true, boxes, classes,
+     grid_mask) = self._compute_loss(true_counts, inds, y_true, boxes, classes, 
                                     y_pred)

    # Metric compute using done here to save time and resources.
@@ -219,7 +219,8 @@ class YoloLossBase(object, metaclass=abc.ABCMeta):
    """The actual logic to apply to the raw model for optimization."""
    ...

-  def post_path_aggregation(self, loss, ground_truths, predictions):  # pylint:disable=unused-argument
+  def post_path_aggregation(self, 
+      loss, box_loss, conf_loss, class_loss, ground_truths, predictions): # pylint:disable=unused-argument
    """This method allows for post processing of a loss value.

    After the loss has been aggregated across all the FPN levels some post
@@ -277,7 +278,6 @@ class DarknetLoss(YoloLossBase):
    association.
    """
    self._anchor_generator = loss_utils.GridGenerator(
-        masks=self._masks,
        anchors=self._anchors,
        scale_anchors=self._path_stride)

@@ -428,14 +428,13 @@ class ScaledLoss(YoloLossBase):
    association.
    """
    self._anchor_generator = loss_utils.GridGenerator(
-        masks=self._masks,
        anchors=self._anchors,
        scale_anchors=self._path_stride)

    if self._ignore_thresh > 0.0:
      self._search_pairs = loss_utils.PairWiseSearch(
          iou_type=self._loss_type, any_match=False, min_conf=0.25)
-    
+
    self._cls_normalizer = self._cls_normalizer * self._classes/80
    return

@@ -550,7 +549,8 @@ class ScaledLoss(YoloLossBase):
    return (loss, box_loss, conf_loss, class_loss, mean_loss, iou, pred_conf,
            ind_mask, grid_mask)

-  def post_path_aggregation(self, loss, ground_truths, predictions):
+  def post_path_aggregation(self, 
+      loss, box_loss, conf_loss, class_loss, ground_truths, predictions):
    """This method allows for post processing of a loss value.

    By default the model will have about 3 FPN levels {3, 4, 5}, on
@@ -559,19 +559,12 @@ class ScaledLoss(YoloLossBase):
    magintude as the model with 3 FPN levels. This helps to prevent gradient
    explosions.

-    Args:
-      loss: `tf.float` scalar for the actual loss.
-      ground_truths: `Dict` holding all the ground truth tensors.
-      predictions: `Dict` holding all the predicted values.
-
-    Returns:
-      loss: `tf.float` scalar for the scaled loss.
    """
    scale = tf.stop_gradient(3 / len(list(predictions.keys())))
    return loss * scale

  def cross_replica_aggregation(self, loss, num_replicas_in_sync):
-    """In the scaled loss, take the sum of the loss across replicas."""
+    """This method is not specific to each loss path, but to each loss type."""
    return loss


@@ -582,7 +575,6 @@ class YoloLoss:
               keys,
               classes,
               anchors,
-               masks=None,
               path_strides=None,
               truth_thresholds=None,
               ignore_thresholds=None,
@@ -606,8 +598,6 @@ class YoloLoss:
      anchors: `List[List[int]]` for the anchor boxes that are used in the model
        at all levels. For anchor free prediction set the anchor list to be the
        same as the image resolution.
-      masks: `List[int]` for the output level that this specific model output
-        level
      path_strides: `Dict[int]` for how much to scale this level to get the
        orginal input shape for each FPN path.
      truth_thresholds: `Dict[float]` for the IOU value over which the loss is
@@ -649,13 +639,12 @@ class YoloLoss:
      loss_type = 'scaled'
    else:
      loss_type = 'darknet'
-
+    
    self._loss_dict = {}
    for key in keys:
      self._loss_dict[key] = losses[loss_type](
          classes=classes,
-          anchors=anchors,
-          mask=masks[key],
+          anchors=anchors[key],
          truth_thresh=truth_thresholds[key],
          ignore_thresh=ignore_thresholds[key],
          loss_type=loss_types[key],
@@ -691,7 +680,7 @@ class YoloLoss:
      # after computing the loss, scale loss as needed for aggregation
      # across FPN levels
      loss = self._loss_dict[key].post_path_aggregation(
-          loss, ground_truth, predictions)
+          loss, loss_box, loss_conf, loss_class, ground_truth, predictions)

      # after completing the scaling of the loss on each replica, handle
      # scaling the loss for mergeing the loss across replicas

--- a/official/vision/beta/projects/yolo/losses/yolo_loss_test.py
+++ b/official/vision/beta/projects/yolo/losses/yolo_loss_test.py
@@ -42,10 +42,9 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
        '5': [1, 13, 13, 255]
    }
    classes = 80
-    masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
-    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
-               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
-               [348.0, 340.0]]
+    anchors = {'3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]], 
+               '4': [[46.0, 114.0],[133.0, 127.0], [79.0, 225.0]], 
+               '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]}
    keys = ['3', '4', '5']
    path_strides = {key: 2**int(key) for key in keys}

@@ -53,7 +52,6 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
        keys,
        classes,
        anchors,
-        masks=masks,
        path_strides=path_strides,
        truth_thresholds={key: 1.0 for key in keys},
        ignore_thresholds={key: 0.7 for key in keys},

--- a/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py
+++ b/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Contains common building blocks for yolo layer (detection layer)."""
 import tensorflow as tf

@@ -26,7 +25,6 @@ class YoloLayer(tf.keras.Model):
  """Yolo layer (detection generator)."""

  def __init__(self,
-               masks,
               anchors,
               classes,
               iou_thresh=0.0,
@@ -52,8 +50,6 @@ class YoloLayer(tf.keras.Model):
    """Parameters for the loss functions used at each detection head output.

    Args:
-      masks: `List[int]` for the output level that this specific model output
-        level.
      anchors: `List[List[int]]` for the anchor boxes that are used in the
        model.
      classes: `int` for the number of classes.
@@ -107,7 +103,6 @@ class YoloLayer(tf.keras.Model):
      **kwargs: Addtional keyword arguments.
    """
    super().__init__(**kwargs)
-    self._masks = masks
    self._anchors = anchors
    self._thresh = iou_thresh
    self._ignore_thresh = ignore_thresh
@@ -127,30 +122,24 @@ class YoloLayer(tf.keras.Model):

    self._pre_nms_points = pre_nms_points
    self._label_smoothing = label_smoothing
-    self._keys = list(masks.keys())
+
+    self._keys = list(anchors.keys())
    self._len_keys = len(self._keys)
    self._box_type = box_type
-    self._path_scale = path_scale or {
-        key: 2**int(key) for key, _ in masks.items()
-    }
+    self._path_scale = path_scale or {key: 2**int(key) for key in self._keys}

    self._nms_type = nms_type
-    self._scale_xy = scale_xy or {key: 1.0 for key, _ in masks.items()}
+    self._scale_xy = scale_xy or {key: 1.0 for key, _ in anchors.items()}

    self._generator = {}
    self._len_mask = {}
    for key in self._keys:
-      anchors = [self._anchors[mask] for mask in self._masks[key]]
-      self._generator[key] = self.get_generators(anchors, self._path_scale[key],  # pylint: disable=assignment-from-none
-                                                 key)
-      self._len_mask[key] = len(self._masks[key])
+      anchors = self._anchors[key]
+      self._generator[key] = loss_utils.GridGenerator(
+        anchors, scale_anchors=self._path_scale[key])
+      self._len_mask[key] = len(anchors)
    return

-  def get_generators(self, anchors, path_scale, path_key):
-    anchor_generator = loss_utils.GridGenerator(
-        anchors, scale_anchors=path_scale)
-    return anchor_generator
-
  def parse_prediction_path(self, key, inputs):
    shape_ = tf.shape(inputs)
    shape = inputs.get_shape().as_list()
@@ -290,7 +279,6 @@ class YoloLayer(tf.keras.Model):
        keys=self._keys,
        classes=self._classes,
        anchors=self._anchors,
-        masks=self._masks,
        path_strides=self._path_scale,
        truth_thresholds=self._truth_thresh,
        ignore_thresholds=self._ignore_thresh,
@@ -309,7 +297,6 @@ class YoloLayer(tf.keras.Model):

  def get_config(self):
    return {
-        'masks': dict(self._masks),
        'anchors': [list(a) for a in self._anchors],
        'thresh': self._thresh,
        'max_boxes': self._max_boxes,

--- a/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py
+++ b/official/vision/beta/projects/yolo/modeling/layers/detection_generator_test.py
@@ -14,6 +14,7 @@

 """Tests for yolo detection generator."""

+from official.vision.beta.projects.yolo.ops import anchor
 from absl.testing import parameterized
 import tensorflow as tf

@@ -35,14 +36,13 @@ class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase):
        '5': [1, 13, 13, 255]
    }
    classes = 80
-    masks = {'3': [0, 1, 2], '4': [3, 4, 5], '5': [6, 7, 8]}
-    anchors = [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0], [46.0, 114.0],
-               [133.0, 127.0], [79.0, 225.0], [301.0, 150.0], [172.0, 286.0],
-               [348.0, 340.0]]
-    box_type = {key: 'scaled' for key in masks.keys()}
-
-    layer = dg.YoloLayer(
-        masks, anchors, classes, box_type=box_type, max_boxes=10)
+    anchors = {'3': [[12.0, 19.0], [31.0, 46.0], [96.0, 54.0]], 
+               '4': [[46.0, 114.0],[133.0, 127.0], [79.0, 225.0]], 
+               '5': [[301.0, 150.0], [172.0, 286.0], [348.0, 340.0]]}
+
+    box_type = {key: 'scaled' for key in anchors.keys()}
+
+    layer = dg.YoloLayer(anchors, classes, box_type=box_type, max_boxes=10)

    inputs = {}
    for key in input_shape:

--- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py
+++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py
@@ -1665,7 +1665,13 @@ class DarkRouteProcess(tf.keras.layers.Layer):


 class Reorg(tf.keras.layers.Layer):
-
+  """Splits a high resolution image into 4 lower resolution images. 
+  
+  Used in YOLOR to process very high resolution inputs efficiently.
+  For example, an input image of [1280, 1280, 3] will become [640, 640, 12];
+  the images are sampled in such a way that the spatial resolution is
+  retained.
+  """
  def call(self, x, training=None):
    return tf.concat([x[..., ::2, ::2, :], 
                      x[..., 1::2, ::2, :], 

--- a/official/vision/beta/projects/yolo/modeling/yolo_model.py
+++ b/official/vision/beta/projects/yolo/modeling/yolo_model.py
@@ -16,7 +16,6 @@

 import tensorflow as tf

-
 # static base Yolo Models that do not require configuration
 # similar to a backbone model id.


--- a/official/vision/beta/projects/yolo/ops/anchor.py
+++ b/official/vision/beta/projects/yolo/ops/anchor.py
@@ -13,12 +13,14 @@
 # limitations under the License.

 """Yolo Anchor labler."""
+import numpy as np
 import tensorflow as tf
-from tensorflow.python.ops.gen_math_ops import maximum, minimum
+
 from official.vision.beta.projects.yolo.ops import box_ops
 from official.vision.beta.projects.yolo.ops import preprocessing_ops
 from official.vision.beta.projects.yolo.ops import loss_utils

+INF = 10000000

 def get_best_anchor(y_true,
                    anchors,
@@ -28,15 +30,22 @@ def get_best_anchor(y_true,
                    iou_thresh=0.25,
                    best_match_only=False, 
                    use_tie_breaker=True):
-  """
-  get the correct anchor that is assoiciated with each box using IOU
+  """Get the correct anchor that is associated with each box using IOU.
  
  Args:
-    y_true: tf.Tensor[] for the list of bounding boxes in the yolo format
+    y_true: tf.Tensor[] for the list of bounding boxes in the yolo format.
    anchors: list or tensor for the anchor boxes to be used in prediction
-      found via Kmeans
-    width: int for the image width
-    height: int for the image height
+      found via Kmeans.
+    width: int for the image width.
+    height: int for the image height.
+    iou_thresh: `float` the minimum iou threshold to use for selecting boxes for 
+      each level. 
+    best_match_only: `bool` if the box only has one match and it is less than 
+      the iou threshold, when set to True, this match will be dropped as no 
+      anchors can be linked to it. 
+    use_tie_breaker: `bool` if there are many anchors for a given box, then
+      attempt to use all of them; if False, only the first matching box will
+      be used.
  Return:
    tf.Tensor: y_true with the anchor associated with each ground truth
    box known
@@ -46,7 +55,10 @@ def get_best_anchor(y_true,
    height = tf.cast(height, dtype=tf.float32)
    scaler = tf.convert_to_tensor([width, height])

+    # scale to the level's output width and height
    true_wh = tf.cast(y_true[..., 2:4], dtype=tf.float32) * scaler
+
+    # scale down from large anchor to small anchor type
    anchors = tf.cast(anchors, dtype=tf.float32)/stride

    k = tf.shape(anchors)[0]
@@ -71,7 +83,6 @@ def get_best_anchor(y_true,
      values = -values
      ind_mask = tf.cast(values < iou_thresh, dtype=indexes.dtype)
    else:
-      # iou_raw = box_ops.compute_iou(truth_comp, anchors)
      truth_comp = box_ops.xcycwh_to_yxyx(truth_comp)
      anchors = box_ops.xcycwh_to_yxyx(anchors)
      iou_raw = box_ops.aggregated_comparitive_iou(
@@ -80,7 +91,7 @@ def get_best_anchor(y_true,
          iou_type=3,
      )
      values, indexes = tf.math.top_k(
-          iou_raw,  #tf.transpose(iou_raw, perm=[0, 2, 1]),
+          iou_raw,
          k=tf.cast(k, dtype=tf.int32),
          sorted=True)
      ind_mask = tf.cast(values >= iou_thresh, dtype=indexes.dtype)
@@ -102,18 +113,73 @@ def get_best_anchor(y_true,
  return tf.cast(iou_index, dtype=tf.float32), tf.cast(values, dtype=tf.float32)

 class YoloAnchorLabeler:
+  """Anchor labeler for the Yolo Models"""
+
  def __init__(self, 
               anchors = None, 
+               anchor_free_level_limits = None, 
+               level_strides = None, 
+               center_radius = None, 
+               max_num_instances = 200, 
               match_threshold = 0.25, 
               best_matches_only = False, 
-               use_tie_breaker = True):
+               use_tie_breaker = True, 
+               darknet = False, 
+               dtype = 'float32'):
+    """Initialization for anchor labeler.
+    
+    Args: 
+      anchors: `Dict[List[Union[int, float]]]` values for each anchor box.
+      anchor_free_level_limits: `List` the box sizes that will be allowed at 
+        each FPN level as is done in the FCOS and YOLOX paper for anchor free 
+        box assignment.
+      level_strides: `Dict[int]` for how much the model scales down the
+        images at each level.
+      center_radius: `Dict[float]` for radius around each box center to search 
+        for extra centers in each level.
+      max_num_instances: `int` for the number of boxes to compute loss on.
+      match_threshold: `float` indicating the threshold over which an anchor
+        will be considered for prediction, at zero, all the anchors will be used
+        and at 1.0 only the best will be used. For anchor thresholds larger than
+        1.0 we stop using the IOU for anchor comparison and resort directly to
+        comparing the width and height, this is used for the scaled models.
+      best_matches_only: `boolean` indicating how boxes are selected for 
+        optimization.  
+      use_tie_breaker: `boolean` indicating whether to use the anchor threshold 
+        value.
+      darknet: `boolean` indicating which data pipeline to use. Setting to True
+        swaps the pipeline to output images relative to Yolov4 and older.
+      dtype: `str` indicating the output datatype of the datapipeline selecting 
+        from {"float32", "float16", "bfloat16"}.
+    """
    self.anchors = anchors
    self.masks = self._get_mask()
+    self.anchor_free_level_limits = self._get_level_limits(
+                                                anchor_free_level_limits)
+
+    if darknet and self.anchor_free_level_limits is None:
+      center_radius = None
+    
+    self.keys = self.anchors.keys()
+    if self.anchor_free_level_limits is not None:
+      maxim = 2000
+      match_threshold = -0.01
+      self.num_instances = {key: maxim for key in self.keys}
+    elif not darknet:
+      self.num_instances = {
+        key: (6 - i) * max_num_instances for i, key in enumerate(self.keys)}
+    else:
+      self.num_instances = {key: max_num_instances for key in self.keys}
+
+    self.center_radius = center_radius
+    self.level_strides = level_strides
    self.match_threshold = match_threshold
    self.best_matches_only = best_matches_only
    self.use_tie_breaker = use_tie_breaker
+    self.dtype = dtype

  def _get_mask(self):
+    """For each level get indices of each anchor for box search across levels."""
    masks = {}
    start = 0

@@ -124,8 +190,21 @@ class YoloAnchorLabeler:
      masks[str(i)] = list(range(start, per_scale + start))
      start += per_scale
    return masks
+  
+  def _get_level_limits(self, level_limits):
+    """For each level receptive field range for anchor free box placement."""
+    if level_limits is not None:
+      level_limits_dict = {}
+      level_limits = [0.0] + level_limits + [np.inf]
+
+      for i, key in enumerate(self.anchors.keys()):
+        level_limits_dict[key] = level_limits[i:i + 2]
+    else:
+      level_limits_dict = None
+    return level_limits_dict

  def _tie_breaking_search(self, anchors, mask, boxes, classes):
+    """After search, link each anchor ind to the correct map in ground truth."""
    mask = tf.cast(tf.reshape(mask, [1, 1, 1, -1]), anchors.dtype)
    anchors = tf.expand_dims(anchors, axis=-1)
    viable =  tf.where(tf.squeeze(anchors == mask, axis = 0))
@@ -140,10 +219,12 @@ class YoloAnchorLabeler:
    anchor_id = tf.cast(anchor_id, boxes.dtype)
    return boxes, classes, anchor_id

-  def _get_anchor_id(self, key, boxes, classes, anchors, width, height, stride):
+  def _get_anchor_id(self, key, boxes, classes, width, height, stride, 
+                     iou_index = None):
    """Find the object anchor assignments in an anchor based paradigm. """
    
    # find the best anchor
+    anchors = self.anchors[key]
    num_anchors = len(anchors)
    if self.best_matches_only:
      # get the best anchor for each box
@@ -153,28 +234,20 @@ class YoloAnchorLabeler:
                                        iou_thresh=self.match_threshold)
      mask = range(num_anchors)
    else: 
-      # stitch and search boxes across fpn levels
-      anchorsvec = []
-      for stitch in self.anchors.keys():
-        anchorsvec.extend(self.anchors[stitch])
-
-      # get the best anchor for each box
-      iou_index, _ = get_best_anchor(boxes, anchorsvec, stride,
-                                        width=width, height=height, 
-                                        best_match_only=False, 
-                                        use_tie_breaker=self.use_tie_breaker,
-                                        iou_thresh=self.match_threshold)
+      # search is done across FPN levels, get the mask of anchor indexes
+      # correlated to this level.
      mask = self.masks[key]

    # search for the correct box to use
-    (boxes, 
-    classes,
-    anchors) = self._tie_breaking_search(iou_index, mask, boxes, classes)
+    (boxes, classes, anchors) = self._tie_breaking_search(iou_index, mask, 
+                                                                boxes, classes)
    return boxes, classes, anchors, num_anchors

-  def _get_centers(self, boxes, classes, anchors, width, height, offset):
+  def _get_centers(self, boxes, classes, anchors, width, height, scale_xy):
    """Find the object center assignments in an anchor based paradigm. """
-    grid_xy, wh = tf.split(boxes, 2, axis = -1)
+    offset = tf.cast(0.5 * (scale_xy - 1), boxes.dtype)
+
+    grid_xy, _ = tf.split(boxes, 2, axis = -1)
    wh_scale = tf.cast(tf.convert_to_tensor([width, height]), boxes.dtype)

    grid_xy = grid_xy * wh_scale
@@ -234,16 +307,16 @@ class YoloAnchorLabeler:
    return boxes, classes, centers

  def _get_anchor_free(self,
+                       key, 
                       boxes,
                       classes,
                       height,
                       width,
                       stride,
-                       fpn_limits,
-                       center_radius=2.5):
-    """Find the box assignements in an anchor free paradigm. """
-    gen = loss_utils.GridGenerator(
-      masks=None, anchors=[[1, 1]], scale_anchors=stride)
+                       center_radius):
+    """Find the box assignments in an anchor free paradigm."""
+    level_limits = self.anchor_free_level_limits[key]
+    gen = loss_utils.GridGenerator(anchors=[[1, 1]], scale_anchors=stride)
    grid_points = gen(width, height, 1, boxes.dtype)[0]
    grid_points = tf.squeeze(grid_points, axis=0)
    box_list = boxes
@@ -266,10 +339,10 @@ class YoloAnchorLabeler:
    b_b = tlbr_boxes[..., 2] - y_centers
    b_r = tlbr_boxes[..., 3] - x_centers
    box_delta = tf.stack([b_t, b_l, b_b, b_r], axis=-1)
-    if fpn_limits is not None:
+    if level_limits is not None:
      max_reg_targets_per_im = tf.reduce_max(box_delta, axis=-1)
-      gt_min = max_reg_targets_per_im >= fpn_limits[0]
-      gt_max = max_reg_targets_per_im <= fpn_limits[1]
+      gt_min = max_reg_targets_per_im >= level_limits[0]
+      gt_max = max_reg_targets_per_im <= level_limits[1]
      is_in_boxes = tf.logical_and(gt_min, gt_max)
    else:
      is_in_boxes = tf.reduce_min(box_delta, axis=-1) > 0.0
@@ -290,11 +363,10 @@ class YoloAnchorLabeler:
    is_in_boxes_and_center = tf.logical_and(is_in_index, is_in_boxes_and_center)

    if self.use_tie_breaker:
-      inf = 10000000
      boxes_all = tf.cast(is_in_boxes_and_center, area.dtype) 
-      boxes_all = ((boxes_all * area) + ((1 - boxes_all) * inf))
+      boxes_all = ((boxes_all * area) + ((1 - boxes_all) * INF))
      boxes_min = tf.reduce_min(boxes_all, axis = -1, keepdims = True)
-      boxes_min = tf.where(boxes_min == inf, -1.0, boxes_min)
+      boxes_min = tf.where(boxes_min == INF, -1.0, boxes_min)
      is_in_boxes_and_center = boxes_all == boxes_min

    # construct the index update grid
@@ -314,33 +386,60 @@ class YoloAnchorLabeler:
    indexes = tf.concat([y, x, tf.zeros_like(t)], axis=-1)
    return indexes, samples

-  def __call__(self, 
-               key, 
-               boxes, 
-               classes, 
-               anchors, 
-               width, 
-               height, 
-               stride, 
-               scale_xy, 
-               num_instances, 
-               fpn_limits = None):
+  def build_label_per_path(self, 
+                           key, 
+                           boxes, 
+                           classes, 
+                           width, 
+                           height, 
+                           iou_index = None):
+    """Builds the labels for one path."""
+    stride = self.level_strides[key]
+    scale_xy = self.center_radius[key] if self.center_radius is not None else 1
+
+    width = tf.cast(width//stride, boxes.dtype)
+    height = tf.cast(height//stride, boxes.dtype)
+    
+    if self.anchor_free_level_limits is None:
+      (boxes, classes, 
+       anchors, num_anchors) = self._get_anchor_id(key, boxes, classes, 
+                                                   width, height, stride, 
+                                                   iou_index = iou_index)
+      boxes, classes, centers = self._get_centers(boxes, classes, anchors, 
+                                                  width, height, scale_xy)
+      ind_mask = tf.ones_like(classes)
+      updates = tf.concat([boxes, ind_mask, classes], axis = -1)
+    else:
+      num_anchors = 1
+      (centers, updates) = self._get_anchor_free(key, boxes, classes, height, 
+                                                 width, stride, scale_xy)
+      boxes, ind_mask, classes = tf.split(updates, [4, 1, 1], axis = -1)
+      
+    width = tf.cast(width, tf.int32)
+    height = tf.cast(height, tf.int32)
+    full = tf.zeros([height, width, num_anchors, 1], dtype=classes.dtype)
+    full = tf.tensor_scatter_nd_add(full, centers, ind_mask)
+
+    num_instances = int(self.num_instances[key])
+    centers = preprocessing_ops.pad_max_instances(
+      centers, num_instances, pad_value=0, pad_axis=0)
+    updates = preprocessing_ops.pad_max_instances(
+      updates, num_instances, pad_value=0, pad_axis=0)
+
+    updates = tf.cast(updates, self.dtype)
+    full = tf.cast(full, self.dtype)
+    return centers, updates, full
+
+  def __call__(self, boxes, classes, width, height):
    """Builds the labels for a single image, not functional in batch mode. 
    
    Args: 
      boxes: `Tensor` of shape [None, 4] indicating the object locations in 
        an image. 
      classes: `Tensor` of shape [None] indicating the each objects classes.
-      anchors: `List[List[int, float]]` representing the anchor boxes to build 
-        the model against. 
      width: `int` for the images width. 
      height: `int` for the images height.
-      stride: `int` for how much the image gets scaled at this level.
-      scale_xy: `float` for the center shifts to apply when finding center 
-        assignments for a box. 
      num_instances: `int` for the maximum number of expanded boxes to allow. 
-      fpn_limits: `List[int]` given no anchor boxes this is used to limit the 
-        boxes assied to the each fpn level based on the levels receptive feild. 

    Returns: 
      centers: `Tensor` of shape [None, 3] of indexes in the final grid where 
@@ -349,35 +448,27 @@ class YoloAnchorLabeler:
      full: `Tensor` of [width/stride, height/stride, num_anchors, 1] holding 
        a mask of where boxes are locates for confidence losses. 
    """
-    boxes = box_ops.yxyx_to_xcycwh(boxes)
+    indexes = {}
+    updates = {}
+    true_grids = {}
+    iou_index = None

-    width //= stride
-    height //= stride
-    width = tf.cast(width, boxes.dtype)
-    height = tf.cast(height, boxes.dtype)
-
-    if fpn_limits is None:
-      offset = tf.cast(0.5 * (scale_xy - 1), boxes.dtype)
-      (boxes, classes, 
-       anchors, num_anchors) = self._get_anchor_id(key, boxes, classes, anchors, 
-                                                   width, height, stride)
-      boxes, classes, centers = self._get_centers(boxes, classes, anchors, 
-                                                  width, height, offset)
-      ind_mask = tf.ones_like(classes)
-      updates = tf.concat([boxes, ind_mask, classes], axis = -1)
-    else:
-      (centers, updates) = self._get_anchor_free(boxes, classes, height, 
-                                                   width, stride, fpn_limits)
-      boxes, ind_mask, classes = tf.split(updates, [4, 1, 1], axis = -1)
-      num_anchors = 1
+    boxes = box_ops.yxyx_to_xcycwh(boxes)
+    if not self.best_matches_only and self.anchor_free_level_limits is None: 
+      # stitch and search boxes across fpn levels
+      anchorsvec = []
+      for stitch in self.anchors:
+        anchorsvec.extend(self.anchors[stitch])

+      stride = tf.cast([width, height], boxes.dtype)
+      # get the best anchor for each box
+      iou_index, _ = get_best_anchor(boxes, anchorsvec, stride,
+                                        width=1.0, height=1.0, 
+                                        best_match_only=False, 
+                                        use_tie_breaker=self.use_tie_breaker,
+                                        iou_thresh=self.match_threshold)

-    width = tf.cast(width, tf.int32)
-    height = tf.cast(height, tf.int32)
-    full = tf.zeros([height, width, num_anchors, 1], dtype=classes.dtype)
-    full = tf.tensor_scatter_nd_add(full, centers, ind_mask)
-    centers = preprocessing_ops.pad_max_instances(
-      centers, int(num_instances), pad_value=0, pad_axis=0)
-    updates = preprocessing_ops.pad_max_instances(
-      updates, int(num_instances), pad_value=0, pad_axis=0)
-    return centers, updates, full
+    for key in self.keys:
+      indexes[key], updates[key], true_grids[key] = self.build_label_per_path(
+        key, boxes, classes, width, height, iou_index = iou_index)
+    return indexes, updates, true_grids 
\ No newline at end of file
--- a/official/vision/beta/projects/yolo/ops/loss_utils.py
+++ b/official/vision/beta/projects/yolo/ops/loss_utils.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 """Yolo loss utility functions."""
+
 import numpy as np
 import tensorflow as tf

@@ -129,6 +130,10 @@ def build_grid(indexes, truths, preds, ind_mask, update=False, grid=None):
  indexes = apply_mask(tf.cast(ind_mask, indexes.dtype), indexes)
  indexes = (indexes + (ind_mask - 1))

+  # mask truths
+  truths = apply_mask(tf.cast(ind_mask, truths.dtype), truths)
+  truths = (truths + (tf.cast(ind_mask, truths.dtype) - 1))
+
  # reshape the indexes into the correct shape for the loss,
  # just flatten all indexes but the last
  indexes = tf.reshape(indexes, [-1, 4])
@@ -157,26 +162,16 @@ def build_grid(indexes, truths, preds, ind_mask, update=False, grid=None):
 class GridGenerator:
  """Grid generator that generates anchor grids for box decoding."""

-  def __init__(self, anchors, masks=None, scale_anchors=None):
+  def __init__(self, anchors, scale_anchors=None):
    """Initialize Grid Generator.

    Args:
      anchors: A `List[List[int]]` for the anchor boxes that are used in the
        model at all levels.
-      masks: A `List[int]` for the output level that this specific model output
-        Level.
      scale_anchors: An `int` for how much to scale this level to get the
        original input shape.
    """
    self.dtype = tf.keras.backend.floatx()
-    if masks is not None:
-      self._num = len(masks)
-    else:
-      self._num = tf.shape(anchors)[0]
-
-    if masks is not None:
-      anchors = [anchors[mask] for mask in masks]
-
    self._scale_anchors = scale_anchors
    self._anchors = tf.convert_to_tensor(anchors)
    return
@@ -331,18 +326,10 @@ class PairWiseSearch:
               pred_classes,
               boxes,
               classes,
-               scale=None,
-               yxyx=True,
               clip_thresh=0.0):
    num_boxes = tf.shape(boxes)[-2]
    num_tiles = (num_boxes // TILE_SIZE) - 1

-    if yxyx:
-      boxes = box_ops.yxyx_to_xcycwh(boxes)
-
-    if scale is not None:
-      boxes = boxes * tf.stop_gradient(scale)
-
    if self._min_conf > 0.0:
      pred_classes = tf.cast(pred_classes > self._min_conf, pred_classes.dtype)

@@ -540,7 +527,6 @@ def _anchor_free_scale_boxes(encoded_boxes,
                             height,
                             stride,
                             grid_points,
-                             scale_xy,
                             darknet=False):
  """Decode models boxes using FPN stride under anchor free conditions."""
  # split the boxes
@@ -549,7 +535,6 @@ def _anchor_free_scale_boxes(encoded_boxes,

  # build a scaling tensor to get the offset of th ebox relative to the image
  scaler = tf.convert_to_tensor([height, width, height, width])
-  scale_xy = tf.cast(scale_xy, encoded_boxes.dtype)

  scale_down = lambda x, y: x / y
  scale_up = lambda x, y: x * y
@@ -557,10 +542,6 @@ def _anchor_free_scale_boxes(encoded_boxes,
    scale_down = tf.grad_pass_through(scale_down)
    scale_up = tf.grad_pass_through(scale_up)

-  # scale the centers and find the offset of each box relative to
-  # their center pixel
-  pred_xy = pred_xy * scale_xy - 0.5 * (scale_xy - 1)
-
  # scale the offsets and add them to the grid points or a tensor that is
  # the realtive location of each pixel
  box_xy = (grid_points + pred_xy)
@@ -624,7 +605,7 @@ def get_predicted_box(width,
  if box_type == 'anchor_free':
    (scaler, scaled_box,
     pred_box) = _anchor_free_scale_boxes(encoded_boxes, width, height, stride,
-                                        grid_points, scale_xy, darknet=darknet)
+                                        grid_points, darknet=darknet)
  elif darknet:

    # pylint:disable=unbalanced-tuple-unpacking

--- a/official/vision/beta/projects/yolo/ops/mosaic.py
+++ b/official/vision/beta/projects/yolo/ops/mosaic.py
@@ -17,7 +17,7 @@ import random
 import tensorflow as tf
 import tensorflow_addons as tfa

-from yolo.ops import preprocessing_ops
+from official.vision.beta.projects.yolo.ops import preprocessing_ops
 from official.vision.beta.ops import box_ops
 from official.vision.beta.ops import preprocess_ops

@@ -396,4 +396,3 @@ class Mosaic:
      return self._apply
    else:
      return self._skip
-      
\ No newline at end of file
--- a/official/vision/beta/projects/yolo/ops/preprocessing_ops.py
+++ b/official/vision/beta/projects/yolo/ops/preprocessing_ops.py
@@ -4,8 +4,6 @@ import random
 import os

 import tensorflow_addons as tfa
-from official.vision.beta.projects.yolo.ops import box_ops
-from official.vision.beta.projects.yolo.ops import loss_utils
 from official.vision.beta.ops import box_ops as bbox_ops

 PAD_VALUE = 114
@@ -122,6 +120,11 @@ def pad_max_instances(value, instances, pad_value=0, pad_axis=0):
  nshape = tf.concat([shape[:pad_axis], pad, shape[(pad_axis + 1):]], axis=0)
  pad_tensor = tf.fill(nshape, tf.cast(pad_value, dtype=value.dtype))
  value = tf.concat([value, pad_tensor], axis=pad_axis)
+
+  if isinstance(instances, int):
+    vshape = value.get_shape().as_list()
+    vshape[pad_axis] = instances
+    value.set_shape(vshape)
  return value


@@ -317,10 +320,43 @@ def resize_and_jitter_image(image,
                            cut=None,
                            method=tf.image.ResizeMethod.BILINEAR,
                            seed=None):
-  """WIP"""
+  """Resize, Pad, and distort a given input image.
+  
+  Args:
+    image: a `Tensor` of shape [height, width, 3] representing an image.
+    desired_size: a `Tensor` or `int` list/tuple of two elements representing
+      [height, width] of the desired actual output image size.
+    jitter: an `int` representing the maximum jittering that can be applied to
+      the image.
+    letter_box: a `bool` representing if letterboxing should be applied.
+    random_pad: a `bool` representing if random padding should be applied.
+    crop_only: a `bool` representing if only cropping will be applied.
+    shiftx: a `float` indicating if the image is in the
+      left or right.
+    shifty: a `float` value indicating if the image is in the
+      top or bottom.
+    cut: a `float` value indicating the desired center of the final patched
+      image.
+    method: function to resize input image to scaled image.
+    seed: seed for random scale jittering.
+  
+  Returns:
+    image_: a `Tensor` of shape [height, width, 3] where [height, width]
+      equals to `desired_size`.
+    infos: a 2D `Tensor` that encodes the information of the image and the
+      applied preprocessing. It is in the format of
+      [[original_height, original_width], [desired_height, desired_width],
+        [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
+      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
+      the scaling factor, which is the ratio of
+      scaled dimension / original dimension.
+    cast([original_width, original_height, width, height, ptop, pleft, pbottom,
+      pright], tf.float32): a `Tensor` containing the information of the image
+        and the applied preprocessing.
+  """

  def intersection(a, b):
-    """Find the intersection of 2 crop boxes."""
+    """Find the intersection between 2 crops"""
    minx = tf.maximum(a[0], b[0])
    miny = tf.maximum(a[1], b[1])
    maxx = tf.minimum(a[2], b[2])
@@ -328,11 +364,10 @@ def resize_and_jitter_image(image,
    return tf.convert_to_tensor([minx, miny, maxx, maxy])

  def cast(values, dtype):
-    """Cast a list of items to a givne data type to reduce lines of code"""
    return [tf.cast(value, dtype) for value in values]

  if jitter > 0.5 or jitter < 0:
-    raise Exception("maximum change in aspect ratio must be between 0 and 0.5")
+    raise Exception('maximum change in aspect ratio must be between 0 and 0.5')

  with tf.name_scope('resize_and_jitter_image'):
    # Cast all parameters to a usable float data type.