# yolo_input.py

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Detection Data parser and processing for YOLO."""
import tensorflow as tf

from official.projects.yolo.ops import anchor
from official.projects.yolo.ops import preprocessing_ops
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import box_ops as bbox_ops
from official.vision.ops import preprocess_ops


class Parser(parser.Parser):
  """Parse the dataset into the YOLO model format."""

  def __init__(self,
               output_size,
               anchors,
               expanded_strides,
               level_limits=None,
               max_num_instances=200,
               area_thresh=0.1,
               aug_rand_hue=1.0,
               aug_rand_saturation=1.0,
               aug_rand_brightness=1.0,
               letter_box=False,
               random_pad=True,
               random_flip=True,
               jitter=0.0,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               aug_rand_translate=0.0,
               aug_rand_perspective=0.0,
               aug_rand_angle=0.0,
               anchor_t=4.0,
               scale_xy=None,
               best_match_only=False,
               darknet=False,
               use_tie_breaker=True,
               dtype='float32',
               seed=None):
    """Stores the parsing/augmentation settings and builds the anchor labeler.

    Args:
      output_size: `Tensor` or `List` holding [height, width] of the output
        image. Both dimensions must be divisible by the stride of every level
        listed in `anchors`; this is asserted at construction time.
      anchors: `Dict[List[Union[int, float]]]` of the anchor boxes to be used
        in each level.
      expanded_strides: `Dict[int]` giving how much the model scales down the
        image at each level. For example, if level 3 down samples the image by
        a factor of 16, the dictionary carries {3: 16}, meaning that relative
        to the original image the shapes must be reduced by a factor of 16 to
        compute the loss. Entries are looked up with `str(level)` for each
        level found in `anchors`.
      level_limits: `List` of the box sizes allowed at each FPN level, as is
        done in the FCOS and YOLOX papers for anchor free box assignment.
      max_num_instances: `int` number of boxes to compute the loss on.
      area_thresh: `float` minimum area a box must keep to be passed through
        for optimization.
      aug_rand_hue: `float` indicating the maximum scaling value for hue. Hue
        will be scaled between 1 - value and 1 + value.
      aug_rand_saturation: `float` indicating the maximum scaling value for
        saturation. Saturation will be scaled between 1/value and value.
      aug_rand_brightness: `float` indicating the maximum scaling value for
        brightness. Brightness will be scaled between 1/value and value.
      letter_box: `boolean` indicating whether, from the start of the data
        pipeline and regardless of the preprocessing ops that are used, the
        aspect ratio of the images should be preserved.
      random_pad: `bool` indicating whether to use padding to apply random
        translation; true for darknet yolo, false for scaled yolo.
      random_flip: `boolean` indicating whether or not to randomly flip the
        image horizontally.
      jitter: `float` maximum change in aspect ratio expected in each
        preprocessing step. `None` is treated as 0.0.
      aug_scale_min: `float` indicating the minimum scaling value for image
        scale jitter.
      aug_scale_max: `float` indicating the maximum scaling value for image
        scale jitter.
      aug_rand_translate: `float` ranging from 0 to 1 indicating the maximum
        amount to randomly translate an image.
      aug_rand_perspective: `float` ranging from 0.000 to 0.001 indicating how
        much to perspective warp the image.
      aug_rand_angle: `float` indicating the maximum value for the rotation
        angle. The angle will be changed between 0 and value.
      anchor_t: `float` indicating the threshold over which an anchor will be
        considered for prediction. At zero, all the anchors will be used and
        at 1.0 only the best will be used. For anchor thresholds larger than
        1.0 we stop using the IOU for anchor comparison and resort directly to
        comparing the width and height; this is used for the scaled models.
      scale_xy: dictionary of `float` values indicating how far each pixel can
        see outside of its containment of 1.0. A value of 1.2 indicates there
        is a 20% extended radius around each pixel that this specific pixel
        can predict values for a center at. The center can range from
        0 - value/2 to 1 + value/2. This value is set in the yolo filter and
        reused here. There should be one value for scale_xy for each level
        from min_level to max_level.
      best_match_only: `boolean` indicating how boxes are selected for
        optimization.
      darknet: `boolean` indicating which data pipeline to use. Setting to
        True swaps the pipeline to output images relative to Yolov4 and older.
      use_tie_breaker: `boolean` indicating whether to use the anchor
        threshold value.
      dtype: `str` indicating the output datatype of the data pipeline,
        selected from {"float32", "float16", "bfloat16"}.
      seed: `int` the seed for random number generation.
    """
    # Each level named in `anchors` needs a stride that evenly divides both
    # output dimensions.
    for level in anchors:
      stride = expanded_strides[str(level)]
      assert output_size[1] % stride == 0
      assert output_size[0] % stride == 0

    # Output geometry and the fixed number of instances per example.
    self._image_h = output_size[0]
    self._image_w = output_size[1]
    self._max_num_instances = max_num_instances

    # Scale / aspect-ratio / affine augmentation settings.
    self._jitter = jitter if jitter is not None else 0.0
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    self._aug_rand_translate = aug_rand_translate
    self._aug_rand_perspective = aug_rand_perspective
    self._aug_rand_angle = aug_rand_angle

    # Spatial layout settings.
    self._letter_box = letter_box
    self._random_pad = random_pad
    self._random_flip = random_flip

    # Color-space (HSV) augmentation settings.
    self._aug_rand_hue = aug_rand_hue
    self._aug_rand_saturation = aug_rand_saturation
    self._aug_rand_brightness = aug_rand_brightness

    # Pipeline selection, box filtering and anchor-free level limits.
    self._darknet = darknet
    self._area_thresh = area_thresh
    self._level_limits = level_limits

    # Output datatype and random seed shared by the preprocessing ops.
    self._dtype = dtype
    self._seed = seed

    # Callable that turns ground-truth boxes/classes into per-level targets.
    self._label_builder = anchor.YoloAnchorLabeler(
        anchors=anchors,
        anchor_free_level_limits=level_limits,
        level_strides=expanded_strides,
        center_radius=scale_xy,
        max_num_instances=max_num_instances,
        match_threshold=anchor_t,
        best_matches_only=best_match_only,
        use_tie_breaker=use_tie_breaker,
        darknet=darknet,
        dtype=dtype)

  def _pad_infos_object(self, image):
    """Builds a placeholder info tensor from the image's own dimensions.

    Args:
      image: `Tensor` whose first two dimensions are used as the image size.

    Returns:
      A float32 `Tensor` of four stacked rows: the first two dimensions of
      `image` (twice), a row of ones and a row of zeros.
    """
    # Leading two dimensions of the image, as floats.
    dims = tf.cast(tf.shape(image)[:2], tf.float32)
    rows = [dims, dims, tf.ones_like(dims), tf.zeros_like(dims)]
    return tf.stack(rows)

  def _jitter_scale(self, image, shape, letter_box, jitter, random_pad,
                    aug_scale_min, aug_scale_max, translate, angle,
                    perspective):
    """Resizes and jitters the image, then applies an affine warp.

    Args:
      image: input image `Tensor`.
      shape: target [height, width] handed to both preprocessing ops.
      letter_box: forwarded to `resize_and_jitter_image`.
      jitter: forwarded to `resize_and_jitter_image`.
      random_pad: forwarded to both preprocessing ops.
      aug_scale_min: lower scale bound forwarded to `affine_warp_image`.
      aug_scale_max: upper scale bound forwarded to `affine_warp_image`.
      translate: forwarded to `affine_warp_image`.
      angle: forwarded to `affine_warp_image` as `degrees`.
      perspective: forwarded to `affine_warp_image`.

    Returns:
      A tuple `(image, infos, affine)`: the processed image, the list of info
      objects collected along the way, and the third value returned by
      `affine_warp_image`.
    """
    # Scale augmentation is requested when either bound differs from 1.0; the
    # resize step then runs with `crop_only=True`.
    crop_only = not (aug_scale_min == 1.0 and aug_scale_max == 1.0)

    # The jitter step yields a single info object in crop-only mode, so a
    # placeholder built from the untouched image is put in front of it.
    infos = [self._pad_infos_object(image)] if crop_only else []

    image, crop_info, _ = preprocessing_ops.resize_and_jitter_image(
        image,
        shape,
        letter_box=letter_box,
        jitter=jitter,
        crop_only=crop_only,
        random_pad=random_pad,
        seed=self._seed,
    )
    infos.extend(crop_info)

    # The second value returned by the warp is not needed here.
    warped, _, affine = preprocessing_ops.affine_warp_image(
        image,
        shape,
        scale_min=aug_scale_min,
        scale_max=aug_scale_max,
        translate=translate,
        degrees=angle,
        perspective=perspective,
        random_pad=random_pad,
        seed=self._seed,
    )
    return warped, infos, affine

  def _parse_train_data(self, data):
    """Augments one example and builds its training labels.

    Args:
      data: decoded example. This method reads the `image`,
        `groundtruth_boxes`, `groundtruth_classes` and `is_mosaic` entries and
        hands the whole mapping on to `_build_label`.

    Returns:
      The `(image, labels)` pair produced by `_build_label` with
      `is_training=True`.
    """
    image = data['image']
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']

    # Optional horizontal flip applied to the image and boxes together.
    if self._random_flip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(
          image, boxes, seed=self._seed)

    if not data['is_mosaic']:
      # Non-mosaic samples go through the jitter / scale / affine pipeline.
      image, infos, affine = self._jitter_scale(
          image,
          [self._image_h, self._image_w],
          self._letter_box,
          self._jitter,
          self._random_pad,
          self._aug_scale_min,
          self._aug_scale_max,
          self._aug_rand_translate,
          self._aug_rand_angle,
          self._aug_rand_perspective)

      # Move the boxes along with the image, then clip and filter them.
      boxes, inds = preprocessing_ops.transform_and_clip_boxes(
          boxes,
          infos,
          affine=affine,
          shuffle_boxes=False,
          area_thresh=self._area_thresh,
          filter_and_clip_boxes=True,
          seed=self._seed)
      classes = tf.gather(classes, inds)
      info = infos[-1]
    else:
      # Mosaic samples are only resized to the output size; boxes that are
      # empty at that resolution are dropped.
      image = tf.image.resize(
          image, (self._image_h, self._image_w), method='nearest')
      pixel_boxes = bbox_ops.denormalize_boxes(
          boxes, tf.cast([self._image_h, self._image_w], tf.float32))
      inds = bbox_ops.get_non_empty_box_indices(pixel_boxes)
      boxes = tf.gather(boxes, inds)
      classes = tf.gather(classes, inds)
      info = self._pad_infos_object(image)

    # Cast to the selected datatype, rescale by 1/255, then randomly perturb
    # hue, saturation and brightness.
    image = tf.cast(image, dtype=self._dtype) / 255.0
    hsv_darknet_mode = self._darknet or self._level_limits is not None
    image = preprocessing_ops.image_rand_hsv(
        image,
        self._aug_rand_hue,
        self._aug_rand_saturation,
        self._aug_rand_brightness,
        seed=self._seed,
        darknet=hsv_darknet_mode)

    # Build the label dictionary for the augmented sample.
    return self._build_label(
        image, boxes, classes, info, inds, data, is_training=True)

  def _parse_eval_data(self, data):
    """Resizes one example without augmentation and builds its eval labels.

    Args:
      data: decoded example. This method reads the `image`,
        `groundtruth_boxes` and `groundtruth_classes` entries and hands the
        whole mapping on to `_build_label`.

    Returns:
      The `(image, labels)` pair produced by `_build_label` with
      `is_training=False`.
    """
    # Cast the image to the selected datatype before resizing.
    image = tf.cast(data['image'], dtype=self._dtype)
    boxes = data['groundtruth_boxes']
    classes = data['groundtruth_classes']

    # Resize with zero jitter, no random padding and both shifts fixed at 0.5.
    target_size = [self._image_h, self._image_w]
    image, infos, _ = preprocessing_ops.resize_and_jitter_image(
        image,
        target_size,
        jitter=0.0,
        random_pad=False,
        letter_box=self._letter_box,
        shiftx=0.5,
        shifty=0.5)
    image = image / 255.0

    # Transform the boxes to follow the resize; filtering and clipping are
    # switched off and the area threshold is zero.
    boxes, inds = preprocessing_ops.transform_and_clip_boxes(
        boxes,
        infos,
        shuffle_boxes=False,
        area_thresh=0.0,
        filter_and_clip_boxes=False)
    classes = tf.gather(classes, inds)

    # The last info object is what the groundtruths report as `image_info`.
    return self._build_label(
        image, boxes, classes, infos[-1], inds, data, is_training=False)

  def set_shape(self, values, pad_axis=0, pad_value=0, inds=None):
    """Pads `values` to `max_num_instances` and pins its static shape.

    Args:
      values: `Tensor` to pad.
      pad_axis: `int` axis along which to pad.
      pad_value: value used to fill the padding.
      inds: optional indices; when given, `values` is first gathered with
        them.

    Returns:
      The padded `Tensor`, whose static size along `pad_axis` is set to
      `max_num_instances`.
    """
    selected = values if inds is None else tf.gather(values, inds)

    # Record the static shape before padding so only `pad_axis` is overridden.
    static_shape = selected.get_shape().as_list()

    padded = preprocessing_ops.pad_max_instances(
        selected,
        self._max_num_instances,
        pad_axis=pad_axis,
        pad_value=pad_value)

    static_shape[pad_axis] = self._max_num_instances
    padded.set_shape(static_shape)
    return padded

  def _build_label(self,
                   image,
                   gt_boxes,
                   gt_classes,
                   info,
                   inds,
                   data,
                   is_training=True):
    """Assembles the label dictionary shared by the train and eval paths.

    Args:
      image: processed image `Tensor`; its last static dimension is set to 3.
      gt_boxes: ground-truth boxes after preprocessing.
      gt_classes: ground-truth classes matching `gt_boxes`.
      info: image info object, stored as `image_info` in the groundtruths.
      inds: indices used to gather `groundtruth_is_crowd` for evaluation.
      data: the decoded example; `source_id` is always read, and the
        `height`, `width`, `groundtruth_boxes`, `groundtruth_classes`,
        `groundtruth_area` and `groundtruth_is_crowd` entries are read when
        not training.
      is_training: `bool`; when False a padded `groundtruths` entry is added
        to the labels.

    Returns:
      A tuple `(image, labels)` where `labels` holds `inds`, `upds`,
      `true_conf`, `source_id`, `bbox`, `classes` and, for evaluation,
      `groundtruths`.
    """
    # Pin the channel dimension of the image's static shape.
    static_shape = image.get_shape().as_list()
    static_shape[-1] = 3
    image.set_shape(static_shape)

    # Targets computed by the anchor labeler from the boxes and classes.
    label_inds, label_upds, true_conf = self._label_builder(
        gt_boxes, gt_classes, self._image_w, self._image_h)

    # Pad boxes and classes to a fixed number of instances.
    padded_boxes = self.set_shape(gt_boxes, pad_axis=0, pad_value=0)
    padded_classes = self.set_shape(gt_classes, pad_axis=0, pad_value=-1)

    labels = {
        'inds': label_inds,
        'upds': label_upds,
        'true_conf': true_conf,
        'source_id': utils.process_source_id(data['source_id']),
        'bbox': tf.cast(padded_boxes, dtype=self._dtype),
        'classes': tf.cast(padded_classes, dtype=self._dtype),
    }

    if is_training:
      return image, labels

    # Evaluation additionally carries the original annotations, with boxes
    # denormalized by the `height`/`width` stored in `data`.
    original_size = tf.cast([data['height'], data['width']], gt_boxes.dtype)
    groundtruths = {
        'source_id': labels['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_boxes'])[0],
        'image_info': info,
        'boxes': bbox_ops.denormalize_boxes(
            data['groundtruth_boxes'], original_size),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(
            tf.gather(data['groundtruth_is_crowd'], inds), tf.int32),
    }
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    labels['groundtruths'] = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    return image, labels