# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrappers for third party lvis to be used within object_detection.

Usage example: given a set of images with ids in the list image_ids
and corresponding lists of numpy arrays encoding groundtruth (boxes,
masks and classes) and detections (masks, scores and classes), where
elements of each list correspond to detections/annotations of a single image,
then evaluation can be invoked as follows:

  groundtruth = lvis_tools.LVISWrapper(groundtruth_dict)
  detections = lvis_results.LVISResults(groundtruth, detections_list)
  evaluator = lvis_tools.LVISEvalWrapper(groundtruth, detections,
                                         iou_type='segm')
  summary_metrics = evaluator.ComputeMetrics()

TODO(jonathanhuang): Add support for exporting to JSON.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging

from lvis import eval as lvis_eval
from lvis import lvis
import numpy as np
from pycocotools import mask
import six
from six.moves import range


def RleCompress(masks):
  """Compresses mask using Run-length encoding provided by pycocotools.

  Args:
    masks: uint8 numpy array of shape [mask_height, mask_width] with values in
      {0, 1}.

  Returns:
    A pycocotools Run-length encoding of the mask.
  """
  rle = mask.encode(np.asfortranarray(masks))
  rle['counts'] = six.ensure_str(rle['counts'])
  return rle
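

# Illustrative sketch of how RleCompress might be used (kept as comments so
# nothing executes at import time); the toy mask below is hypothetical:
#
#   toy_mask = np.array([[0, 1, 1],
#                        [0, 1, 1]], dtype=np.uint8)
#   rle = RleCompress(toy_mask)
#   # rle is a dict with 'size' ([height, width]) and a string 'counts' field,
#   # suitable for the 'segmentation' field of an LVIS annotation.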


def _ConvertBoxToCOCOFormat(box):
  """Converts a box in [ymin, xmin, ymax, xmax] format to COCO format.

  This is a utility function for converting from our internal
  [ymin, xmin, ymax, xmax] convention to the convention used by the COCO API
  i.e., [xmin, ymin, width, height].

  Args:
    box: a [ymin, xmin, ymax, xmax] numpy array

  Returns:
    a list of floats representing [xmin, ymin, width, height]
  """
  return [float(box[1]), float(box[0]), float(box[3] - box[1]),
          float(box[2] - box[0])]
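

# For example (illustrative values), a box [ymin=10, xmin=20, ymax=50, xmax=60]
# in the internal convention converts to [xmin, ymin, width, height]:
#
#   _ConvertBoxToCOCOFormat(np.array([10., 20., 50., 60.]))
#   # -> [20.0, 10.0, 40.0, 40.0]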


class LVISWrapper(lvis.LVIS):
  """Wrapper for the lvis.LVIS class."""

  def __init__(self, dataset, detection_type='bbox'):
    """LVISWrapper constructor.

    See https://www.lvisdataset.org/dataset for a description of the format.
    By default, the lvis.LVIS class constructor reads from a JSON file.
    This function duplicates the same behavior but loads from a dictionary,
    allowing us to perform evaluation without writing to external storage.

    Args:
      dataset: a dictionary holding bounding box annotations in the COCO format.
      detection_type: type of detections being wrapped. Can be one of ['bbox',
        'segmentation']

    Raises:
      ValueError: if detection_type is unsupported.
    """
    self.logger = logging.getLogger(__name__)
    self.logger.info('Loading annotations.')
    if detection_type not in ('bbox', 'segmentation'):
      raise ValueError('Unsupported detection type: %s' % detection_type)
    self._detection_type = detection_type
    self.dataset = dataset
    self._create_index()
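

# Minimal sketch (hypothetical values) of the dictionary LVISWrapper expects;
# only the top-level keys are meant to be taken literally here:
#
#   groundtruth_dict = {
#       'categories': [{'id': 1, 'name': 'person', 'frequency': 'f',
#                       'image_count': 1, 'instance_count': 1}],
#       'images': [{'id': 1, 'height': 100, 'width': 100,
#                   'neg_category_ids': [],
#                   'not_exhaustive_category_ids': []}],
#       'annotations': [{'id': 1, 'image_id': 1, 'category_id': 1,
#                        'bbox': [20., 10., 40., 40.], 'area': 1600.}],
#   }
#   groundtruth = LVISWrapper(groundtruth_dict)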


class LVISEvalWrapper(lvis_eval.LVISEval):
  """LVISEval wrapper."""

  def __init__(self, groundtruth=None, detections=None, iou_type='bbox'):
    lvis_eval.LVISEval.__init__(
        self, groundtruth, detections, iou_type=iou_type)
    self._iou_type = iou_type

  def ComputeMetrics(self):
    """Runs the LVIS evaluation and returns its summary metrics dictionary."""
    self.run()
    return self.results
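

# Illustrative readout of the metrics returned by ComputeMetrics (key names
# follow the lvis package and may vary by version; 'AP' shown as an example):
#
#   metrics = LVISEvalWrapper(groundtruth, detections,
#                             iou_type='segm').ComputeMetrics()
#   mask_ap = metrics['AP']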


def ExportSingleImageGroundtruthToLVIS(image_id,
                                       next_annotation_id,
                                       category_id_set,
                                       groundtruth_boxes,
                                       groundtruth_classes,
                                       groundtruth_masks=None,
                                       groundtruth_area=None):
  """Export groundtruth of a single image to LVIS format.

  This function converts groundtruth detection annotations represented as numpy
  arrays to dictionaries that can be ingested by the LVIS evaluation API. Note
  that the image_ids provided here must match the ones given to
  ExportSingleImageDetectionMasksToLVIS. We assume that boxes, classes and masks
  are in correspondence - that is, e.g., groundtruth_boxes[i, :], and
  groundtruth_classes[i] are associated with the same groundtruth annotation.

  In the exported result, "area" fields are set from groundtruth_area when it
  is provided and positive, and are otherwise computed from the groundtruth
  bounding box.

  Args:
    image_id: a unique image identifier castable to integer.
    next_annotation_id: integer specifying the first id to use for the
      groundtruth annotations. All annotations are assigned consecutive
      integer ids starting from this value.
    category_id_set: A set of valid class ids. Groundtruth with classes not in
      category_id_set are dropped.
    groundtruth_boxes: numpy array (float32) with shape [num_gt_boxes, 4]
    groundtruth_classes: numpy array (int) with shape [num_gt_boxes]
    groundtruth_masks: optional uint8 numpy array of shape [num_gt_boxes,
      image_height, image_width] containing groundtruth masks.
    groundtruth_area: numpy array (float32) with shape [num_gt_boxes]. If
      provided, then the area values (in the original absolute coordinates) will
      be populated instead of calculated from bounding box coordinates.

  Returns:
    a list of groundtruth annotations for a single image in the COCO format.

  Raises:
    ValueError: if (1) groundtruth_boxes and groundtruth_classes do not have
      the right lengths, (2) any of the elements inside these lists does not
      have the correct shape, or (3) image_ids are not integers.
  """

  if len(groundtruth_classes.shape) != 1:
    raise ValueError('groundtruth_classes is '
                     'expected to be of rank 1.')
  if len(groundtruth_boxes.shape) != 2:
    raise ValueError('groundtruth_boxes is expected to be of '
                     'rank 2.')
  if groundtruth_boxes.shape[1] != 4:
    raise ValueError('groundtruth_boxes should have '
                     'shape[1] == 4.')
  num_boxes = groundtruth_classes.shape[0]
  if num_boxes != groundtruth_boxes.shape[0]:
    raise ValueError('Corresponding entries in groundtruth_classes '
                     'and groundtruth_boxes should have '
                     'compatible shapes (i.e., agree on the 0th dimension). '
                     'Classes shape: %d. Boxes shape: %d. Image ID: %s' % (
                         groundtruth_classes.shape[0],
                         groundtruth_boxes.shape[0], image_id))

  groundtruth_list = []
  for i in range(num_boxes):
    if groundtruth_classes[i] in category_id_set:
      if groundtruth_area is not None and groundtruth_area[i] > 0:
        area = float(groundtruth_area[i])
      else:
        area = float((groundtruth_boxes[i, 2] - groundtruth_boxes[i, 0]) *
                     (groundtruth_boxes[i, 3] - groundtruth_boxes[i, 1]))
      export_dict = {
          'id':
              next_annotation_id + i,
          'image_id':
              int(image_id),
          'category_id':
              int(groundtruth_classes[i]),
          'bbox':
              list(_ConvertBoxToCOCOFormat(groundtruth_boxes[i, :])),
          'area': area,
      }
      if groundtruth_masks is not None:
        export_dict['segmentation'] = RleCompress(groundtruth_masks[i])

      groundtruth_list.append(export_dict)
  return groundtruth_list
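

# Illustrative call sketch for ExportSingleImageGroundtruthToLVIS (all values
# hypothetical):
#
#   gt_annotations = ExportSingleImageGroundtruthToLVIS(
#       image_id=1,
#       next_annotation_id=1,
#       category_id_set=set([1, 2]),
#       groundtruth_boxes=np.array([[10., 20., 50., 60.]], dtype=np.float32),
#       groundtruth_classes=np.array([1], dtype=np.int32))
#   # -> [{'id': 1, 'image_id': 1, 'category_id': 1,
#   #      'bbox': [20.0, 10.0, 40.0, 40.0], 'area': 1600.0}]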


def ExportSingleImageDetectionMasksToLVIS(image_id,
                                          category_id_set,
                                          detection_masks,
                                          detection_scores,
                                          detection_classes):
  """Export detection masks of a single image to LVIS format.

  This function converts detections represented as numpy arrays to dictionaries
  that can be ingested by the LVIS evaluation API. We assume that
  detection_masks, detection_scores, and detection_classes are in correspondence
  - that is: detection_masks[i, :], detection_classes[i] and detection_scores[i]
    are associated with the same annotation.

  Args:
    image_id: unique image identifier castable to integer.
    category_id_set: A set of valid class ids. Detections with classes not in
      category_id_set are dropped.
    detection_masks: uint8 numpy array of shape [num_detections, image_height,
      image_width] containing detection_masks.
    detection_scores: float numpy array of shape [num_detections] containing
      scores for detection masks.
    detection_classes: integer numpy array of shape [num_detections] containing
      the classes for detection masks.

  Returns:
    a list of detection mask annotations for a single image in the COCO format.

  Raises:
    ValueError: if (1) detection_masks, detection_scores and detection_classes
      do not have the right lengths, (2) any of the elements inside these lists
      does not have the correct shape, or (3) image_ids are not integers.
  """

  if len(detection_classes.shape) != 1 or len(detection_scores.shape) != 1:
    raise ValueError('All entries in detection_classes and detection_scores '
                     'are expected to be of rank 1.')
  num_boxes = detection_classes.shape[0]
  if not num_boxes == len(detection_masks) == detection_scores.shape[0]:
    raise ValueError('Corresponding entries in detection_classes, '
                     'detection_scores and detection_masks should have '
                     'compatible lengths and shapes. '
                     'Classes length: %d. Masks length: %d. '
                     'Scores length: %d' % (
                         detection_classes.shape[0], len(detection_masks),
                         detection_scores.shape[0]
                     ))
  detections_list = []
  for i in range(num_boxes):
    if detection_classes[i] in category_id_set:
      detections_list.append({
          'image_id': int(image_id),
          'category_id': int(detection_classes[i]),
          'segmentation': RleCompress(detection_masks[i]),
          'score': float(detection_scores[i])
      })
  return detections_list
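

# Illustrative call sketch for ExportSingleImageDetectionMasksToLVIS (all
# values hypothetical):
#
#   detections = ExportSingleImageDetectionMasksToLVIS(
#       image_id=1,
#       category_id_set=set([1]),
#       detection_masks=np.zeros([1, 100, 100], dtype=np.uint8),
#       detection_scores=np.array([0.9], dtype=np.float32),
#       detection_classes=np.array([1], dtype=np.int32))
#   # Each entry carries 'image_id', 'category_id', an RLE 'segmentation' and a
#   # float 'score', ready to be wrapped by lvis_results.LVISResults.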