converters.py

# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converter functions for working with bounding box formats."""

from typing import List
from typing import Optional

import tensorflow as tf


# Internal exception to propagate the fact images was not passed to a converter that
# needs it
class RequiresImagesException(Exception):
    pass


def _encode_box_to_deltas(
    anchors: tf.Tensor,
    boxes: tf.Tensor,
    anchor_format: str,
    box_format: str,
    variance: Optional[List[float]] = None,
):
    """Converts bounding_boxes from `center_yxhw` to delta format."""
    if variance and len(variance) != 4:
        raise ValueError(f"`variance` must be length 4, got {variance}")
    encoded_anchors = convert_format(
        anchors,
        source=anchor_format,
        target="center_yxhw",
    )
    boxes = convert_format(
        boxes,
        source=box_format,
        target="center_yxhw",
    )
    anchor_dimensions = tf.maximum(encoded_anchors[..., 2:], tf.keras.backend.epsilon())
    box_dimensions = tf.maximum(boxes[..., 2:], tf.keras.backend.epsilon())
    # anchors be unbatched, boxes can either be batched or unbatched.
    boxes_delta = tf.concat(
        [
            (boxes[..., :2] - encoded_anchors[..., :2]) / anchor_dimensions,
            tf.math.log(box_dimensions / anchor_dimensions),
        ],
        axis=-1,
    )
    if variance:
        boxes_delta /= variance
    return boxes_delta


def _decode_deltas_to_boxes(
    anchors: tf.Tensor,
    boxes_delta: tf.Tensor,
    anchor_format: str,
    box_format: str,
    variance: Optional[List[float]] = None,
):
    """Converts bounding_boxes from delta format to `center_yxhw`."""
    if variance and len(variance) != 4:
        raise ValueError(f"`variance` must be length 4, got {variance}")
    tf.nest.assert_same_structure(anchors, boxes_delta)

    def decode_single_level(anchor, box_delta):
        encoded_anchor = convert_format(
            anchor,
            source=anchor_format,
            target="center_yxhw",
        )
        if variance:
            box_delta = box_delta * variance
        # anchors be unbatched, boxes can either be batched or unbatched.
        box = tf.concat(
            [
                box_delta[..., :2] * encoded_anchor[..., 2:] + encoded_anchor[..., :2],
                tf.math.exp(box_delta[..., 2:]) * encoded_anchor[..., 2:],
            ],
            axis=-1,
        )
        box = convert_format(box, source="center_yxhw", target=box_format)
        return box

    if isinstance(anchors, dict) and isinstance(boxes_delta, dict):
        boxes = {}
        for lvl, anchor in anchors.items():
            boxes[lvl] = decode_single_level(anchor, boxes_delta[lvl])
        return boxes
    else:
        return decode_single_level(anchors, boxes_delta)


def _center_yxhw_to_xyxy(boxes, images=None, image_shape=None):
    y, x, height, width, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    return tf.concat(
        [x - width / 2.0, y - height / 2.0, x + width / 2.0, y + height / 2.0, rest],
        axis=-1,
    )


def _center_xywh_to_xyxy(boxes, images=None, image_shape=None):
    x, y, width, height, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    return tf.concat(
        [x - width / 2.0, y - height / 2.0, x + width / 2.0, y + height / 2.0, rest],
        axis=-1,
    )


def _xywh_to_xyxy(boxes, images=None, image_shape=None):
    x, y, width, height, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    return tf.concat([x, y, x + width, y + height, rest], axis=-1)


def _xyxy_to_center_yxhw(boxes, images=None, image_shape=None):
    left, top, right, bottom, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    return tf.concat(
        [(top + bottom) / 2.0, (left + right) / 2.0, bottom - top, right - left, rest],
        axis=-1,
    )


def _rel_xywh_to_xyxy(boxes, images=None, image_shape=None):
    image_height, image_width = _image_shape(images, image_shape, boxes)
    x, y, width, height, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    return tf.concat(
        [
            image_width * x,
            image_height * y,
            image_width * (x + width),
            image_height * (y + height),
            rest,
        ],
        axis=-1,
    )


def _xyxy_no_op(boxes, images=None, image_shape=None):
    return boxes


def _xyxy_to_xywh(boxes, images=None, image_shape=None):
    left, top, right, bottom, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    return tf.concat(
        [left, top, right - left, bottom - top, rest],
        axis=-1,
    )


def _xyxy_to_rel_xywh(boxes, images=None, image_shape=None):
    image_height, image_width = _image_shape(images, image_shape, boxes)
    left, top, right, bottom, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    left, right = (
        left / image_width,
        right / image_width,
    )
    top, bottom = top / image_height, bottom / image_height
    return tf.concat(
        [left, top, right - left, bottom - top, rest],
        axis=-1,
    )


def _xyxy_to_center_xywh(boxes, images=None, image_shape=None):
    left, top, right, bottom, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    return tf.concat(
        [(left + right) / 2.0, (top + bottom) / 2.0, right - left, bottom - top, rest],
        axis=-1,
    )


def _rel_xyxy_to_xyxy(boxes, images=None, image_shape=None):
    image_height, image_width = _image_shape(images, image_shape, boxes)
    left, top, right, bottom, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    left, right = left * image_width, right * image_width
    top, bottom = top * image_height, bottom * image_height
    return tf.concat(
        [left, top, right, bottom, rest],
        axis=-1,
    )


def _xyxy_to_rel_xyxy(boxes, images=None, image_shape=None):
    image_height, image_width = _image_shape(images, image_shape, boxes)
    left, top, right, bottom, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    left, right = left / image_width, right / image_width
    top, bottom = top / image_height, bottom / image_height
    return tf.concat(
        [left, top, right, bottom, rest],
        axis=-1,
    )


def _yxyx_to_xyxy(boxes, images=None, image_shape=None):
    y1, x1, y2, x2, rest = tf.split(boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1)
    return tf.concat([x1, y1, x2, y2, rest], axis=-1)


def _rel_yxyx_to_xyxy(boxes, images=None, image_shape=None):
    image_height, image_width = _image_shape(images, image_shape, boxes)
    top, left, bottom, right, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    left, right = left * image_width, right * image_width
    top, bottom = top * image_height, bottom * image_height
    return tf.concat(
        [left, top, right, bottom, rest],
        axis=-1,
    )


def _xyxy_to_yxyx(boxes, images=None, image_shape=None):
    x1, y1, x2, y2, rest = tf.split(boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1)
    return tf.concat([y1, x1, y2, x2, rest], axis=-1)


def _xyxy_to_rel_yxyx(boxes, images=None, image_shape=None):
    image_height, image_width = _image_shape(images, image_shape, boxes)
    left, top, right, bottom, rest = tf.split(
        boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
    )
    left, right = left / image_width, right / image_width
    top, bottom = top / image_height, bottom / image_height
    return tf.concat(
        [top, left, bottom, right, rest],
        axis=-1,
    )


TO_XYXY_CONVERTERS = {
    "xywh": _xywh_to_xyxy,
    "center_xywh": _center_xywh_to_xyxy,
    "center_yxhw": _center_yxhw_to_xyxy,
    "rel_xywh": _rel_xywh_to_xyxy,
    "xyxy": _xyxy_no_op,
    "rel_xyxy": _rel_xyxy_to_xyxy,
    "yxyx": _yxyx_to_xyxy,
    "rel_yxyx": _rel_yxyx_to_xyxy,
}

FROM_XYXY_CONVERTERS = {
    "xywh": _xyxy_to_xywh,
    "center_xywh": _xyxy_to_center_xywh,
    "center_yxhw": _xyxy_to_center_yxhw,
    "rel_xywh": _xyxy_to_rel_xywh,
    "xyxy": _xyxy_no_op,
    "rel_xyxy": _xyxy_to_rel_xyxy,
    "yxyx": _xyxy_to_yxyx,
    "rel_yxyx": _xyxy_to_rel_yxyx,
}


def convert_format(
    boxes, source, target, images=None, image_shape=None, dtype="float32"
):
    f"""Converts bounding_boxes from one format to another.

    Supported formats are:
    - `"xyxy"`, also known as `corners` format.  In this format the first four axes
        represent [left, top, right, bottom] in that order.
    - `"rel_xyxy"`.  In this format, the axes are the same as `"xyxy"` but the x
        coordinates are normalized using the image width, and the y axes the image
        height.  All values in `rel_xyxy` are in the range (0, 1).
    - `"xywh"`.  In this format the first four axes represent
        [left, top, width, height].
    - `"rel_xywh".  In this format the first four axes represent
        [left, top, width, height], just like `"xywh"`.  Unlike `"xywh"`, the values
        are in the range (0, 1) instead of absolute pixel values.
    - `"center_xyWH"`.  In this format the first two coordinates represent the x and y
        coordinates of the center of the bounding box, while the last two represent
        the width and height of the bounding box.
    - `"center_yxHW"`.  In this format the first two coordinates represent the y and x
        coordinates of the center of the bounding box, while the last two represent
        the height and width of the bounding box.
    - `"yxyx"`.  In this format the first four axes represent [top, left, bottom, right]
        in that order.
    - `"rel_yxyx"`.  In this format, the axes are the same as `"yxyx"` but the x
        coordinates are normalized using the image width, and the y axes the image
        height.  All values in `rel_yxyx` are in the range (0, 1).
    Formats are case insensitive.  It is recommended that you capitalize width and
    height to maximize the visual difference between `"xyWH"` and `"xyxy"`.

    Relative formats, abbreviated `rel`, make use of the shapes of the `images` passed.
    In these formats, the coordinates, widths, and heights are all specified as
    percentages of the host image.  `images` may be a ragged Tensor.  Note that using a
    ragged Tensor for images may cause a substantial performance loss, as each image
    will need to be processed separately due to the mismatching image shapes.

    Usage:

    ```python
    boxes = load_coco_dataset()
    boxes_in_xywh = keras_cv.bounding_box.convert_format(
        boxes,
        source='xyxy',
        target='xyWH'
    )
    ```

    Args:
        boxes: tf.Tensor representing bounding boxes in the format specified in the
            `source` parameter.  `boxes` can optionally have extra dimensions stacked on
             the final axis to store metadata.  boxes should be a 3D Tensor, with the
             shape `[batch_size, num_boxes, *]`.
        source: One of {" ".join([f'"{f}"' for f in TO_XYXY_CONVERTERS.keys()])}.  Used
            to specify the original format of the `boxes` parameter.
        target: One of {" ".join([f'"{f}"' for f in TO_XYXY_CONVERTERS.keys()])}.  Used
            to specify the destination format of the `boxes` parameter.
        images: (Optional) a batch of images aligned with `boxes` on the first axis.
            Should be at least 3 dimensions, with the first 3 dimensions representing:
            `[batch_size, height, width]`.  Used in some converters to compute relative
            pixel values of the bounding box dimensions.  Required when transforming
            from a rel format to a non-rel format.
        dtype: the data type to use when transforming the boxes.  Defaults to
            `tf.float32`.
    """
    if images is not None and image_shape is not None:
        raise ValueError(
            "convert_format() expects either `images` or `image_shape`, "
            f"but not both.  Received images={images} image_shape={image_shape}"
        )

    _validate_image_shape(image_shape)

    source = source.lower()
    target = target.lower()
    if source not in TO_XYXY_CONVERTERS:
        raise ValueError(
            f"`convert_format()` received an unsupported format for the argument "
            f"`source`.  `source` should be one of {TO_XYXY_CONVERTERS.keys()}. "
            f"Got source={source}"
        )
    if target not in FROM_XYXY_CONVERTERS:
        raise ValueError(
            f"`convert_format()` received an unsupported format for the argument "
            f"`target`.  `target` should be one of {FROM_XYXY_CONVERTERS.keys()}. "
            f"Got target={target}"
        )

    boxes = tf.cast(boxes, dtype)
    if source == target:
        return boxes

    # rel->rel conversions should not require images
    if source.startswith("rel") and target.startswith("rel"):
        source = source.replace("rel_", "", 1)
        target = target.replace("rel_", "", 1)

    boxes, images, squeeze = _format_inputs(boxes, images)
    to_xyxy_fn = TO_XYXY_CONVERTERS[source]
    from_xyxy_fn = FROM_XYXY_CONVERTERS[target]

    try:
        in_xyxy = to_xyxy_fn(boxes, images=images, image_shape=image_shape)
        result = from_xyxy_fn(in_xyxy, images=images, image_shape=image_shape)
    except RequiresImagesException:
        raise ValueError(
            "convert_format() must receive `images` or `image_shape` when transforming "
            f"between relative and absolute formats."
            f"convert_format() received source=`{format}`, target=`{format}, "
            f"but images={images} and image_shape={image_shape}."
        )

    return _format_outputs(result, squeeze)


def _format_inputs(boxes, images):
    boxes_rank = len(boxes.shape)
    if boxes_rank > 3:
        raise ValueError(
            "Expected len(boxes.shape)=2, or len(boxes.shape)=3, got "
            f"len(boxes.shape)={boxes_rank}"
        )
    boxes_includes_batch = boxes_rank == 3
    # Determine if images needs an expand_dims() call
    if images is not None:
        images_rank = len(images.shape)
        if images_rank > 4:
            raise ValueError(
                "Expected len(images.shape)=2, or len(images.shape)=3, got "
                f"len(images.shape)={images_rank}"
            )
        images_include_batch = images_rank == 4
        if boxes_includes_batch != images_include_batch:
            raise ValueError(
                "convert_format() expects both boxes and images to be batched, or both "
                f"boxes and images to be unbatched.  Received len(boxes.shape)={boxes_rank}, "
                f"len(images.shape)={images_rank}.  Expected either len(boxes.shape)=2 AND "
                "len(images.shape)=3, or len(boxes.shape)=3 AND len(images.shape)=4."
            )
        if not images_include_batch:
            images = tf.expand_dims(images, axis=0)

    if not boxes_includes_batch:
        return tf.expand_dims(boxes, axis=0), images, True
    return boxes, images, False


def _validate_image_shape(image_shape):
    # Escape early if image_shape is None and skip validation.
    if image_shape is None:
        return
    # tuple/list
    if isinstance(image_shape, (tuple, list)):
        if len(image_shape) != 3:
            raise ValueError(
                "image_shape should be of length 3, but got "
                f"image_shape={image_shape}"
            )
        return

    # tensor
    if isinstance(image_shape, tf.Tensor):
        if len(image_shape.shape) > 1:
            raise ValueError(
                "image_shape.shape should be (3), but got "
                f"image_shape.shape={image_shape.shape}"
            )
        if image_shape.shape[0] != 3:
            raise ValueError(
                "image_shape.shape should be (3), but got "
                f"image_shape.shape={image_shape.shape}"
            )
        return

    # Warn about failure cases
    raise ValueError(
        "Expected image_shape to be either a tuple, list, Tensor.  "
        f"Received image_shape={image_shape}"
    )


def _format_outputs(boxes, squeeze):
    if squeeze:
        return tf.squeeze(boxes, axis=0)
    return boxes


def _image_shape(images, image_shape, boxes):
    if images is None and image_shape is None:
        raise RequiresImagesException()

    if image_shape is None:
        if not isinstance(images, tf.RaggedTensor):
            image_shape = tf.shape(images)
            height, width = image_shape[1], image_shape[2]
        else:
            height = tf.reshape(images.row_lengths(), (-1, 1))
            width = tf.reshape(tf.reduce_max(images.row_lengths(axis=2), 1), (-1, 1))
    else:
        height, width = image_shape[0], image_shape[1]
    return tf.cast(height, boxes.dtype), tf.cast(width, boxes.dtype)