"docs/how_to_download_models_en.md" did not exist on "acab8de50f2d520f414e52ca5d40a1af922d8938"
preprocessing.py 12.2 KB
Newer Older
zhanggzh's avatar
zhanggzh committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from tensorflow.keras import backend

from keras_cv import core


def transform_value_range(images, original_range, target_range, dtype=tf.float32):
    """transforms values in input tensor from original_range to target_range.
    This function is intended to be used in preprocessing layers that
    rely upon color values.  This allows us to assume internally that
    the input tensor is always in the range [0, 255].

    Args:
        images: the set of images to transform to the target range.
        original_range: the value range to transform from.
        target_range: the value range to transform to.
        dtype: the dtype to compute the conversion with.  Defaults to tf.float32.

    Returns:
        a new Tensor with values in the target range.

    Usage:
    ```python
    original_range = [0, 1]
    target_range = [0, 255]
    images = keras_cv.utils.preprocessing.transform_value_range(
        images,
        original_range,
        target_range
    )
    images = tf.math.minimum(images + 10, 255)
    images = keras_cv.utils.preprocessing.transform_value_range(
        images,
        target_range,
        original_range
    )
    ```
    """
    if original_range[0] == target_range[0] and original_range[1] == target_range[1]:
        return images

    images = tf.cast(images, dtype=dtype)
    original_min_value, original_max_value = _unwrap_value_range(
        original_range, dtype=dtype
    )
    target_min_value, target_max_value = _unwrap_value_range(target_range, dtype=dtype)

    # images in the [0, 1] scale
    images = (images - original_min_value) / (original_max_value - original_min_value)

    scale_factor = target_max_value - target_min_value
    return (images * scale_factor) + target_min_value


def _unwrap_value_range(value_range, dtype=tf.float32):
    min_value, max_value = value_range
    min_value = tf.cast(min_value, dtype=dtype)
    max_value = tf.cast(max_value, dtype=dtype)
    return min_value, max_value


def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
    """Blend image1 and image2 using 'factor'.

    `factor` should be in the range [0, 1].  A value of 0.0 means only image1
    is used. A value of 1.0 means only image2 is used.  A value between 0.0
    and 1.0 means we linearly interpolate the pixel values between the two
    images.  A value greater than 1.0 "extrapolates" the difference
    between the two pixel values, and we clip the results to values
    between 0 and 255.
    Args:
      image1: An image Tensor of type tf.float32 with value range [0, 255].
      image2: An image Tensor of type tf.float32 with value range [0, 255].
      factor: A floating point value of 0.0 or above.
    Returns:
      A blended image Tensor.
    """
    difference = image2 - image1
    scaled = factor * difference
    temp = image1 + scaled
    return tf.clip_by_value(temp, 0.0, 255.0)
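
# A minimal usage sketch (values assumed for illustration): a factor of 0.5
# yields the per-pixel midpoint of the two images, while a factor above 1.0
# extrapolates and clips to [0, 255].
#
#   image1 = tf.zeros((8, 8, 3))         # all-black image in [0, 255]
#   image2 = tf.fill((8, 8, 3), 255.0)   # all-white image
#   mixed = blend(image1, image2, 0.5)   # every pixel is 127.5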


def parse_factor(param, min_value=0.0, max_value=1.0, param_name="factor", seed=None):
    """Converts `param` into a `core.FactorSampler`.

    A scalar becomes the range `(min_value, param)`, a two-element tuple is
    treated as a `(lower, upper)` range, and an existing `FactorSampler` is
    returned unchanged.
    """
    if isinstance(param, core.FactorSampler):
        return param

    if isinstance(param, (float, int)):
        param = (min_value, param)

    if param[0] > param[1]:
        raise ValueError(
            f"`{param_name}[0] > {param_name}[1]`, `{param_name}[0]` must be <= "
            f"`{param_name}[1]`.  Got `{param_name}={param}`"
        )
    if (min_value is not None and param[0] < min_value) or (
        max_value is not None and param[1] > max_value
    ):
        raise ValueError(
            f"`{param_name}` should be inside of range [{min_value}, {max_value}]. "
            f"Got {param_name}={param}"
        )

    if param[0] == param[1]:
        return core.ConstantFactorSampler(param[0])

    return core.UniformFactorSampler(param[0], param[1], seed=seed)
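
# A sketch of the three accepted forms (values assumed for illustration):
#
#   parse_factor(0.5)          # scalar -> UniformFactorSampler(0.0, 0.5)
#   parse_factor((0.2, 0.2))   # degenerate range -> ConstantFactorSampler(0.2)
#   parse_factor((0.1, 0.4))   # range -> UniformFactorSampler(0.1, 0.4)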


def random_inversion(random_generator):
    """Randomly returns a -1 or a 1 based on the provided random_generator.

    This can be used by KPLs to randomly invert sampled values.

    Args:
        random_generator: a Keras random number generator.  An instance can be
            passed from the `self._random_generator` attribute of a
            `BaseImageAugmentationLayer`.

    Returns:
        either -1 or 1.
    """
    negate = random_generator.random_uniform((), 0, 1, dtype=tf.float32) > 0.5
    negate = tf.cond(negate, lambda: -1.0, lambda: 1.0)
    return negate
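
# Hypothetical usage inside a `BaseImageAugmentationLayer` subclass (the
# `max_shift` attribute is assumed for illustration):
#
#   sign = random_inversion(self._random_generator)  # -1.0 or 1.0
#   shift = sign * self.max_shift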


def get_rotation_matrix(angles, image_height, image_width, name=None):
    """Returns projective transform(s) for the given angle(s).
    Args:
      angles: A vector with one angle, in radians, to rotate each image in the
        batch. The rank must be statically known (the shape is not
        `TensorShape(None)`).
      image_height: Height of the image(s) to be transformed.
      image_width: Width of the image(s) to be transformed.
      name: The name of the op.
    Returns:
      A tensor of shape `(num_images, 8)`: projective transforms which can be
        given to the `transform` function in this module. If one row of the
        transforms is `[a0, a1, a2, b0, b1, b2, c0, c1]`, then it maps the
        *output* point `(x, y)` to a transformed *input* point
        `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
        where `k = c0 x + c1 y + 1`.
    """
    with backend.name_scope(name or "rotation_matrix"):
        x_offset = (
            (image_width - 1)
            - (tf.cos(angles) * (image_width - 1) - tf.sin(angles) * (image_height - 1))
        ) / 2.0
        y_offset = (
            (image_height - 1)
            - (tf.sin(angles) * (image_width - 1) + tf.cos(angles) * (image_height - 1))
        ) / 2.0
        num_angles = tf.shape(angles)[0]
        return tf.concat(
            values=[
                tf.cos(angles)[:, None],
                -tf.sin(angles)[:, None],
                x_offset[:, None],
                tf.sin(angles)[:, None],
                tf.cos(angles)[:, None],
                y_offset[:, None],
                tf.zeros((num_angles, 2), tf.float32),
            ],
            axis=1,
        )
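
# A minimal sketch (shapes and angles assumed for illustration): one angle per
# image, in radians, rotating each image about its center.
#
#   angles = tf.constant([1.5708, -1.5708])  # roughly +/-90 degrees
#   matrices = get_rotation_matrix(angles, image_height=64, image_width=64)
#   # matrices has shape (2, 8), one projective transform per image.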


def get_translation_matrix(translations, name=None):
    """Returns projective transform(s) for the given translation(s).
    Args:
      translations: A matrix of shape `(num_images, 2)`, where each row is a
        `[dx, dy]` translation to apply to the corresponding image.
      name: The name of the op.
    Returns:
      A tensor of shape `(num_images, 8)` projective transforms which can be given
        to `transform`.
    """
    with backend.name_scope(name or "translation_matrix"):
        num_translations = tf.shape(translations)[0]
        # The translation matrix looks like:
        #     [[1 0 -dx]
        #      [0 1 -dy]
        #      [0 0 1]]
        # where the last entry is implicit.
        # Translation matrices are always float32.
        return tf.concat(
            values=[
                tf.ones((num_translations, 1), tf.float32),
                tf.zeros((num_translations, 1), tf.float32),
                -translations[:, 0, None],
                tf.zeros((num_translations, 1), tf.float32),
                tf.ones((num_translations, 1), tf.float32),
                -translations[:, 1, None],
                tf.zeros((num_translations, 2), tf.float32),
            ],
            axis=1,
        )
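
# A minimal sketch (offsets assumed for illustration): one `[dx, dy]` pair per
# image.
#
#   translations = tf.constant([[5.0, 0.0], [0.0, -3.0]])
#   matrices = get_translation_matrix(translations)  # shape (2, 8)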


def transform(
    images,
    transforms,
    fill_mode="reflect",
    fill_value=0.0,
    interpolation="bilinear",
    output_shape=None,
    name=None,
):
    """Applies the given transform(s) to the image(s).

    Args:
      images: A tensor of shape
        `(num_images, num_rows, num_columns, num_channels)` (NHWC). The rank must
        be statically known (the shape is not `TensorShape(None)`).
      transforms: Projective transform matrix/matrices. A vector of length 8 or
        tensor of size N x 8. If one row of transforms is [a0, a1, a2, b0, b1, b2,
        c0, c1], then it maps the *output* point `(x, y)` to a transformed *input*
        point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where
        `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to the
        transform mapping input points to output points. Note that gradients are
        not backpropagated into transformation parameters.
      fill_mode: Points outside the boundaries of the input are filled according
        to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`).
      fill_value: a float representing the value used for points outside the
        boundaries when `fill_mode="constant"`.
      interpolation: Interpolation mode. Supported values: `"nearest"`,
        `"bilinear"`.
      output_shape: Output dimension after the transform, `[height, width]`.
        If `None`, output is the same size as input image.
      name: The name of the op.

    Fill mode behavior for each valid value is as follows:

    - reflect (d c b a | a b c d | d c b a)
    The input is extended by reflecting about the edge of the last pixel.

    - constant (k k k k | a b c d | k k k k)
    The input is extended by filling all values beyond the edge with the same
    constant value k, as specified by `fill_value`.

    - wrap (a b c d | a b c d | a b c d)
    The input is extended by wrapping around to the opposite edge.

    - nearest (a a a a | a b c d | d d d d)
    The input is extended by the nearest pixel.

    Input shape:
      4D tensor with shape: `(samples, height, width, channels)`,
        in `"channels_last"` format.

    Output shape:
      4D tensor with shape: `(samples, height, width, channels)`,
        in `"channels_last"` format.

    Returns:
      Image(s) with the same type and shape as `images`, with the given
      transform(s) applied. Points mapped outside of the input image are
      filled according to `fill_mode` and `fill_value`.

    Raises:
      TypeError: If `images` is an invalid type.
      ValueError: If `output_shape` is not a 1-D int32 Tensor.
    """
    with backend.name_scope(name or "transform"):
        if output_shape is None:
            output_shape = tf.shape(images)[1:3]
            if not tf.executing_eagerly():
                output_shape_value = tf.get_static_value(output_shape)
                if output_shape_value is not None:
                    output_shape = output_shape_value

        output_shape = tf.convert_to_tensor(output_shape, tf.int32, name="output_shape")

        if not output_shape.get_shape().is_compatible_with([2]):
            raise ValueError(
                "output_shape must be a 1-D Tensor of 2 elements: "
                "new_height, new_width, instead got "
                "{}".format(output_shape)
            )

        fill_value = tf.convert_to_tensor(fill_value, tf.float32, name="fill_value")

        return tf.raw_ops.ImageProjectiveTransformV3(
            images=images,
            output_shape=output_shape,
            fill_value=fill_value,
            transforms=transforms,
            fill_mode=fill_mode.upper(),
            interpolation=interpolation.upper(),
        )
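
# A minimal end-to-end sketch (shapes and values assumed for illustration):
# build rotation transforms with `get_rotation_matrix` and warp a batch of
# images with them.
#
#   images = tf.random.uniform((2, 64, 64, 3), 0, 255)
#   angles = tf.constant([0.1, -0.2])
#   matrices = get_rotation_matrix(angles, image_height=64, image_width=64)
#   rotated = transform(images, matrices, fill_mode="constant", fill_value=0.0)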


def ensure_tensor(inputs, dtype=None):
    """Ensures the input is a Tensor, SparseTensor or RaggedTensor."""
    if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor, tf.SparseTensor)):
        inputs = tf.convert_to_tensor(inputs, dtype)
    if dtype is not None and inputs.dtype != dtype:
        inputs = tf.cast(inputs, dtype)
    return inputs


def check_fill_mode_and_interpolation(fill_mode, interpolation):
    if fill_mode not in {"reflect", "wrap", "constant", "nearest"}:
        raise NotImplementedError(
            " Want fillmode  to be one of `reflect`, `wrap`, "
            "`constant` or `nearest`. Got `fill_mode` {}. ".format(fill_mode)
        )
    if interpolation not in {"nearest", "bilinear"}:
        raise NotImplementedError(
            "Unknown `interpolation` {}. Only `nearest` and "
            "`bilinear` are supported.".format(interpolation)
        )