# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Preprocessing ops for yolo."""
import random

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

from official.vision.ops import box_ops as bbox_ops

# Gray value used to fill padded regions (both tf.pad and affine warps below).
# NOTE(review): 114 presumably matches the Darknet/YOLO reference padding
# color — confirm against the reference implementation.
PAD_VALUE = 114

# Tracks whether set_random_seeds() has been called; when True, the helpers
# below drop their per-op seeds and rely on the global seeds instead.
GLOBAL_SEED_SET = False


def set_random_seeds(seed=0):
  """Sets all accessible global seeds to properly apply randomization.

  This is not the same as passing the seed as a variable to each call
  to tf.random. For more, see the documentation for tf.random on the
  tensorflow website https://www.tensorflow.org/api_docs/python/tf/random/set_seed.
  Note that passing the seed to each random number generator will not give you
  the expected behavior if you use more than one generator in a single
  function.

  Args:
    seed: `Optional[int]` representing the seed you want to use.
  """
  if seed is not None:
    global GLOBAL_SEED_SET
    random.seed(seed)
    GLOBAL_SEED_SET = True
  # Note: these are still called when seed is None; tf/np treat None as
  # "reseed non-deterministically".
  tf.random.set_seed(seed)
  np.random.seed(seed)


def random_uniform_strong(minval,
                          maxval,
                          dtype=tf.float32,
                          seed=None,
                          shape=None):
  """A unified function for consistent random number generation.

  Equivalent to tf.random.uniform, except that minval and maxval are flipped if
  minval is greater than maxval. Seed-safe random number generator.

  Args:
    minval: An `int` for a lower or upper endpoint of the interval from which
      to choose the random number.
    maxval: An `int` for the other endpoint.
    dtype: The output type of the tensor.
    seed: An `int` used to set the seed.
    shape: List or 1D tf.Tensor, output shape of the random generator.

  Returns:
    A random tensor of type `dtype` that falls between `minval` and `maxval`
    excluding the larger one.
  """
  if GLOBAL_SEED_SET:
    # A global seed is active (see set_random_seeds); drop the per-op seed so
    # determinism comes from the global seed alone.
    seed = None

  if minval > maxval:
    minval, maxval = maxval, minval
  return tf.random.uniform(
      shape=shape or [], minval=minval, maxval=maxval, seed=seed, dtype=dtype)


def random_scale(val, dtype=tf.float32, seed=None):
  """Generates a random number for scaling a parameter by multiplication.

  Generates a random number for the scale. Half of the time, the value is
  between [1.0, val) with uniformly distributed probability. In the other
  half, the value is the reciprocal of this value. The function is identical
  to the one in the original implementation:
  https://github.com/AlexeyAB/darknet/blob/a3714d0a/src/utils.c#L708-L713

  Args:
    val: A float representing the maximum scaling allowed.
    dtype: The output type of the tensor.
    seed: An `int` used to set the seed.

  Returns:
    The random scale.
  """
  scale = random_uniform_strong(1.0, val, dtype=dtype, seed=seed)
  # Fair coin flip: keep the scale or return its reciprocal.
  do_ret = random_uniform_strong(minval=0, maxval=2, dtype=tf.int32, seed=seed)
  if do_ret == 1:
    return scale
  return 1.0 / scale


def pad_max_instances(value, instances, pad_value=0, pad_axis=0):
  """Pad or clip the tensor value to a fixed length along a given axis.

  Pads a dimension of the tensor to have a maximum number of instances filling
  additional entries with the `pad_value`. Allows for selection of the padding
  axis.

  Args:
    value: An input tensor.
    instances: An `int` representing the maximum number of instances.
    pad_value: An `int` representing the value used for padding until the
      maximum number of instances is obtained.
    pad_axis: An `int` representing the axis index to pad.

  Returns:
    The output tensor whose dimensions match the input tensor except with the
    size along the `pad_axis` replaced by `instances`.
  """
  # Get the real (dynamic) shape of value.
  shape = tf.shape(value)

  # Resolve a negative padding axis to its positive index.
  if pad_axis < 0:
    pad_axis = tf.rank(value) + pad_axis

  # Determine how much of the tensor value to keep (clip when the axis is
  # already longer than `instances`).
  dim1 = shape[pad_axis]
  take = tf.math.reduce_min([instances, dim1])
  value, _ = tf.split(value, [take, -1], axis=pad_axis)

  # Pad the clipped tensor to the right shape.
  pad = tf.convert_to_tensor([tf.math.reduce_max([instances - dim1, 0])])
  nshape = tf.concat([shape[:pad_axis], pad, shape[(pad_axis + 1):]], axis=0)
  pad_tensor = tf.fill(nshape, tf.cast(pad_value, dtype=value.dtype))
  value = tf.concat([value, pad_tensor], axis=pad_axis)

  # When `instances` is a Python int the output size is statically known, so
  # record it in the static shape for downstream shape inference.
  if isinstance(instances, int):
    vshape = value.get_shape().as_list()
    vshape[pad_axis] = instances
    value.set_shape(vshape)
  return value


def get_image_shape(image):
  """Consistently gets the width and height of the image.

  Gets the shape of the image regardless of if the image is in the
  (batch_size, x, y, c) format or the (x, y, c) format.

  Args:
    image: A tensor who has either 3 or 4 dimensions.

  Returns:
    A tuple (height, width), where height is the height of the image
    and width is the width of the image.
  """
  shape = tf.shape(image)
  # shape.get_shape() gives the static rank of the image tensor.
  if shape.get_shape().as_list()[0] == 4:
    # Batched image: (batch, height, width, channels).
    width = shape[2]
    height = shape[1]
  else:
    # Unbatched image: (height, width, channels).
    width = shape[1]
    height = shape[0]
  return height, width


def _augment_hsv_darknet(image, rh, rs, rv, seed=None):
  """Randomize the hue, saturation, and brightness via the darknet method."""
  if rh > 0.0:
    # Hue is shifted additively; saturation/brightness are scaled.
    deltah = random_uniform_strong(-rh, rh, seed=seed)
    image = tf.image.adjust_hue(image, deltah)
  if rs > 0.0:
    deltas = random_scale(rs, seed=seed)
    image = tf.image.adjust_saturation(image, deltas)
  if rv > 0.0:
    deltav = random_scale(rv, seed=seed)
    image *= tf.cast(deltav, image.dtype)

  # Clip the values of the image between 0.0 and 1.0.
  # NOTE(review): assumes the input image is already normalized to [0, 1] —
  # confirm against the callers.
  image = tf.clip_by_value(image, 0.0, 1.0)
  return image


def _augment_hsv_torch(image, rh, rs, rv, seed=None):
  """Randomize the hue, saturation, and brightness via the pytorch method."""
  dtype = image.dtype
  image = tf.cast(image, tf.float32)
  image = tf.image.rgb_to_hsv(image)
  gen_range = tf.cast([rh, rs, rv], image.dtype)
  # OpenCV-style integer HSV ranges: H in [0, 180), S and V in [0, 255].
  scale = tf.cast([180, 255, 255], image.dtype)
  # Per-channel multiplicative gains in [1 - r, 1 + r).
  r = random_uniform_strong(
      -1, 1, shape=[3], dtype=image.dtype, seed=seed) * gen_range + 1

  # Quantize to integer HSV, apply the gains, then wrap/clip per channel.
  image = tf.math.floor(tf.cast(image, scale.dtype) * scale)
  image = tf.math.floor(tf.cast(image, r.dtype) * r)
  h, s, v = tf.split(image, 3, axis=-1)
  h = h % 180
  s = tf.clip_by_value(s, 0, 255)
  v = tf.clip_by_value(v, 0, 255)

  image = tf.concat([h, s, v], axis=-1)
  image = tf.cast(image, scale.dtype) / scale
  image = tf.image.hsv_to_rgb(image)
  return tf.cast(image, dtype)


def image_rand_hsv(image, rh, rs, rv, seed=None, darknet=False):
  """Randomly alters the hue, saturation, and brightness of an image.

  Args:
    image: `Tensor` of shape [None, None, 3] that needs to be altered.
    rh: `float32` used to indicate the maximum delta that can be multiplied to
      the hue.
    rs: `float32` used to indicate the maximum delta that can be multiplied to
      the saturation.
    rv: `float32` used to indicate the maximum delta that can be multiplied to
      the brightness.
    seed: `Optional[int]` for the seed to use in the random number generation.
    darknet: `bool` indicating whether the model was originally built in the
      Darknet or PyTorch library.

  Returns:
    The HSV altered image in the same datatype as the input image.
  """
  if darknet:
    image = _augment_hsv_darknet(image, rh, rs, rv, seed=seed)
  else:
    image = _augment_hsv_torch(image, rh, rs, rv, seed=seed)
  return image


def mosaic_cut(image, original_width, original_height, width, height, center,
               ptop, pleft, pbottom, pright, shiftx, shifty):
  """Generates a random center location to use for the mosaic operation.

  Given a center location, cuts the input image into a slice that will be
  concatenated with other slices with the same center in order to construct
  a final mosaicked image.

  Args:
    image: `Tensor` of shape [None, None, 3] that needs to be altered.
    original_width: `float` value indicating the original width of the image.
    original_height: `float` value indicating the original height of the image.
    width: `float` value indicating the final width of the image.
    height: `float` value indicating the final height of the image.
    center: `float` value indicating the desired center of the final patched
      image.
    ptop: `float` value indicating the top of the image without padding.
    pleft: `float` value indicating the left of the image without padding.
    pbottom: `float` value indicating the bottom of the image without padding.
    pright: `float` value indicating the right of the image without padding.
    shiftx: `float` 0.0 or 1.0 value indicating if the image is on the left or
      right.
    shifty: `float` 0.0 or 1.0 value indicating if the image is at the top or
      bottom.

  Returns:
    image: The cropped image in the same datatype as the input image.
    crop_info: `float` tensor that is applied to the boxes in order to select
      the boxes still contained within the image.
  """

  def cast(values, dtype):
    # Cast every value in the list to the given dtype.
    return [tf.cast(value, dtype) for value in values]

  with tf.name_scope('mosaic_cut'):
    center = tf.cast(center, width.dtype)
    zero = tf.cast(0.0, width.dtype)
    # `center` is (y, x); cut_x/cut_y is the mosaic seam in the output image.
    cut_x, cut_y = center[1], center[0]

    # Select the crop of the image to use: clamp each directional shift so the
    # crop stays inside both the source image and its quadrant of the mosaic.
    left_shift = tf.minimum(
        tf.minimum(cut_x, tf.maximum(zero, -pleft * width / original_width)),
        width - cut_x)
    top_shift = tf.minimum(
        tf.minimum(cut_y, tf.maximum(zero, -ptop * height / original_height)),
        height - cut_y)
    right_shift = tf.minimum(
        tf.minimum(width - cut_x,
                   tf.maximum(zero, -pright * width / original_width)), cut_x)
    bot_shift = tf.minimum(
        tf.minimum(height - cut_y,
                   tf.maximum(zero, -pbottom * height / original_height)),
        cut_y)

    (left_shift, top_shift, right_shift, bot_shift,
     zero) = cast([left_shift, top_shift, right_shift, bot_shift, zero],
                  tf.float32)

    # Build a crop offset and a crop size tensor to use for slicing. A size of
    # -1 (i.e. zero - 1) means "take everything remaining" for tf.slice.
    crop_offset = [zero, zero, zero]
    crop_size = [zero - 1, zero - 1, zero - 1]
    if shiftx == 0.0 and shifty == 0.0:
      # Top-left quadrant.
      crop_offset = [top_shift, left_shift, zero]
      crop_size = [cut_y, cut_x, zero - 1]
    elif shiftx == 1.0 and shifty == 0.0:
      # Top-right quadrant.
      crop_offset = [top_shift, cut_x - right_shift, zero]
      crop_size = [cut_y, width - cut_x, zero - 1]
    elif shiftx == 0.0 and shifty == 1.0:
      # Bottom-left quadrant.
      crop_offset = [cut_y - bot_shift, left_shift, zero]
      crop_size = [height - cut_y, cut_x, zero - 1]
    elif shiftx == 1.0 and shifty == 1.0:
      # Bottom-right quadrant.
      crop_offset = [cut_y - bot_shift, cut_x - right_shift, zero]
      crop_size = [height - cut_y, width - cut_x, zero - 1]

    # Contain and crop the image: never ask for more than the image holds.
    ishape = tf.cast(tf.shape(image)[:2], crop_size[0].dtype)
    crop_size[0] = tf.minimum(crop_size[0], ishape[0])
    crop_size[1] = tf.minimum(crop_size[1], ishape[1])
    crop_offset = tf.cast(crop_offset, tf.int32)
    crop_size = tf.cast(crop_size, tf.int32)
    image = tf.slice(image, crop_offset, crop_size)
    # crop_info rows: [pre-crop size, post-crop size, scale (identity),
    # offset] — the layout consumed by transform_and_clip_boxes below.
    crop_info = tf.stack([
        tf.cast(ishape, tf.float32),
        tf.cast(tf.shape(image)[:2], dtype=tf.float32),
        tf.ones_like(ishape, dtype=tf.float32),
        tf.cast(crop_offset[:2], tf.float32)
    ])

  return image, crop_info


def resize_and_jitter_image(image,
                            desired_size,
                            jitter=0.0,
                            letter_box=None,
                            random_pad=True,
                            crop_only=False,
                            shiftx=0.5,
                            shifty=0.5,
                            cut=None,
                            method=tf.image.ResizeMethod.BILINEAR,
                            seed=None):
  """Resize, Pad, and distort a given input image.

  Args:
    image: a `Tensor` of shape [height, width, 3] representing an image.
    desired_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the desired actual output image size.
    jitter: a `float` in [0, 0.5] representing the maximum jittering that can
      be applied to the image (as a fraction of each dimension).
    letter_box: a `bool` representing if letterboxing should be applied.
    random_pad: a `bool` representing if random padding should be applied.
    crop_only: a `bool` representing if only cropping will be applied.
    shiftx: a `float` indicating if the image is in the left or right.
    shifty: a `float` value indicating if the image is in the top or bottom.
    cut: a `float` value indicating the desired center of the final patched
      image.
    method: function to resize input image to scaled image.
    seed: seed for random scale jittering.

  Returns:
    image_: a `Tensor` of shape [height, width, 3] where [height, width]
      equals to `desired_size`.
    infos: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [desired_height, desired_width],
       [y_scale, x_scale], [y_offset, x_offset]], where
      [desired_height, desired_width] is the actual scaled image size, and
      [y_scale, x_scale] is the scaling factor, which is the ratio of
      scaled dimension / original dimension.
    cast([original_width, original_height, width, height, ptop, pleft,
      pbottom, pright], tf.float32): a `Tensor` containing the information of
      the image and the applied preprocessing.
  """

  def intersection(a, b):
    """Finds the intersection between 2 crops."""
    minx = tf.maximum(a[0], b[0])
    miny = tf.maximum(a[1], b[1])
    maxx = tf.minimum(a[2], b[2])
    maxy = tf.minimum(a[3], b[3])
    return tf.convert_to_tensor([minx, miny, maxx, maxy])

  def cast(values, dtype):
    # Cast every value in the list to the given dtype.
    return [tf.cast(value, dtype) for value in values]

  if jitter > 0.5 or jitter < 0:
    raise ValueError('maximum change in aspect ratio must be between 0 and 0.5')

  with tf.name_scope('resize_and_jitter_image'):
    # Cast all parameters to a usable float data type.
    jitter = tf.cast(jitter, tf.float32)
    original_dtype, original_dims = image.dtype, tf.shape(image)[:2]

    # original width, original height, desired width, desired height.
    original_width, original_height, width, height = cast(
        [original_dims[1], original_dims[0], desired_size[1], desired_size[0]],
        tf.float32)

    # Compute the random delta width and height etc. and randomize the
    # location of the corner points.
    jitter_width = original_width * jitter
    jitter_height = original_height * jitter
    pleft = random_uniform_strong(
        -jitter_width, jitter_width, jitter_width.dtype, seed=seed)
    pright = random_uniform_strong(
        -jitter_width, jitter_width, jitter_width.dtype, seed=seed)
    ptop = random_uniform_strong(
        -jitter_height, jitter_height, jitter_height.dtype, seed=seed)
    pbottom = random_uniform_strong(
        -jitter_height, jitter_height, jitter_height.dtype, seed=seed)

    # Letter box the image: expand the margins so the aspect ratio of the
    # cropped region matches the output aspect ratio.
    if letter_box:
      (image_aspect_ratio,
       input_aspect_ratio) = original_width / original_height, width / height
      distorted_aspect = image_aspect_ratio / input_aspect_ratio

      delta_h, delta_w = 0.0, 0.0
      pullin_h, pullin_w = 0.0, 0.0
      if distorted_aspect > 1:
        delta_h = ((original_width / input_aspect_ratio) - original_height) / 2
      else:
        delta_w = ((original_height * input_aspect_ratio) - original_width) / 2

      ptop = ptop - delta_h - pullin_h
      pbottom = pbottom - delta_h - pullin_h
      pright = pright - delta_w - pullin_w
      pleft = pleft - delta_w - pullin_w

    # Compute the width and height to crop or pad to, and clip all crops to
    # be contained within the image.
    swidth = original_width - pleft - pright
    sheight = original_height - ptop - pbottom
    src_crop = intersection([ptop, pleft, sheight + ptop, swidth + pleft],
                            [0, 0, original_height, original_width])

    # Random padding used for mosaic.
    h_ = src_crop[2] - src_crop[0]
    w_ = src_crop[3] - src_crop[1]
    if random_pad:
      rmh = tf.maximum(0.0, -ptop)
      rmw = tf.maximum(0.0, -pleft)
    else:
      rmw = (swidth - w_) * shiftx
      rmh = (sheight - h_) * shifty

    # Cast cropping params to usable dtype.
    src_crop = tf.cast(src_crop, tf.int32)

    # Compute padding parameters: [top, left, bottom, right] pad amounts.
    dst_shape = [rmh, rmw, rmh + h_, rmw + w_]
    ptop, pleft, pbottom, pright = dst_shape
    pad = dst_shape * tf.cast([1, 1, -1, -1], ptop.dtype)
    pad += tf.cast([0, 0, sheight, swidth], ptop.dtype)
    pad = tf.cast(pad, tf.int32)

    infos = []

    # Crop the image to desired size.
    cropped_image = tf.slice(
        image, [src_crop[0], src_crop[1], 0],
        [src_crop[2] - src_crop[0], src_crop[3] - src_crop[1], -1])
    # crop_info rows: [original size, cropped size, scale (identity), offset].
    crop_info = tf.stack([
        tf.cast(original_dims, tf.float32),
        tf.cast(tf.shape(cropped_image)[:2], dtype=tf.float32),
        tf.ones_like(original_dims, dtype=tf.float32),
        tf.cast(src_crop[:2], tf.float32)
    ])
    infos.append(crop_info)

    if crop_only:
      if not letter_box:
        h_, w_ = cast(get_image_shape(cropped_image), width.dtype)
        width = tf.cast(tf.round((w_ * width) / swidth), tf.int32)
        height = tf.cast(tf.round((h_ * height) / sheight), tf.int32)
        cropped_image = tf.image.resize(
            cropped_image, [height, width], method=method)
        cropped_image = tf.cast(cropped_image, original_dtype)
      return cropped_image, infos, cast([
          original_width, original_height, width, height, ptop, pleft, pbottom,
          pright
      ], tf.int32)

    # Pad the image to desired size with the fixed gray PAD_VALUE.
    image_ = tf.pad(
        cropped_image, [[pad[0], pad[2]], [pad[1], pad[3]], [0, 0]],
        constant_values=PAD_VALUE)

    # Pad and scale info.
    isize = tf.cast(tf.shape(image_)[:2], dtype=tf.float32)
    osize = tf.cast((desired_size[0], desired_size[1]), dtype=tf.float32)
    pad_info = tf.stack([
        tf.cast(tf.shape(cropped_image)[:2], tf.float32),
        osize,
        osize/isize, (-tf.cast(pad[:2], tf.float32)*osize/isize)
    ])
    infos.append(pad_info)

    # If the padded image is larger than the output in either dimension,
    # downsample that dimension first with AREA (better for shrinking).
    temp = tf.shape(image_)[:2]
    cond = temp > tf.cast(desired_size, temp.dtype)
    if tf.reduce_any(cond):
      size = tf.cast(desired_size, temp.dtype)
      size = tf.where(cond, size, temp)
      image_ = tf.image.resize(
          image_, (size[0], size[1]), method=tf.image.ResizeMethod.AREA)
      image_ = tf.cast(image_, original_dtype)

    image_ = tf.image.resize(
        image_, (desired_size[0], desired_size[1]),
        method=tf.image.ResizeMethod.BILINEAR,
        antialias=False)
    image_ = tf.cast(image_, original_dtype)

    if cut is not None:
      # Mosaic mode: slice out this image's quadrant around the cut center.
      image_, crop_info = mosaic_cut(image_, original_width, original_height,
                                     width, height, cut, ptop, pleft, pbottom,
                                     pright, shiftx, shifty)
      infos.append(crop_info)
    return image_, infos, cast([
        original_width, original_height, width, height, ptop, pleft, pbottom,
        pright
    ], tf.float32)


def _build_transform(image,
                     perspective=0.00,
                     degrees=0.0,
                     scale_min=1.0,
                     scale_max=1.0,
                     translate=0.0,
                     random_pad=False,
                     desired_size=None,
                     seed=None):
  """Builds a unified affine transformation to spatially augment the image.

  Returns a 3x3 matrix for warping the image, a matching (inverse-component)
  3x3 matrix for warping the boxes, and the sampled scale factor.
  """

  height, width = get_image_shape(image)
  ch = height = tf.cast(height, tf.float32)
  cw = width = tf.cast(width, tf.float32)
  deg_to_rad = lambda x: tf.cast(x, tf.float32) * np.pi / 180.0

  if desired_size is not None:
    desired_size = tf.cast(desired_size, tf.float32)
    ch = desired_size[0]
    cw = desired_size[1]

  # Compute the center of the image in the output resolution.
  center = tf.eye(3, dtype=tf.float32)
  center = tf.tensor_scatter_nd_update(center, [[0, 2], [1, 2]],
                                       [-cw / 2, -ch / 2])
  center_boxes = tf.tensor_scatter_nd_update(center, [[0, 2], [1, 2]],
                                             [cw / 2, ch / 2])

  # Compute a random rotation to apply. The *_boxes matrix carries the
  # opposite rotation so box coordinates can be mapped back.
  rotation = tf.eye(3, dtype=tf.float32)
  a = deg_to_rad(random_uniform_strong(-degrees, degrees, seed=seed))
  cos = tf.math.cos(a)
  sin = tf.math.sin(a)
  rotation = tf.tensor_scatter_nd_update(rotation,
                                         [[0, 0], [0, 1], [1, 0], [1, 1]],
                                         [cos, -sin, sin, cos])
  rotation_boxes = tf.tensor_scatter_nd_update(rotation,
                                               [[0, 0], [0, 1], [1, 0], [1, 1]],
                                               [cos, sin, -sin, cos])

  # Compute a random perspective change to apply.
  prespective_warp = tf.eye(3)
  px = random_uniform_strong(-perspective, perspective, seed=seed)
  py = random_uniform_strong(-perspective, perspective, seed=seed)
  prespective_warp = tf.tensor_scatter_nd_update(prespective_warp,
                                                 [[2, 0], [2, 1]], [px, py])
  prespective_warp_boxes = tf.tensor_scatter_nd_update(prespective_warp,
                                                       [[2, 0], [2, 1]],
                                                       [-px, -py])

  # Compute a random scaling to apply.
  scale = tf.eye(3, dtype=tf.float32)
  s = random_uniform_strong(scale_min, scale_max, seed=seed)
  scale = tf.tensor_scatter_nd_update(scale, [[0, 0], [1, 1]], [1 / s, 1 / s])
  scale_boxes = tf.tensor_scatter_nd_update(scale, [[0, 0], [1, 1]], [s, s])

  # Compute a random Translation to apply.
  translation = tf.eye(3)
  if (random_pad and height * s < ch and width * s < cw):
    # The image is contained within the output and arbitrarily translated to
    # locations within the output; translation here is already in pixels.
    center = center_boxes = tf.eye(3, dtype=tf.float32)
    tx = random_uniform_strong(-1, 0, seed=seed) * (cw / s - width)
    ty = random_uniform_strong(-1, 0, seed=seed) * (ch / s - height)
  else:
    # The image can be translated outside of the output resolution window
    # but the image is translated relative to the output resolution not the
    # input image resolution.
    tx = random_uniform_strong(0.5 - translate, 0.5 + translate, seed=seed)
    ty = random_uniform_strong(0.5 - translate, 0.5 + translate, seed=seed)

    # Center and Scale the image such that the window of translation is
    # contained to the output resolution.
    dx, dy = (width - cw / s) / width, (height - ch / s) / height
    sx, sy = 1 - dx, 1 - dy
    bx, by = dx / 2, dy / 2
    tx, ty = bx + (sx * tx), by + (sy * ty)

    # Scale the translation to width and height of the image.
    tx *= width
    ty *= height

  translation = tf.tensor_scatter_nd_update(translation, [[0, 2], [1, 2]],
                                            [tx, ty])
  translation_boxes = tf.tensor_scatter_nd_update(translation, [[0, 2], [1, 2]],
                                                  [-tx, -ty])

  # Use repeated matrix multiplications to combine all the image
  # transformations into a single unified augmentation operation. M is applied
  # to the image and Mb is applied to the boxes. The order of matrix
  # multiplication is important: first Translate, then Scale, then Rotate,
  # then Center, then finally alter the Perspective. The boxes matrix composes
  # the component inverses in reverse order.
  affine = (translation @ scale @ rotation @ center @ prespective_warp)
  affine_boxes = (
      prespective_warp_boxes @ center_boxes @ rotation_boxes @ scale_boxes
      @ translation_boxes)
  return affine, affine_boxes, s


def affine_warp_image(image,
                      desired_size,
                      perspective=0.00,
                      degrees=0.0,
                      scale_min=1.0,
                      scale_max=1.0,
                      translate=0.0,
                      random_pad=False,
                      seed=None):
  """Applies random spatial augmentation to the image.

  Args:
    image: A `Tensor` for the image.
    desired_size: A `tuple` for desired output image size.
    perspective: An `int` for the maximum that can be applied to random
      perspective change.
    degrees: An `int` for the maximum degrees that can be applied to random
      rotation.
    scale_min: An `int` for the minimum scaling factor that can be applied to
      random scaling.
    scale_max: An `int` for the maximum scaling factor that can be applied to
      random scaling.
    translate: An `int` for the maximum translation that can be applied to
      random translation.
    random_pad: A `bool` for using random padding.
    seed: An `Optional[int]` for the seed to use in random number generation.

  Returns:
    image: A `Tensor` representing the augmented image.
    affine_matrix: A `Tensor` representing the augmenting matrix for the
      image.
    affine_info: A `List` containing the size of the original image, the
      desired output_size of the image and the augmenting matrix for the
      boxes.
  """

  # Build an image transformation matrix.
  image_size = tf.cast(get_image_shape(image), tf.float32)
  affine_matrix, affine_boxes, _ = _build_transform(
      image,
      perspective=perspective,
      degrees=degrees,
      scale_min=scale_min,
      scale_max=scale_max,
      translate=translate,
      random_pad=random_pad,
      desired_size=desired_size,
      seed=seed)
  # tfa.image.transform takes the first 8 entries of the flattened 3x3 matrix
  # (the last entry is implicitly 1).
  affine = tf.reshape(affine_matrix, [-1])
  affine = tf.cast(affine[:-1], tf.float32)

  # Apply the transformation to image.
  image = tfa.image.transform(
      image,
      affine,
      fill_value=PAD_VALUE,
      output_shape=desired_size,
      interpolation='bilinear')

  desired_size = tf.cast(desired_size, tf.float32)
  affine_info = [image_size, desired_size, affine_boxes]
  return image, affine_matrix, affine_info


def affine_warp_boxes(affine, boxes, output_size, box_history):
  """Applies random rotation, random perspective change and random translation.

  and random scaling to the boxes.

  Args:
    affine: A `Tensor` for the augmenting matrix for the boxes.
    boxes: A `Tensor` for the boxes.
    output_size: A `list` of two integers, a two-element vector or a tensor
      such that all but the last dimensions are `broadcastable` to `boxes`.
      The last dimension is 2, which represents [height, width].
    box_history: A `Tensor` for the boxes history, which are the boxes that
      undergo the same augmentations as `boxes`, but no clipping was applied.
      We can keep track of how much changes are done to the boxes by keeping
      track of this tensor.

  Returns:
    clipped_boxes: A `Tensor` representing the augmented boxes.
    box_history: A `Tensor` representing the augmented box_history.
  """

  def _get_corners(box):
    """Get the corner of each box as a tuple of (x, y) coordinates."""
    ymi, xmi, yma, xma = tf.split(box, 4, axis=-1)
    tl = tf.concat([xmi, ymi], axis=-1)
    bl = tf.concat([xmi, yma], axis=-1)
    tr = tf.concat([xma, ymi], axis=-1)
    br = tf.concat([xma, yma], axis=-1)
    return tf.concat([tl, bl, tr, br], axis=-1)

  def _corners_to_boxes(corner):
    """Convert (x, y) corners back into boxes [ymin, xmin, ymax, xmax]."""
    corner = tf.reshape(corner, [-1, 4, 2])
    y = corner[..., 1]
    x = corner[..., 0]
    y_min = tf.reduce_min(y, axis=-1)
    x_min = tf.reduce_min(x, axis=-1)
    y_max = tf.reduce_max(y, axis=-1)
    x_max = tf.reduce_max(x, axis=-1)
    return tf.stack([y_min, x_min, y_max, x_max], axis=-1)

  def _aug_boxes(affine_matrix, box):
    """Apply an affine transformation matrix M to the boxes augment boxes."""
    corners = _get_corners(box)
    corners = tf.reshape(corners, [-1, 4, 2])
    # Homogeneous coordinate: append a column of ones to each corner.
    z = tf.expand_dims(tf.ones_like(corners[..., 1]), axis=-1)
    corners = tf.concat([corners, z], axis=-1)

    corners = tf.transpose(
        tf.matmul(affine_matrix, corners, transpose_b=True), perm=(0, 2, 1))

    # Perspective divide: normalize by the homogeneous component.
    corners, p = tf.split(corners, [2, 1], axis=-1)
    corners /= p
    corners = tf.reshape(corners, [-1, 8])
    # The warped quad is re-boxed as its axis-aligned bounding box.
    box = _corners_to_boxes(corners)
    return box

  boxes = _aug_boxes(affine, boxes)
  box_history = _aug_boxes(affine, box_history)
  clipped_boxes = bbox_ops.clip_boxes(boxes, output_size)
  return clipped_boxes, box_history


def boxes_candidates(clipped_boxes,
                     box_history,
                     wh_thr=2,
                     ar_thr=20,
                     area_thr=0.1):
  """Filters the boxes that don't satisfy the width/height and area constraints.

  Args:
    clipped_boxes: A `Tensor` for the boxes.
    box_history: A `Tensor` for the boxes history, which are the boxes that
      undergo the same augmentations as `boxes`, but no clipping was applied.
      We can keep track of how much changes are done to the boxes by keeping
      track of this tensor.
    wh_thr: An `int` for the width/height threshold.
    ar_thr: An `int` for the aspect ratio threshold.
    area_thr: A `float` for the area-ratio threshold.

  Returns:
    indices[:, 0]: A `Tensor` representing valid boxes after filtering.
  """
  # area_thr == 0.0 disables the width/height and aspect-ratio filters too.
  if area_thr == 0.0:
    wh_thr = 0
    ar_thr = np.inf

  area_thr = tf.math.abs(area_thr)

  # Get the scaled and shifted heights of the original
  # unclipped boxes.
  og_height = tf.maximum(box_history[:, 2] - box_history[:, 0], 0.0)
  og_width = tf.maximum(box_history[:, 3] - box_history[:, 1], 0.0)

  # Get the scaled and shifted heights of the clipped boxes.
  clipped_height = tf.maximum(clipped_boxes[:, 2] - clipped_boxes[:, 0], 0.0)
  clipped_width = tf.maximum(clipped_boxes[:, 3] - clipped_boxes[:, 1], 0.0)

  # Determine the aspect ratio of the clipped boxes (1e-16 avoids div-by-0).
  ar = tf.maximum(clipped_width / (clipped_height + 1e-16),
                  clipped_height / (clipped_width + 1e-16))

  # Ensure the clipped width and height are larger than a preset threshold.
  conda = clipped_width >= wh_thr
  condb = clipped_height >= wh_thr

  # Ensure the area of the clipped box relative to the unclipped original is
  # larger than the area threshold.
  area = (clipped_height * clipped_width) / (og_width * og_height + 1e-16)
  condc = area > area_thr

  # Ensure the aspect ratio is not too extreme.
  condd = ar < ar_thr

  cond = tf.expand_dims(
      tf.logical_and(
          tf.logical_and(conda, condb), tf.logical_and(condc, condd)),
      axis=-1)

  # Keep only the indices of boxes that pass every test.
  indices = tf.where(cond)
  return indices[:, 0]


def resize_and_crop_boxes(boxes, image_scale, output_size, offset, box_history):
  """Resizes and crops the boxes.

  Args:
    boxes: A `Tensor` for the boxes.
    image_scale: A `Tensor` for the scaling factor of the image.
    output_size: A `list` of two integers, a two-element vector or a tensor
      such that all but the last dimensions are `broadcastable` to `boxes`.
      The last dimension is 2, which represents [height, width].
    offset: A `Tensor` for how much translation was applied to the image.
    box_history: A `Tensor` for the boxes history, which are the boxes that
      undergo the same augmentations as `boxes`, but no clipping was applied.
      We can keep track of how much changes are done to the boxes by keeping
      track of this tensor.

  Returns:
    clipped_boxes: A `Tensor` representing the augmented boxes.
    box_history: A `Tensor` representing the augmented box_history.
  """
  # Shift and scale the input boxes (tile duplicates the [y, x] factors for
  # the [ymin, xmin, ymax, xmax] layout).
  boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
  boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])

  # Apply the same shift and scale to the box history (no clipping).
  box_history *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
  box_history -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])

  # Clip the shifted and scaled boxes.
  clipped_boxes = bbox_ops.clip_boxes(boxes, output_size)
  return clipped_boxes, box_history


def transform_and_clip_boxes(boxes,
                             infos,
                             affine=None,
                             shuffle_boxes=False,
                             area_thresh=0.1,
                             seed=None,
                             filter_and_clip_boxes=True):
  """Clips and cleans the boxes.

  Args:
    boxes: A `Tensor` for the boxes.
    infos: A `list` that contains the image infos.
    affine: A `list` that contains parameters for resize and crop.
    shuffle_boxes: A `bool` for shuffling the boxes.
    area_thresh: An `int` for the area threshold.
    seed: seed for random number generation.
    filter_and_clip_boxes: A `bool` for filtering and clipping the boxes to
      [0, 1].

  Returns:
    boxes: A `Tensor` representing the augmented boxes.
    ind: A `Tensor` valid box indices.
  """

  # Clip and clean boxes.
  def get_valid_boxes(boxes):
    """Get indices for non-empty boxes."""
    # A box is valid when it has strictly positive height and width.
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    base = tf.logical_and(tf.greater(height, 0), tf.greater(width, 0))
    return base

  # Initialize history to track operations applied to boxes.
  box_history = boxes

  # Make sure all boxes are valid to start, clip to [0, 1] and get only the
  # valid boxes.
  output_size = None
  if filter_and_clip_boxes:
    boxes = tf.math.maximum(tf.math.minimum(boxes, 1.0), 0.0)
  cond = get_valid_boxes(boxes)

  if infos is None:
    infos = []

  for info in infos:
    # Denormalize the boxes. info rows are
    # [original size, output size, scale, offset] (see resize_and_jitter_image).
    boxes = bbox_ops.denormalize_boxes(boxes, info[0])
    box_history = bbox_ops.denormalize_boxes(box_history, info[0])

    # Shift and scale all boxes, and keep track of box history with no
    # box clipping, history is used for removing boxes that have become
    # too small or exit the image area.
    (boxes, box_history) = resize_and_crop_boxes(
        boxes, info[2, :], info[1, :], info[3, :], box_history=box_history)

    # Get all the boxes that still remain in the image and store
    # in a bit vector for later use.
    cond = tf.logical_and(get_valid_boxes(boxes), cond)

    # Normalize the boxes to [0, 1].
    output_size = info[1]
    boxes = bbox_ops.normalize_boxes(boxes, output_size)
    box_history = bbox_ops.normalize_boxes(box_history, output_size)

  if affine is not None:
    # Denormalize the boxes. affine is
    # [image_size, desired_size, box matrix] (see affine_warp_image).
    boxes = bbox_ops.denormalize_boxes(boxes, affine[0])
    box_history = bbox_ops.denormalize_boxes(box_history, affine[0])

    # Clipped final boxes.
    (boxes, box_history) = affine_warp_boxes(
        affine[2], boxes, affine[1], box_history=box_history)

    # Get all the boxes that still remain in the image and store
    # in a bit vector for later use.
    cond = tf.logical_and(get_valid_boxes(boxes), cond)

    # Normalize the boxes to [0, 1].
    output_size = affine[1]
    boxes = bbox_ops.normalize_boxes(boxes, output_size)
    box_history = bbox_ops.normalize_boxes(box_history, output_size)

  # Remove the bad boxes (zero them out rather than dropping rows).
  boxes *= tf.cast(tf.expand_dims(cond, axis=-1), boxes.dtype)

  # Threshold the existing boxes.
  if filter_and_clip_boxes:
    if output_size is not None:
      boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
      box_history_ = bbox_ops.denormalize_boxes(box_history, output_size)
      inds = boxes_candidates(boxes_, box_history_, area_thr=area_thresh)
    else:
      # Normalized coordinates: disable the absolute width/height threshold.
      inds = boxes_candidates(
          boxes, box_history, wh_thr=0.0, area_thr=area_thresh)
    # Select and gather the good boxes.
    if shuffle_boxes:
      inds = tf.random.shuffle(inds, seed=seed)
  else:
    inds = bbox_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, inds)
  return boxes, inds