Commit 0016b0a7 authored by sunxx1's avatar sunxx1

Merge branch 'dtk22.04' into 'main'

Dtk22.04

See merge request dcutoolkit/deeplearing/dlexamples_new!49
parents 17bc28d5 7a382d5d
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import demo_utils
from keras_cv.layers import RandomlyZoomedCrop
def main():
many_elephants = demo_utils.load_elephant_tensor(output_size=(300, 300))
layer = RandomlyZoomedCrop(
target_size=(224, 224),
zoom_factor=(0.8, 1.2),
aspect_ratio_factor=(3.0 / 4.0, 4.0 / 3.0),
)
augmented = layer(many_elephants)
demo_utils.gallery_show(augmented.numpy())
layer = RandomlyZoomedCrop(
target_size=(224, 224),
zoom_factor=(0.08, 2.0),
aspect_ratio_factor=(3.0 / 4.0, 4.0 / 3.0),
)
augmented = layer(many_elephants)
demo_utils.gallery_show(augmented.numpy())
if __name__ == "__main__":
main()
"""
Title: Generate an image from a text prompt using StableDiffusion
Author: fchollet
Date created: 2022/09/24
Last modified: 2022/09/24
Description: Use StableDiffusion to generate an image according to a short text description.
"""
from PIL import Image
from keras_cv.models import StableDiffusion
model = StableDiffusion(img_height=512, img_width=512, jit_compile=True)
img = model.text_to_image("Photograph of a beautiful horse running through a field")
Image.fromarray(img[0]).save("horse.png")
print("Saved at horse.png")
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Title: Train an Object Detection Model on Pascal VOC 2007 using KerasCV
Author: [tanzhenyu](https://github.com/tanzhenyu)
Date created: 2022/09/27
Last modified: 2022/09/27
Description: Use KerasCV to train a RetinaNet on Pascal VOC 2007.
"""
import sys
import tensorflow as tf
import tensorflow_datasets as tfds
from absl import flags
import keras_cv
flags.DEFINE_string(
"weights_path",
"weights_{epoch:02d}.h5",
"Directory which will be used to store weight checkpoints.",
)
flags.DEFINE_string(
"tensorboard_path",
"logs",
"Directory which will be used to store tensorboard logs.",
)
FLAGS = flags.FLAGS
FLAGS(sys.argv)
# parameters from FasterRCNN [paper](https://arxiv.org/pdf/1506.01497.pdf)
# Try to detect an available TPU. If none is present, default to MirroredStrategy
try:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
# MirroredStrategy is best for a single machine with one or multiple GPUs
strategy = tf.distribute.MirroredStrategy()
print("Number of accelerators: ", strategy.num_replicas_in_sync)
local_batch = 4
global_batch = local_batch * strategy.num_replicas_in_sync
base_lr = 0.01 * global_batch / 16  # scale the base learning rate linearly with the global batch size
image_size = [640, 640, 3]
train_ds = tfds.load(
"voc/2007", split="train+test", with_info=False, shuffle_files=True
)
train_ds = train_ds.concatenate(
tfds.load("voc/2012", split="train+validation", with_info=False, shuffle_files=True)
)
eval_ds = tfds.load("voc/2007", split="test", with_info=False)
with strategy.scope():
model = keras_cv.models.FasterRCNN(classes=20, bounding_box_format="yxyx")
# TODO: migrate to KPL.
def resize_and_crop_image(
image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR,
):
with tf.name_scope("resize_and_crop_image"):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
random_jittering = aug_scale_min != 1.0 or aug_scale_max != 1.0
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed
)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(
scaled_size[0] / image_size[0], scaled_size[1] / image_size[1]
)
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset
)
offset = max_offset * tf.random.uniform(
[
2,
],
0,
1,
seed=seed,
)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method
)
if random_jittering:
scaled_image = scaled_image[
offset[0] : offset[0] + desired_size[0],
offset[1] : offset[1] + desired_size[1],
:,
]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1]
)
image_info = tf.stack(
[
image_size,
tf.constant(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32),
]
)
return output_image, image_info
def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
with tf.name_scope("resize_and_crop_boxes"):
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = clip_boxes(boxes, output_size)
return boxes
def clip_boxes(boxes, image_shape):
if boxes.shape[-1] != 4:
raise ValueError(
"boxes.shape[-1] is {:d}, but must be 4.".format(boxes.shape[-1])
)
with tf.name_scope("clip_boxes"):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height, width, height, width]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack([height, width, height, width], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def get_non_empty_box_indices(boxes):
# Selects indices of boxes whose height and width are both positive.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
return indices[:, 0]
def resize_fn(image, gt_boxes, gt_classes):
image, image_info = resize_and_crop_image(
image, image_size[:2], image_size[:2], 0.8, 1.25
)
gt_boxes = resize_and_crop_boxes(
gt_boxes, image_info[2, :], image_info[1, :], image_info[3, :]
)
indices = get_non_empty_box_indices(gt_boxes)
gt_boxes = tf.gather(gt_boxes, indices)
gt_classes = tf.gather(gt_classes, indices)
return image, gt_boxes, gt_classes
def flip_fn(image, boxes):
if tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32) > 0.5:
image = tf.image.flip_left_right(image)
y1, x1, y2, x2 = tf.split(boxes, num_or_size_splits=4, axis=-1)
boxes = tf.concat([y1, 1.0 - x2, y2, 1.0 - x1], axis=-1)
return image, boxes
def proc_train_fn(bounding_box_format, img_size):
def apply(inputs):
image = inputs["image"]
image = tf.cast(image, tf.float32)
image = tf.keras.applications.resnet50.preprocess_input(image)
gt_boxes = inputs["objects"]["bbox"]
image, gt_boxes = flip_fn(image, gt_boxes)
gt_boxes = keras_cv.bounding_box.convert_format(
gt_boxes,
images=image,
source="rel_yxyx",
target=bounding_box_format,
)
gt_classes = tf.cast(inputs["objects"]["label"], tf.float32)
image, gt_boxes, gt_classes = resize_fn(image, gt_boxes, gt_classes)
gt_classes = tf.expand_dims(gt_classes, axis=-1)
return {
"images": image,
"gt_boxes": gt_boxes,
"gt_classes": gt_classes,
}
return apply
def pad_fn(examples):
gt_boxes = examples.pop("gt_boxes")
gt_classes = examples.pop("gt_classes")
gt_boxes = gt_boxes.to_tensor(default_value=-1.0, shape=[global_batch, 32, 4])
gt_classes = gt_classes.to_tensor(default_value=-1.0, shape=[global_batch, 32, 1])
return examples["images"], {"gt_boxes": gt_boxes, "gt_classes": gt_classes}
train_ds = train_ds.map(
proc_train_fn(bounding_box_format="yxyx", img_size=image_size),
num_parallel_calls=tf.data.AUTOTUNE,
)
train_ds = train_ds.apply(
tf.data.experimental.dense_to_ragged_batch(global_batch, drop_remainder=True)
)
train_ds = train_ds.map(pad_fn, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.shuffle(8)
train_ds = train_ds.prefetch(2)
eval_ds = eval_ds.map(
proc_train_fn(bounding_box_format="yxyx", img_size=image_size),
num_parallel_calls=tf.data.AUTOTUNE,
)
eval_ds = eval_ds.apply(
tf.data.experimental.dense_to_ragged_batch(global_batch, drop_remainder=True)
)
eval_ds = eval_ds.map(pad_fn, num_parallel_calls=tf.data.AUTOTUNE)
eval_ds = eval_ds.prefetch(2)
with strategy.scope():
lr_decay = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
boundaries=[12000 * 16 / global_batch, 16000 * 16 / global_batch],
values=[base_lr, 0.1 * base_lr, 0.01 * base_lr],
)
optimizer = tf.keras.optimizers.SGD(
learning_rate=lr_decay, momentum=0.9, global_clipnorm=10.0
)
weight_decay = 0.0001
step = 0
callbacks = [
tf.keras.callbacks.ModelCheckpoint(FLAGS.weights_path, save_weights_only=True),
tf.keras.callbacks.TensorBoard(
log_dir=FLAGS.tensorboard_path, write_steps_per_second=True
),
]
model.compile(
optimizer=optimizer,
box_loss="Huber",
classification_loss="SparseCategoricalCrossentropy",
rpn_box_loss="Huber",
rpn_classification_loss="BinaryCrossentropy",
)
model.fit(train_ds, epochs=18, validation_data=eval_ds, callbacks=callbacks)
import resource
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import keras_cv
low, high = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (high, high))
BATCH_SIZE = 8
EPOCHS = 100
CHECKPOINT_PATH = "checkpoint/"
class_ids = [
"Aeroplane",
"Bicycle",
"Bird",
"Boat",
"Bottle",
"Bus",
"Car",
"Cat",
"Chair",
"Cow",
"Dining Table",
"Dog",
"Horse",
"Motorbike",
"Person",
"Potted Plant",
"Sheep",
"Sofa",
"Train",
"Tvmonitor",
"Total",
]
class_mapping = dict(zip(range(len(class_ids)), class_ids))
image_size = [640, 640, 3]
train_ds = tfds.load(
"voc/2007", split="train+test", with_info=False, shuffle_files=True
)
train_ds = train_ds.concatenate(
tfds.load("voc/2012", split="train+validation", with_info=False, shuffle_files=True)
)
eval_ds = tfds.load("voc/2007", split="test", with_info=False)
# TODO: migrate to KPL.
def resize_and_crop_image(
image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR,
):
with tf.name_scope("resize_and_crop_image"):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
random_jittering = aug_scale_min != 1.0 or aug_scale_max != 1.0
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed
)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(
scaled_size[0] / image_size[0], scaled_size[1] / image_size[1]
)
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset
)
offset = max_offset * tf.random.uniform(
[
2,
],
0,
1,
seed=seed,
)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method
)
if random_jittering:
scaled_image = scaled_image[
offset[0] : offset[0] + desired_size[0],
offset[1] : offset[1] + desired_size[1],
:,
]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1]
)
image_info = tf.stack(
[
image_size,
tf.constant(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32),
]
)
return output_image, image_info
def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
with tf.name_scope("resize_and_crop_boxes"):
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = clip_boxes(boxes, output_size)
return boxes
def clip_boxes(boxes, image_shape):
if boxes.shape[-1] != 4:
raise ValueError(
"boxes.shape[-1] is {:d}, but must be 4.".format(boxes.shape[-1])
)
with tf.name_scope("clip_boxes"):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height, width, height, width]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack([height, width, height, width], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def get_non_empty_box_indices(boxes):
# Selects indices of boxes whose height and width are both positive.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
return indices[:, 0]
def resize_fn(image, gt_boxes, gt_classes):
image, image_info = resize_and_crop_image(
image, image_size[:2], image_size[:2], 0.8, 1.25
)
gt_boxes = resize_and_crop_boxes(
gt_boxes, image_info[2, :], image_info[1, :], image_info[3, :]
)
indices = get_non_empty_box_indices(gt_boxes)
gt_boxes = tf.gather(gt_boxes, indices)
gt_classes = tf.gather(gt_classes, indices)
return image, gt_boxes, gt_classes
def flip_fn(image, boxes):
if tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32) > 0.5:
image = tf.image.flip_left_right(image)
y1, x1, y2, x2 = tf.split(boxes, num_or_size_splits=4, axis=-1)
boxes = tf.concat([y1, 1.0 - x2, y2, 1.0 - x1], axis=-1)
return image, boxes
def proc_train_fn(bounding_box_format, img_size):
def apply(inputs):
image = inputs["image"]
image = tf.cast(image, tf.float32)
gt_boxes = inputs["objects"]["bbox"]
image, gt_boxes = flip_fn(image, gt_boxes)
gt_boxes = keras_cv.bounding_box.convert_format(
gt_boxes,
images=image,
source="rel_yxyx",
target="yxyx",
)
gt_classes = tf.cast(inputs["objects"]["label"], tf.float32)
image, gt_boxes, gt_classes = resize_fn(image, gt_boxes, gt_classes)
gt_classes = tf.expand_dims(gt_classes, axis=-1)
bounding_boxes = tf.concat([gt_boxes, gt_classes], axis=-1)
bounding_boxes = keras_cv.bounding_box.convert_format(
bounding_boxes, images=image, source="yxyx", target=bounding_box_format
)
return {"images": image, "bounding_boxes": bounding_boxes}
return apply
train_ds = train_ds.map(
proc_train_fn("xywh", image_size), num_parallel_calls=tf.data.AUTOTUNE
)
train_ds = train_ds.apply(
tf.data.experimental.dense_to_ragged_batch(BATCH_SIZE, drop_remainder=True)
)
eval_ds = eval_ds.map(
proc_train_fn(bounding_box_format="xywh", img_size=image_size),
num_parallel_calls=tf.data.AUTOTUNE,
)
eval_ds = eval_ds.apply(
tf.data.experimental.dense_to_ragged_batch(BATCH_SIZE, drop_remainder=True)
)
"""
Looks like everything is structured as expected. Now we can move on to constructing our
data augmentation pipeline.
"""
train_ds = train_ds.prefetch(2)
train_ds = train_ds.shuffle(BATCH_SIZE)
eval_ds = eval_ds.prefetch(2)
def unpackage_dict(inputs):
return inputs["images"], inputs["bounding_boxes"]
train_ds = train_ds.map(unpackage_dict, num_parallel_calls=tf.data.AUTOTUNE)
eval_ds = eval_ds.map(unpackage_dict, num_parallel_calls=tf.data.AUTOTUNE)
"""
Our data pipeline is now complete. We can now move on to model creation and training.
"""
"""
## Model creation
We'll use the KerasCV API to construct a RetinaNet model. In this tutorial we use
a ResNet50 backbone with pretrained ImageNet weights. To fine-tune, we freeze the
backbone before training. When `include_rescaling=True` is set, inputs to the model
are expected to be in the range `[0, 255]`.
"""
model = keras_cv.models.RetinaNet(
# number of classes to be used in box classification
classes=len(class_ids),
# For more info on supported bounding box formats, visit
# https://keras.io/api/keras_cv/bounding_box/
bounding_box_format="xywh",
# KerasCV offers a set of pre-configured backbones
backbone="resnet50",
# Each backbone comes with multiple pre-trained weights
# These weights match the weights available in the `keras_cv.model` class.
backbone_weights="imagenet",
# include_rescaling tells the model whether your input images are in the default
# pixel range (0, 255) or if you have already rescaled your inputs to the range
# (0, 1). In our case, we feed our model images with inputs in the range (0, 255).
include_rescaling=True,
# Typically, you'll want to set this to False when training a real model.
# evaluate_train_time_metrics=True makes `train_step()` incompatible with TPU,
# and also causes a massive performance hit. It can, however, be useful to produce
# train-time metrics when debugging your model training pipeline.
evaluate_train_time_metrics=False,
)
# Fine-tuning a RetinaNet is as simple as setting backbone.trainable = False
model.backbone.trainable = False
metrics = [
keras_cv.metrics.COCOMeanAveragePrecision(
class_ids=range(21),
bounding_box_format="xywh",
name="Mean Average Precision",
),
keras_cv.metrics.COCORecall(
class_ids=range(21),
bounding_box_format="xywh",
max_detections=100,
name="Recall",
),
]
optimizer = tf.optimizers.SGD(global_clipnorm=10.0)
model.compile(
classification_loss="focal",
box_loss="smoothl1",
optimizer=optimizer,
metrics=metrics,
)
callbacks = [
keras.callbacks.TensorBoard(log_dir="logs"),
keras.callbacks.ReduceLROnPlateau(patience=5),
keras.callbacks.EarlyStopping(patience=10),
keras.callbacks.ModelCheckpoint(CHECKPOINT_PATH, save_weights_only=True),
]
history = model.fit(
train_ds,
validation_data=eval_ds.take(10),
epochs=100,
callbacks=callbacks,
)
"""
## Evaluation
"""
coco_suite = [
keras_cv.metrics.COCOMeanAveragePrecision(
bounding_box_format="xywh",
class_ids=range(len(class_ids)),
max_detections=100,
name="MaP Standard",
),
keras_cv.metrics.COCOMeanAveragePrecision(
bounding_box_format="xywh",
class_ids=range(len(class_ids)),
iou_thresholds=[0.75],
max_detections=100,
name="MaP@IoU=0.75",
),
keras_cv.metrics.COCOMeanAveragePrecision(
bounding_box_format="xywh",
class_ids=range(len(class_ids)),
iou_thresholds=[0.5],
max_detections=100,
name="MaP@IoU=0.5",
),
keras_cv.metrics.COCOMeanAveragePrecision(
bounding_box_format="xywh",
class_ids=range(len(class_ids)),
area_range=(0, 32**2),
max_detections=100,
name="MaP Small",
),
keras_cv.metrics.COCOMeanAveragePrecision(
bounding_box_format="xywh",
class_ids=range(len(class_ids)),
area_range=(32**2, 96**2),
max_detections=100,
name="MaP Medium",
),
keras_cv.metrics.COCOMeanAveragePrecision(
bounding_box_format="xywh",
class_ids=range(len(class_ids)),
area_range=(96**2, 1e5**2),
max_detections=100,
name="MaP Large",
),
keras_cv.metrics.COCORecall(
class_ids=range(len(class_ids)),
bounding_box_format="xywh",
max_detections=100,
name="Recall Standard",
),
keras_cv.metrics.COCORecall(
class_ids=range(len(class_ids)),
bounding_box_format="xywh",
max_detections=1,
name="Recall MaxDets=1",
),
keras_cv.metrics.COCORecall(
class_ids=range(len(class_ids)),
bounding_box_format="xywh",
max_detections=10,
name="Recall MaxDets=10",
),
keras_cv.metrics.COCORecall(
class_ids=range(len(class_ids)),
bounding_box_format="xywh",
max_detections=100,
area_range=(0, 32**2),
name="Recall Small",
),
keras_cv.metrics.COCORecall(
class_ids=range(len(class_ids)),
bounding_box_format="xywh",
max_detections=100,
area_range=(32**2, 96**2),
name="Recall Medium",
),
keras_cv.metrics.COCORecall(
class_ids=range(len(class_ids)),
bounding_box_format="xywh",
max_detections=100,
area_range=(96**2, 1e5**2),
name="Recall Large",
),
]
model.compile(
classification_loss=keras_cv.losses.FocalLoss(from_logits=True, reduction="none"),
box_loss=keras_cv.losses.SmoothL1Loss(l1_cutoff=1.0, reduction="none"),
optimizer=optimizer,
metrics=coco_suite,
)
model.load_weights(CHECKPOINT_PATH)
def proc_eval_fn(bounding_box_format, target_size):
def apply(inputs):
raw_image = inputs["image"]
raw_image = tf.cast(raw_image, tf.float32)
img_size = tf.shape(raw_image)
height = img_size[0]
width = img_size[1]
target_height = tf.cond(
height > width,
lambda: 640.0,
lambda: tf.cast(height / width * 640.0, tf.float32),
)
target_width = tf.cond(
width > height,
lambda: 640.0,
lambda: tf.cast(width / height * 640.0, tf.float32),
)
image = tf.image.resize(
raw_image, (target_height, target_width), antialias=False
)
gt_boxes = keras_cv.bounding_box.convert_format(
inputs["objects"]["bbox"],
images=image,
source="rel_yxyx",
target="xyxy",
)
image = tf.image.pad_to_bounding_box(
image, 0, 0, target_size[0], target_size[1]
)
gt_boxes = keras_cv.bounding_box.convert_format(
gt_boxes,
images=image,
source="xyxy",
target=bounding_box_format,
)
gt_classes = tf.cast(inputs["objects"]["label"], tf.float32)
gt_classes = tf.expand_dims(gt_classes, axis=-1)
bounding_boxes = tf.concat([gt_boxes, gt_classes], axis=-1)
return image, bounding_boxes
return apply
eval_ds = tfds.load("voc/2007", split="test", with_info=False, shuffle_files=True)
eval_ds = eval_ds.map(
proc_eval_fn("xywh", [640, 640, 3]), num_parallel_calls=tf.data.AUTOTUNE
)
eval_ds = eval_ds.apply(
tf.data.experimental.dense_to_ragged_batch(BATCH_SIZE, drop_remainder=True)
)
eval_ds = eval_ds.prefetch(tf.data.AUTOTUNE)
keras_cv_metrics = model.evaluate(eval_ds, return_dict=True)
print("Metrics:", keras_cv_metrics)
1 Introduction
This test case is a Keras-based image classification benchmark that uses the ImageNet dataset in TFRecord format.
2 Environment requirements
DTK22.04.2
TensorFlow-2.9.0
3 Installation
Run python setup.py build
To verify the installation, start a Python interpreter and run import tensorflow as tf followed by import keras_cv
4 Training
Run the train_keras_cv.sh script, or take the commands out of the script and run them individually.
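Optional smoke test (a sketch, not part of the original instructions): from a Python shell run
    import tensorflow as tf
    import keras_cv
    print(tf.__version__)        # expected 2.9.0 per the requirements above
    print(keras_cv.__version__)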
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Title: Training a KerasCV model for Imagenet Classification
Author: [ianjjohnson](https://github.com/ianjjohnson)
Date created: 2022/07/25
Last modified: 2022/07/25
Description: Use KerasCV to train an image classifier using modern best practices
"""
import math
import sys
import tensorflow as tf
from absl import flags
from tensorflow import keras
from tensorflow.keras import callbacks
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
import keras_cv
from keras_cv import models
from keras_cv.datasets import imagenet
"""
## Overview
KerasCV makes training state-of-the-art classification models easy by providing implementations of modern models, preprocessing techniques, and layers.
In this tutorial, we walk through training a model against the Imagenet dataset using Keras and KerasCV.
This tutorial requires you to have KerasCV installed:
```shell
pip install keras-cv
```
"""
"""
## Setup, constants and flags
"""
flags.DEFINE_string(
"model_name", None, "The name of the model in KerasCV.models to use."
)
flags.DEFINE_string("imagenet_path", None, "Directory from which to load Imagenet.")
flags.DEFINE_string(
"backup_path", None, "Directory which will be used for training backups."
)
flags.DEFINE_string(
"weights_path", None, "Directory which will be used to store weight checkpoints."
)
flags.DEFINE_string(
"tensorboard_path", None, "Directory which will be used to store tensorboard logs."
)
flags.DEFINE_integer(
"batch_size",
128,
"Batch size for training and evaluation. This will be multiplied by the number of accelerators in use.",
)
flags.DEFINE_boolean(
"use_xla", True, "Whether or not to use XLA (jit_compile) for training."
)
flags.DEFINE_boolean(
"use_mixed_precision",
False,
"Whether or not to use FP16 mixed precision for training.",
)
flags.DEFINE_float(
"initial_learning_rate",
0.05,
"Initial learning rate which will reduce on plateau. This will be multiplied by the number of accelerators in use",
)
flags.DEFINE_string(
"model_kwargs",
"{}",
"Keyword argument dictionary to pass to the constructor of the model being trained",
)
flags.DEFINE_string(
"learning_rate_schedule",
"ReduceOnPlateau",
"String denoting the type of learning rate schedule to be used",
)
flags.DEFINE_float(
"warmup_steps_percentage",
0.1,
"For how many steps expressed in percentage (0..1 float) of total steps should the schedule warm up if we're using the warmup schedule",
)
flags.DEFINE_float(
"warmup_hold_steps_percentage",
0.1,
"For how many steps expressed in percentage (0..1 float) of total steps should the schedule hold the initial learning rate after warmup is finished, and before applying cosine decay.",
)
# An upper bound for number of epochs (this script uses EarlyStopping).
flags.DEFINE_integer("epochs", 1000, "Epochs to train for")
FLAGS = flags.FLAGS
FLAGS(sys.argv)
CLASSES = 1000
IMAGE_SIZE = (224, 224)
REDUCE_ON_PLATEAU = "ReduceOnPlateau"
COSINE_DECAY_WITH_WARMUP = "CosineDecayWithWarmup"
if FLAGS.model_name not in models.__dict__:
raise ValueError(f"Invalid model name: {FLAGS.model_name}")
if FLAGS.use_mixed_precision:
keras.mixed_precision.set_global_policy("mixed_float16")
"""
We start by detecting the type of accelerators we have available and picking an
appropriate distribution strategy accordingly. We scale our learning rate and
batch size based on the number of accelerators being used.
"""
# Try to detect an available TPU. If none is present, default to MirroredStrategy
try:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
strategy = tf.distribute.TPUStrategy(tpu)
if FLAGS.use_mixed_precision:
keras.mixed_precision.set_global_policy("mixed_bfloat16")
except ValueError:
# MirroredStrategy is best for a single machine with one or multiple GPUs
strategy = tf.distribute.MirroredStrategy()
print("Number of accelerators: ", strategy.num_replicas_in_sync)
BATCH_SIZE = FLAGS.batch_size * strategy.num_replicas_in_sync
INITIAL_LEARNING_RATE = FLAGS.initial_learning_rate * strategy.num_replicas_in_sync
"""TFRecord-based tf.data.Dataset loads lazily so we can't get the length of the dataset. Temporary."""
NUM_IMAGES = 1281167
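# Why the constant above is hard-coded: the TFRecord-backed dataset created below
# reports an unknown cardinality, so its length cannot be queried directly.
# Illustrative sketch (not executed here):
#   tf.data.experimental.cardinality(train_ds) == tf.data.experimental.UNKNOWN_CARDINALITY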
"""
## Data loading
This guide uses the
[Imagenet dataset](https://www.tensorflow.org/datasets/catalog/imagenet2012).
Note that this requires manual download and preprocessing. You can find more
information about preparing this dataset at keras_cv/datasets/imagenet/README.md
"""
train_ds = imagenet.load(
split="train",
tfrecord_path=FLAGS.imagenet_path,
shuffle_buffer=BATCH_SIZE * 2,
)
test_ds = imagenet.load(
split="validation",
tfrecord_path=FLAGS.imagenet_path,
batch_size=BATCH_SIZE,
img_size=IMAGE_SIZE,
)
"""
Next, we augment our dataset.
We define a set of augmentation layers and then apply them to our input dataset.
"""
random_crop_and_resize = keras_cv.layers.RandomCropAndResize(
target_size=IMAGE_SIZE,
crop_area_factor=(0.8, 1),
aspect_ratio_factor=(3 / 4, 4 / 3),
)
@tf.function
def crop_and_resize(img, label):
inputs = {"images": img, "labels": label}
inputs = random_crop_and_resize(inputs)
return inputs["images"], inputs["labels"]
AUGMENT_LAYERS = [
keras_cv.layers.RandomFlip(mode="horizontal"),
keras_cv.layers.RandAugment(value_range=(0, 255), magnitude=0.3),
keras_cv.layers.CutMix(),
]
@tf.function
def augment(img, label):
inputs = {"images": img, "labels": label}
for layer in AUGMENT_LAYERS:
inputs = layer(inputs)
return inputs["images"], inputs["labels"]
train_ds = (
train_ds.map(crop_and_resize, num_parallel_calls=tf.data.AUTOTUNE)
.batch(BATCH_SIZE)
.map(augment, num_parallel_calls=tf.data.AUTOTUNE)
.prefetch(tf.data.AUTOTUNE)
)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)
"""
Now we can begin training our model. We begin by loading a model from KerasCV.
"""
with strategy.scope():
model = models.__dict__[FLAGS.model_name]
model = model(
include_rescaling=True,
include_top=True,
classes=CLASSES,
input_shape=IMAGE_SIZE + (3,),
**eval(FLAGS.model_kwargs),
)
"""
Optional LR schedule with cosine decay instead of ReduceLROnPlateau
TODO: Replace with Core Keras LRWarmup when it's released. This is a temporary solution.
Convenience method for calculating the learning rate at a given step, used by the WarmUpCosineDecay class.
"""
def lr_warmup_cosine_decay(
global_step, warmup_steps, hold=0, total_steps=0, start_lr=0.0, target_lr=1e-2
):
# Cosine decay
learning_rate = (
0.5
* target_lr
* (
1
+ tf.cos(
tf.constant(math.pi)
* tf.cast(global_step - warmup_steps - hold, tf.float32)
/ float(total_steps - warmup_steps - hold)
)
)
)
target_lr = tf.cast(target_lr, tf.float32)
warmup_lr = tf.cast(target_lr * (global_step / warmup_steps), tf.float32)
if hold > 0:
learning_rate = tf.where(
global_step > warmup_steps + hold, learning_rate, target_lr
)
learning_rate = tf.where(global_step < warmup_steps, warmup_lr, learning_rate)
return learning_rate
"""
LearningRateSchedule implementing the learning rate warmup with cosine decay strategy.
Learning rate warmup should help with initial training instability,
while the decay strategy may be variable, cosine being a popular choice.
The schedule will start from 0.0 (or supplied start_lr) and gradually "warm up" linearly to the target_lr.
From there, it will apply a cosine decay to the learning rate, after an optional holding period.
args:
- [float] start_lr: default 0.0, the starting learning rate at the beginning of training from which the warmup starts
- [float] target_lr: default 1e-2, the target (initial) learning rate from which you'd usually start without a LR warmup schedule
- [int] warmup_steps: number of training steps to warm up for expressed in batches
- [int] total_steps: the total steps (epochs * number of batches per epoch) in the dataset
- [int] hold: optional argument to hold the target_lr before applying cosine decay on it
"""
class WarmUpCosineDecay(keras.optimizers.schedules.LearningRateSchedule):
def __init__(self, warmup_steps, total_steps, hold, start_lr=0.0, target_lr=1e-2):
super().__init__()
self.start_lr = start_lr
self.target_lr = target_lr
self.warmup_steps = warmup_steps
self.total_steps = total_steps
self.hold = hold
def __call__(self, step):
lr = lr_warmup_cosine_decay(
global_step=step,
total_steps=self.total_steps,
warmup_steps=self.warmup_steps,
start_lr=self.start_lr,
target_lr=self.target_lr,
hold=self.hold,
)
return tf.where(step > self.total_steps, 0.0, lr, name="learning_rate")
total_steps = (NUM_IMAGES // BATCH_SIZE) * FLAGS.epochs
warmup_steps = int(FLAGS.warmup_steps_percentage * total_steps)
hold_steps = int(FLAGS.warmup_hold_steps_percentage * total_steps)
schedule = WarmUpCosineDecay(
start_lr=0.0,
target_lr=INITIAL_LEARNING_RATE,
warmup_steps=warmup_steps,
total_steps=total_steps,
hold=hold_steps,
)
"""
Next, we pick an optimizer. Here we use SGD.
Note that learning rate will decrease over time due to the ReduceLROnPlateau callback or with the LRWarmup scheduler.
"""
if FLAGS.learning_rate_schedule == COSINE_DECAY_WITH_WARMUP:
optimizer = optimizers.SGD(learning_rate=schedule, momentum=0.9)
else:
optimizer = optimizers.SGD(
learning_rate=INITIAL_LEARNING_RATE, momentum=0.9, global_clipnorm=10
)
"""
Next, we pick a loss function. We use CategoricalCrossentropy with label smoothing.
"""
loss_fn = losses.CategoricalCrossentropy(label_smoothing=0.1)
"""
Next, we specify the metrics that we want to track. For this example, we track accuracy.
"""
with strategy.scope():
training_metrics = [metrics.CategoricalAccuracy()]
"""
As a last piece of configuration, we configure callbacks for the method.
We use EarlyStopping, BackupAndRestore, and a model checkpointing callback.
"""
model_callbacks = [
callbacks.EarlyStopping(patience=20),
callbacks.BackupAndRestore(FLAGS.backup_path),
callbacks.ModelCheckpoint(FLAGS.weights_path, save_weights_only=True),
callbacks.TensorBoard(log_dir=FLAGS.tensorboard_path, write_steps_per_second=True),
]
if FLAGS.learning_rate_schedule == REDUCE_ON_PLATEAU:
model_callbacks.append(
callbacks.ReduceLROnPlateau(
monitor="val_loss", factor=0.1, patience=10, min_delta=0.001, min_lr=0.0001
)
)
"""
We can now compile the model and fit it to the training dataset.
"""
model.compile(
optimizer=optimizer,
loss=loss_fn,
metrics=training_metrics,
jit_compile=FLAGS.use_xla,
)
model.fit(
train_ds,
batch_size=BATCH_SIZE,
epochs=FLAGS.epochs,
callbacks=model_callbacks,
validation_data=test_ds,
)
{
"densenet121": {
"v0": {
"accelerators": 2,
"args": {
"batch_size": "64"
},
"contributor": "ianstenbit",
"epochs_trained": 84,
"script": {
"name": "basic_training.py",
"version": "90d4c3548a2e989fe52d6cf7ae7439af794f0ae6"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/K5Q0gAk0RayXwP0WsLPpMA/",
"validation_accuracy": "0.6771"
}
},
"densenet169": {
"v0": {
"accelerators": 2,
"args": {
"batch_size": "64"
},
"contributor": "ianstenbit",
"epochs_trained": 50,
"script": {
"name": "basic_training.py",
"version": "90d4c3548a2e989fe52d6cf7ae7439af794f0ae6"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/aQIvxQEgTqajldKxp688Nw/",
"validation_accuracy": "0.6613"
}
},
"densenet201": {
"v0": {
"accelerators": 8,
"args": {
"batch_size": "512"
},
"contributor": "ianstenbit",
"epochs_trained": 166,
"script": {
"name": "basic_training.py",
"version": "b0b349612e00ab34c25af5467ddd3b48d6fbf7a3"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/6iLPGz5RSEiyPymgzJbKIQ/",
"validation_accuracy": "0.7469"
}
},
"efficientnetv2b0": {
"v0": {
"accelerators": 8,
"args": {
"batch_size": "64",
"initial_learning_rate": ".0125"
},
"contributor": "ianstenbit",
"epochs_trained": 320,
"script": {
"name": "basic_training.py",
"version": "e349ca5563b05548996f438fa03b2f34a8231ca3"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/kBs9YZkwQAeVNfv8JPKCLw/",
"validation_accuracy": "0.7527"
}
},
"efficientnetv2b1": {
"v0": {
"accelerators": 8,
"args": {
"batch_size": "64",
"initial_learning_rate": ".0125"
},
"contributor": "ianstenbit",
"epochs_trained": 288,
"script": {
"name": "basic_training.py",
"version": "e349ca5563b05548996f438fa03b2f34a8231ca3"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/jQAQBh6LQUep18CDayP8ww/",
"validation_accuracy": "0.7560"
}
},
"efficientnetv2b2": {
"v0": {
"accelerators": 8,
"args": {
"batch_size": "64",
"initial_learning_rate": ".0125"
},
"contributor": "ianstenbit",
"epochs_trained": 313,
"script": {
"name": "basic_training.py",
"version": "02b41ea91b972cdd29c27dbc4d79e6a0b4e90de2"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/iyhN2qvIRrqj6C0Q328drg/",
"validation_accuracy": "0.7699"
}
},
"resnet50v2": {
"v0": {
"accelerators": 2,
"args": {
"batch_size": "64",
"initial_learning_rate": "0.005"
},
"contributor": "ianstenbit",
"epochs_trained": 132,
"script": {
"name": "basic_training.py",
"version": "3288c3ab31ce1c35fe7505e245fdfa9c593af78e"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/QlkKjMkqQxm3jbzOlzBvWA/",
"validation_accuracy": "0.6337"
},
"v1": {
"accelerators": 2,
"args": {
"batch_size": "128"
},
"contributor": "ianstenbit",
"epochs_trained": 168,
"script": {
"name": "basic_training.py",
"version": "8fcffd9ee81ca9892f73d8ec3ac0ba475d2f1426"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/TQ5r1EhXS4SDDagBD84rgA/",
"validation_accuracy": "0.7550"
},
"v2": {
"accelerators": 8,
"args": {
"batch_size": "64",
"initial_learning_rate": ".0125"
},
"contributor": "ianstenbit",
"epochs_trained": 150,
"script": {
"name": "basic_training.py",
"version": "02b41ea91b972cdd29c27dbc4d79e6a0b4e90de2"
},
"tensorboard_logs": "https://tensorboard.dev/experiment/ReyWQHwETwah0nqlXl8BOA/",
"validation_accuracy": "0.7612"
}
},
"script_authors": {
"basic_training.py": [
"ianstenbit",
"DavidLandup0"
]
}
}
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3
nohup python basic_training.py --model_name=RegNetX064 \
--imagenet_path=./imagenet \
--backup_path=./RegNetX064_tfmodel/ \
--weights_path=./RegNetX064_tfmodel/model \
--tensorboard_path=./RegNetX064_tfmodel/tensorboard \
--use_xla=False \
--initial_learning_rate=0.05 \
--learning_rate_schedule=ReduceOnPlateau \
--batch_size=64 \
> logfile_RegNetX064_bs_64_k4 2>&1&
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
config_setting(
name = "windows",
constraint_values = ["@bazel_tools//platforms:windows"],
)
py_library(
name = "keras_cv",
srcs = glob(["**/*.py"]),
data = [
"//keras_cv/custom_ops:_keras_cv_custom_ops.so",
]
)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# isort:off
from keras_cv import version_check
version_check.check_tf_version()
# isort:on
from keras_cv import datasets
from keras_cv import layers
from keras_cv import losses
from keras_cv import metrics
from keras_cv import models
from keras_cv import training
from keras_cv import utils
from keras_cv.core import ConstantFactorSampler
from keras_cv.core import FactorSampler
from keras_cv.core import NormalFactorSampler
from keras_cv.core import UniformFactorSampler
__version__ = "0.3.4"
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Benchmarks for training KerasCV models against the MNIST dataset."""
import time
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
from keras_cv import models
# isort: off
from tensorflow.python.platform.benchmark import (
ParameterizedBenchmark,
)
class ClassificationTrainingBenchmark(
tf.test.Benchmark, metaclass=ParameterizedBenchmark
):
"""Benchmarks for classification models using `tf.test.Benchmark`."""
_benchmark_parameters = [
("ResNet50V2", models.ResNet50V2),
("DenseNet121", models.DenseNet121),
]
def __init__(self):
super().__init__()
self.num_classes = 10
self.batch_size = 64
self.dataset = (
tfds.load("mnist", split="test")
.map(
lambda x: (
tf.image.resize(x["image"], (56, 56)),
tf.one_hot(x["label"], self.num_classes),
),
num_parallel_calls=tf.data.AUTOTUNE,
)
.batch(self.batch_size)
)
self.epochs = 1
def benchmark_classification_training_single_gpu(self, app):
self._run_benchmark(app, tf.distribute.OneDeviceStrategy("/gpu:0"))
def benchmark_classification_training_multi_gpu(self, app):
self._run_benchmark(app, tf.distribute.MirroredStrategy())
def _run_benchmark(self, app, strategy):
with strategy.scope():
t0 = time.time()
model = app(
include_top=True,
classes=self.num_classes,
input_shape=(56, 56, 1),
include_rescaling=True,
)
model.compile(
optimizer=tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9),
loss="categorical_crossentropy",
metrics=["accuracy"],
)
compile_time = time.time() - t0
train_start_time = time.time()
training_results = model.fit(
self.dataset,
batch_size=self.batch_size,
epochs=self.epochs,
)
train_end_time = time.time()
training_time = train_end_time - train_start_time
total_time = train_end_time - t0
metrics = []
metrics.append({"name": "compile_time", "value": compile_time})
metrics.append({"name": "avg_epoch_time", "value": training_time / self.epochs})
metrics.append({"name": "epochs", "value": self.epochs})
metrics.append(
{"name": "accuracy", "value": training_results.history["accuracy"][0]}
)
self.report_benchmark(wall_time=total_time, metrics=metrics)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from keras_cv.bounding_box.converters import _decode_deltas_to_boxes
from keras_cv.bounding_box.converters import _encode_box_to_deltas
from keras_cv.bounding_box.converters import convert_format
from keras_cv.bounding_box.formats import CENTER_XYWH
from keras_cv.bounding_box.formats import REL_XYXY
from keras_cv.bounding_box.formats import REL_YXYX
from keras_cv.bounding_box.formats import XYWH
from keras_cv.bounding_box.formats import XYXY
from keras_cv.bounding_box.formats import YXYX
from keras_cv.bounding_box.iou import compute_iou
from keras_cv.bounding_box.pad_batch_to_shape import pad_batch_to_shape
from keras_cv.bounding_box.utils import add_class_id
from keras_cv.bounding_box.utils import clip_to_image
from keras_cv.bounding_box.utils import filter_sentinels
from keras_cv.bounding_box.utils import pad_with_sentinels
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converter functions for working with bounding box formats."""
from typing import List
from typing import Optional
import tensorflow as tf
# Internal exception used to signal that `images` was not passed to a converter
# that needs it.
class RequiresImagesException(Exception):
pass
def _encode_box_to_deltas(
anchors: tf.Tensor,
boxes: tf.Tensor,
anchor_format: str,
box_format: str,
variance: Optional[List[float]] = None,
):
"""Converts bounding_boxes from `center_yxhw` to delta format."""
if variance and len(variance) != 4:
raise ValueError(f"`variance` must be length 4, got {variance}")
encoded_anchors = convert_format(
anchors,
source=anchor_format,
target="center_yxhw",
)
boxes = convert_format(
boxes,
source=box_format,
target="center_yxhw",
)
anchor_dimensions = tf.maximum(encoded_anchors[..., 2:], tf.keras.backend.epsilon())
box_dimensions = tf.maximum(boxes[..., 2:], tf.keras.backend.epsilon())
# anchors must be unbatched; boxes can be either batched or unbatched.
boxes_delta = tf.concat(
[
(boxes[..., :2] - encoded_anchors[..., :2]) / anchor_dimensions,
tf.math.log(box_dimensions / anchor_dimensions),
],
axis=-1,
)
if variance:
boxes_delta /= variance
return boxes_delta
def _decode_deltas_to_boxes(
anchors: tf.Tensor,
boxes_delta: tf.Tensor,
anchor_format: str,
box_format: str,
variance: Optional[List[float]] = None,
):
"""Converts bounding_boxes from delta format to `center_yxhw`."""
if variance and len(variance) != 4:
raise ValueError(f"`variance` must be length 4, got {variance}")
tf.nest.assert_same_structure(anchors, boxes_delta)
def decode_single_level(anchor, box_delta):
encoded_anchor = convert_format(
anchor,
source=anchor_format,
target="center_yxhw",
)
if variance:
box_delta = box_delta * variance
# anchors must be unbatched; boxes can be either batched or unbatched.
box = tf.concat(
[
box_delta[..., :2] * encoded_anchor[..., 2:] + encoded_anchor[..., :2],
tf.math.exp(box_delta[..., 2:]) * encoded_anchor[..., 2:],
],
axis=-1,
)
box = convert_format(box, source="center_yxhw", target=box_format)
return box
if isinstance(anchors, dict) and isinstance(boxes_delta, dict):
boxes = {}
for lvl, anchor in anchors.items():
boxes[lvl] = decode_single_level(anchor, boxes_delta[lvl])
return boxes
else:
return decode_single_level(anchors, boxes_delta)
def _center_yxhw_to_xyxy(boxes, images=None, image_shape=None):
y, x, height, width, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
return tf.concat(
[x - width / 2.0, y - height / 2.0, x + width / 2.0, y + height / 2.0, rest],
axis=-1,
)
def _center_xywh_to_xyxy(boxes, images=None, image_shape=None):
x, y, width, height, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
return tf.concat(
[x - width / 2.0, y - height / 2.0, x + width / 2.0, y + height / 2.0, rest],
axis=-1,
)
def _xywh_to_xyxy(boxes, images=None, image_shape=None):
x, y, width, height, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
return tf.concat([x, y, x + width, y + height, rest], axis=-1)
def _xyxy_to_center_yxhw(boxes, images=None, image_shape=None):
left, top, right, bottom, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
return tf.concat(
[(top + bottom) / 2.0, (left + right) / 2.0, bottom - top, right - left, rest],
axis=-1,
)
def _rel_xywh_to_xyxy(boxes, images=None, image_shape=None):
image_height, image_width = _image_shape(images, image_shape, boxes)
x, y, width, height, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
return tf.concat(
[
image_width * x,
image_height * y,
image_width * (x + width),
image_height * (y + height),
rest,
],
axis=-1,
)
def _xyxy_no_op(boxes, images=None, image_shape=None):
return boxes
def _xyxy_to_xywh(boxes, images=None, image_shape=None):
left, top, right, bottom, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
return tf.concat(
[left, top, right - left, bottom - top, rest],
axis=-1,
)
def _xyxy_to_rel_xywh(boxes, images=None, image_shape=None):
image_height, image_width = _image_shape(images, image_shape, boxes)
left, top, right, bottom, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
left, right = (
left / image_width,
right / image_width,
)
top, bottom = top / image_height, bottom / image_height
return tf.concat(
[left, top, right - left, bottom - top, rest],
axis=-1,
)
def _xyxy_to_center_xywh(boxes, images=None, image_shape=None):
left, top, right, bottom, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
return tf.concat(
[(left + right) / 2.0, (top + bottom) / 2.0, right - left, bottom - top, rest],
axis=-1,
)
def _rel_xyxy_to_xyxy(boxes, images=None, image_shape=None):
image_height, image_width = _image_shape(images, image_shape, boxes)
left, top, right, bottom, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
left, right = left * image_width, right * image_width
top, bottom = top * image_height, bottom * image_height
return tf.concat(
[left, top, right, bottom, rest],
axis=-1,
)
def _xyxy_to_rel_xyxy(boxes, images=None, image_shape=None):
image_height, image_width = _image_shape(images, image_shape, boxes)
left, top, right, bottom, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
left, right = left / image_width, right / image_width
top, bottom = top / image_height, bottom / image_height
return tf.concat(
[left, top, right, bottom, rest],
axis=-1,
)
def _yxyx_to_xyxy(boxes, images=None, image_shape=None):
y1, x1, y2, x2, rest = tf.split(boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1)
return tf.concat([x1, y1, x2, y2, rest], axis=-1)
def _rel_yxyx_to_xyxy(boxes, images=None, image_shape=None):
image_height, image_width = _image_shape(images, image_shape, boxes)
top, left, bottom, right, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
left, right = left * image_width, right * image_width
top, bottom = top * image_height, bottom * image_height
return tf.concat(
[left, top, right, bottom, rest],
axis=-1,
)
def _xyxy_to_yxyx(boxes, images=None, image_shape=None):
x1, y1, x2, y2, rest = tf.split(boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1)
return tf.concat([y1, x1, y2, x2, rest], axis=-1)
def _xyxy_to_rel_yxyx(boxes, images=None, image_shape=None):
image_height, image_width = _image_shape(images, image_shape, boxes)
left, top, right, bottom, rest = tf.split(
boxes, [1, 1, 1, 1, boxes.shape[-1] - 4], axis=-1
)
left, right = left / image_width, right / image_width
top, bottom = top / image_height, bottom / image_height
return tf.concat(
[top, left, bottom, right, rest],
axis=-1,
)
TO_XYXY_CONVERTERS = {
"xywh": _xywh_to_xyxy,
"center_xywh": _center_xywh_to_xyxy,
"center_yxhw": _center_yxhw_to_xyxy,
"rel_xywh": _rel_xywh_to_xyxy,
"xyxy": _xyxy_no_op,
"rel_xyxy": _rel_xyxy_to_xyxy,
"yxyx": _yxyx_to_xyxy,
"rel_yxyx": _rel_yxyx_to_xyxy,
}
FROM_XYXY_CONVERTERS = {
"xywh": _xyxy_to_xywh,
"center_xywh": _xyxy_to_center_xywh,
"center_yxhw": _xyxy_to_center_yxhw,
"rel_xywh": _xyxy_to_rel_xywh,
"xyxy": _xyxy_no_op,
"rel_xyxy": _xyxy_to_rel_xyxy,
"yxyx": _xyxy_to_yxyx,
"rel_yxyx": _xyxy_to_rel_yxyx,
}
def convert_format(
boxes, source, target, images=None, image_shape=None, dtype="float32"
):
f"""Converts bounding_boxes from one format to another.
Supported formats are:
- `"xyxy"`, also known as `corners` format. In this format the first four axes
represent [left, top, right, bottom] in that order.
- `"rel_xyxy"`. In this format, the axes are the same as `"xyxy"` but the x
coordinates are normalized using the image width, and the y axes the image
height. All values in `rel_xyxy` are in the range (0, 1).
- `"xywh"`. In this format the first four axes represent
[left, top, width, height].
- `"rel_xywh". In this format the first four axes represent
[left, top, width, height], just like `"xywh"`. Unlike `"xywh"`, the values
are in the range (0, 1) instead of absolute pixel values.
- `"center_xyWH"`. In this format the first two coordinates represent the x and y
coordinates of the center of the bounding box, while the last two represent
the width and height of the bounding box.
- `"center_yxHW"`. In this format the first two coordinates represent the y and x
coordinates of the center of the bounding box, while the last two represent
the height and width of the bounding box.
- `"yxyx"`. In this format the first four axes represent [top, left, bottom, right]
in that order.
- `"rel_yxyx"`. In this format, the axes are the same as `"yxyx"` but the x
coordinates are normalized using the image width, and the y axes the image
height. All values in `rel_yxyx` are in the range (0, 1).
Formats are case insensitive. It is recommended that you capitalize width and
height to maximize the visual difference between `"xyWH"` and `"xyxy"`.
Relative formats, abbreviated `rel`, make use of the shapes of the `images` passed.
In these formats, the coordinates, widths, and heights are all specified as
percentages of the host image. `images` may be a ragged Tensor. Note that using a
ragged Tensor for images may cause a substantial performance loss, as each image
will need to be processed separately due to the mismatching image shapes.
Usage:
```python
boxes = load_coco_dataset()
boxes_in_xywh = keras_cv.bounding_box.convert_format(
boxes,
source='xyxy',
target='xyWH'
)
```
Args:
boxes: tf.Tensor representing bounding boxes in the format specified in the
`source` parameter. `boxes` can optionally have extra dimensions stacked on
the final axis to store metadata. boxes should be a 3D Tensor, with the
shape `[batch_size, num_boxes, *]`.
source: One of {" ".join([f'"{f}"' for f in TO_XYXY_CONVERTERS.keys()])}. Used
to specify the original format of the `boxes` parameter.
target: One of {" ".join([f'"{f}"' for f in TO_XYXY_CONVERTERS.keys()])}. Used
to specify the destination format of the `boxes` parameter.
images: (Optional) a batch of images aligned with `boxes` on the first axis.
Should be at least 3 dimensions, with the first 3 dimensions representing:
`[batch_size, height, width]`. Used in some converters to compute relative
pixel values of the bounding box dimensions. Required when transforming
from a rel format to a non-rel format.
dtype: the data type to use when transforming the boxes. Defaults to
`tf.float32`.
"""
if images is not None and image_shape is not None:
raise ValueError(
"convert_format() expects either `images` or `image_shape`, "
f"but not both. Received images={images} image_shape={image_shape}"
)
_validate_image_shape(image_shape)
source = source.lower()
target = target.lower()
if source not in TO_XYXY_CONVERTERS:
raise ValueError(
f"`convert_format()` received an unsupported format for the argument "
f"`source`. `source` should be one of {TO_XYXY_CONVERTERS.keys()}. "
f"Got source={source}"
)
if target not in FROM_XYXY_CONVERTERS:
raise ValueError(
f"`convert_format()` received an unsupported format for the argument "
f"`target`. `target` should be one of {FROM_XYXY_CONVERTERS.keys()}. "
f"Got target={target}"
)
boxes = tf.cast(boxes, dtype)
if source == target:
return boxes
# rel->rel conversions should not require images
if source.startswith("rel") and target.startswith("rel"):
source = source.replace("rel_", "", 1)
target = target.replace("rel_", "", 1)
boxes, images, squeeze = _format_inputs(boxes, images)
to_xyxy_fn = TO_XYXY_CONVERTERS[source]
from_xyxy_fn = FROM_XYXY_CONVERTERS[target]
try:
in_xyxy = to_xyxy_fn(boxes, images=images, image_shape=image_shape)
result = from_xyxy_fn(in_xyxy, images=images, image_shape=image_shape)
except RequiresImagesException:
raise ValueError(
"convert_format() must receive `images` or `image_shape` when transforming "
f"between relative and absolute formats."
f"convert_format() received source=`{format}`, target=`{format}, "
f"but images={images} and image_shape={image_shape}."
)
return _format_outputs(result, squeeze)
def _format_inputs(boxes, images):
boxes_rank = len(boxes.shape)
if boxes_rank > 3:
raise ValueError(
"Expected len(boxes.shape)=2, or len(boxes.shape)=3, got "
f"len(boxes.shape)={boxes_rank}"
)
boxes_includes_batch = boxes_rank == 3
# Determine if images needs an expand_dims() call
if images is not None:
images_rank = len(images.shape)
if images_rank > 4:
raise ValueError(
"Expected len(images.shape)=2, or len(images.shape)=3, got "
f"len(images.shape)={images_rank}"
)
images_include_batch = images_rank == 4
if boxes_includes_batch != images_include_batch:
raise ValueError(
"convert_format() expects both boxes and images to be batched, or both "
f"boxes and images to be unbatched. Received len(boxes.shape)={boxes_rank}, "
f"len(images.shape)={images_rank}. Expected either len(boxes.shape)=2 AND "
"len(images.shape)=3, or len(boxes.shape)=3 AND len(images.shape)=4."
)
if not images_include_batch:
images = tf.expand_dims(images, axis=0)
if not boxes_includes_batch:
return tf.expand_dims(boxes, axis=0), images, True
return boxes, images, False
def _validate_image_shape(image_shape):
# Escape early if image_shape is None and skip validation.
if image_shape is None:
return
# tuple/list
if isinstance(image_shape, (tuple, list)):
if len(image_shape) != 3:
raise ValueError(
"image_shape should be of length 3, but got "
f"image_shape={image_shape}"
)
return
# tensor
if isinstance(image_shape, tf.Tensor):
if len(image_shape.shape) > 1:
raise ValueError(
"image_shape.shape should be (3), but got "
f"image_shape.shape={image_shape.shape}"
)
if image_shape.shape[0] != 3:
raise ValueError(
"image_shape.shape should be (3), but got "
f"image_shape.shape={image_shape.shape}"
)
return
# Warn about failure cases
raise ValueError(
"Expected image_shape to be either a tuple, list, Tensor. "
f"Received image_shape={image_shape}"
)
def _format_outputs(boxes, squeeze):
if squeeze:
return tf.squeeze(boxes, axis=0)
return boxes
def _image_shape(images, image_shape, boxes):
if images is None and image_shape is None:
raise RequiresImagesException()
if image_shape is None:
if not isinstance(images, tf.RaggedTensor):
image_shape = tf.shape(images)
height, width = image_shape[1], image_shape[2]
else:
height = tf.reshape(images.row_lengths(), (-1, 1))
width = tf.reshape(tf.reduce_max(images.row_lengths(axis=2), 1), (-1, 1))
else:
height, width = image_shape[0], image_shape[1]
return tf.cast(height, boxes.dtype), tf.cast(width, boxes.dtype)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
import tensorflow as tf
from absl.testing import parameterized
from keras_cv import bounding_box
xyxy_box = tf.constant([[[10, 20, 110, 120], [20, 30, 120, 130]]], dtype=tf.float32)
yxyx_box = tf.constant([[[20, 10, 120, 110], [30, 20, 130, 120]]], dtype=tf.float32)
rel_xyxy_box = tf.constant(
[[[0.01, 0.02, 0.11, 0.12], [0.02, 0.03, 0.12, 0.13]]], dtype=tf.float32
)
rel_xyxy_box_ragged_images = tf.constant(
[[[0.10, 0.20, 1.1, 1.20], [0.40, 0.6, 2.40, 2.6]]], dtype=tf.float32
)
rel_yxyx_box = tf.constant(
[[[0.02, 0.01, 0.12, 0.11], [0.03, 0.02, 0.13, 0.12]]], dtype=tf.float32
)
rel_yxyx_box_ragged_images = tf.constant(
[[[0.2, 0.1, 1.2, 1.1], [0.6, 0.4, 2.6, 2.4]]], dtype=tf.float32
)
center_xywh_box = tf.constant(
[[[60, 70, 100, 100], [70, 80, 100, 100]]], dtype=tf.float32
)
xywh_box = tf.constant([[[10, 20, 100, 100], [20, 30, 100, 100]]], dtype=tf.float32)
rel_xywh_box = tf.constant(
[[[0.01, 0.02, 0.1, 0.1], [0.02, 0.03, 0.1, 0.1]]], dtype=tf.float32
)
rel_xywh_box_ragged_images = tf.constant(
[[[0.1, 0.2, 1, 1], [0.4, 0.6, 2, 2]]], dtype=tf.float32
)
ragged_images = tf.ragged.constant(
[np.ones(shape=[100, 100, 3]), np.ones(shape=[50, 50, 3])], # 2 images
ragged_rank=2,
)
images = tf.ones([2, 1000, 1000, 3])
boxes = {
"xyxy": xyxy_box,
"center_xywh": center_xywh_box,
"rel_xywh": rel_xywh_box,
"xywh": xywh_box,
"rel_xyxy": rel_xyxy_box,
"yxyx": yxyx_box,
"rel_yxyx": rel_yxyx_box,
}
boxes_ragged_images = {
"xyxy": xyxy_box,
"center_xywh": center_xywh_box,
"rel_xywh": rel_xywh_box_ragged_images,
"xywh": xywh_box,
"rel_xyxy": rel_xyxy_box_ragged_images,
"yxyx": yxyx_box,
"rel_yxyx": rel_yxyx_box_ragged_images,
}
test_cases = [
(f"{source}_{target}", source, target)
for (source, target) in itertools.permutations(boxes.keys(), 2)
] + [("xyxy_xyxy", "xyxy", "xyxy")]
test_image_ragged = [
(f"{source}_{target}", source, target)
for (source, target) in itertools.permutations(boxes_ragged_images.keys(), 2)
] + [("xyxy_xyxy", "xyxy", "xyxy")]
class ConvertersTestCase(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(*test_cases)
def test_converters(self, source, target):
source_box = boxes[source]
target_box = boxes[target]
self.assertAllClose(
bounding_box.convert_format(
source_box, source=source, target=target, images=images
),
target_box,
)
@parameterized.named_parameters(*test_image_ragged)
def test_converters_ragged_images(self, source, target):
source_box = boxes_ragged_images[source]
target_box = boxes_ragged_images[target]
self.assertAllClose(
bounding_box.convert_format(
source_box, source=source, target=target, images=ragged_images
),
target_box,
)
@parameterized.named_parameters(*test_cases)
def test_converters_unbatched(self, source, target):
source_box = boxes[source][0]
target_box = boxes[target][0]
self.assertAllClose(
bounding_box.convert_format(
source_box, source=source, target=target, images=images[0]
),
target_box,
)
def test_raises_with_different_image_rank(self):
source_box = boxes["xyxy"][0]
with self.assertRaises(ValueError):
bounding_box.convert_format(
source_box, source="xyxy", target="xywh", images=images
)
def test_without_images(self):
source_box = boxes["xyxy"]
target_box = boxes["xywh"]
self.assertAllClose(
bounding_box.convert_format(source_box, source="xyxy", target="xywh"),
target_box,
)
def test_rel_to_rel_without_images(self):
source_box = boxes["rel_xyxy"]
target_box = boxes["rel_yxyx"]
self.assertAllClose(
bounding_box.convert_format(
source_box, source="rel_xyxy", target="rel_yxyx"
),
target_box,
)
@parameterized.named_parameters(*test_cases)
def test_ragged_bounding_box(self, source, target):
source_box = _raggify(boxes[source])
target_box = _raggify(boxes[target])
self.assertAllClose(
bounding_box.convert_format(
source_box, source=source, target=target, images=images
),
target_box,
)
@parameterized.named_parameters(*test_image_ragged)
def test_ragged_bounding_box_ragged_images(self, source, target):
source_box = _raggify(boxes_ragged_images[source])
target_box = _raggify(boxes_ragged_images[target])
self.assertAllClose(
bounding_box.convert_format(
source_box, source=source, target=target, images=ragged_images
),
target_box,
)
@parameterized.named_parameters(*test_cases)
def test_ragged_bounding_box_with_image_shape(self, source, target):
source_box = _raggify(boxes[source])
target_box = _raggify(boxes[target])
self.assertAllClose(
bounding_box.convert_format(
source_box, source=source, target=target, image_shape=(1000, 1000, 3)
),
target_box,
)
def _raggify(tensor):
return tf.RaggedTensor.from_row_lengths(tensor[0], [2])
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
formats.py contains axis information for each supported format.
"""
class XYXY:
"""XYXY contains axis indices for the XYXY format.
All values in the XYXY format should be absolute pixel values.
The XYXY format consists of the following required indices:
- LEFT: left hand side of the bounding box
- TOP: top of the bounding box
- RIGHT: right of the bounding box
- BOTTOM: bottom of the bounding box
and the following optional indices, used in some KerasCV components:
- CLASS: class of the object contained in the bounding box
- CONFIDENCE: confidence that the box is valid, used in predictions
"""
LEFT = 0
TOP = 1
RIGHT = 2
BOTTOM = 3
CLASS = 4
CONFIDENCE = 5
class REL_XYXY:
"""REL_XYXY contains axis indices for the REL_XYXY format.
REL_XYXY is like XYXY, but each value is relative to the width and height of the
origin image. Values are percentages of the origin images' width and height
respectively.
The REL_XYXY format consists of the following required indices:
- LEFT: left hand side of the bounding box
- TOP: top of the bounding box
- RIGHT: right of the bounding box
- BOTTOM: bottom of the bounding box
and the following optional indices, used in some KerasCV components:
- CLASS: class of the object contained in the bounding box
- CONFIDENCE: confidence that the box is valid, used in predictions
"""
LEFT = 0
TOP = 1
RIGHT = 2
BOTTOM = 3
CLASS = 4
CONFIDENCE = 5
class CENTER_XYWH:
"""CENTER_XYWH contains axis indices for the CENTER_XYWH format.
All values in the CENTER_XYWH format should be absolute pixel values.
The CENTER_XYWH format consists of the following required indices:
- X: X coordinate of the center of the bounding box
- Y: Y coordinate of the center of the bounding box
- WIDTH: width of the bounding box
- HEIGHT: height of the bounding box
and the following optional indices, used in some KerasCV components:
- CLASS: class of the object contained in the bounding box
- CONFIDENCE: confidence that the box is valid, used in predictions
"""
X = 0
Y = 1
WIDTH = 2
HEIGHT = 3
CLASS = 4
CONFIDENCE = 5
class XYWH:
"""XYWH contains axis indices for the XYWH format.
All values in the XYWH format should be absolute pixel values.
The XYWH format consists of the following required indices:
- X: X coordinate of the left of the bounding box
- Y: Y coordinate of the top of the bounding box
- WIDTH: width of the bounding box
- HEIGHT: height of the bounding box
and the following optional indices, used in some KerasCV components:
- CLASS: class of the object contained in the bounding box
- CONFIDENCE: confidence that the box is valid, used in predictions
"""
X = 0
Y = 1
WIDTH = 2
HEIGHT = 3
CLASS = 4
CONFIDENCE = 5
class REL_XYWH:
"""REL_XYWH contains axis indices for the XYWH format.
REL_XYXY is like XYWH, but each value is relative to the width and height of the
origin image. Values are percentages of the origin images' width and height
respectively.
- X: X coordinate of the left of the bounding box
- Y: Y coordinate of the top of the bounding box
- WIDTH: width of the bounding box
- HEIGHT: height of the bounding box
and the following optional indices, used in some KerasCV components:
- CLASS: class of the object contained in the bounding box
- CONFIDENCE: confidence that the box is valid, used in predictions
"""
X = 0
Y = 1
WIDTH = 2
HEIGHT = 3
CLASS = 4
CONFIDENCE = 5
class YXYX:
"""YXYX contains axis indices for the YXYX format.
All values in the YXYX format should be absolute pixel values.
The YXYX format consists of the following required indices:
- TOP: top of the bounding box
- LEFT: left hand side of the bounding box
- BOTTOM: bottom of the bounding box
- RIGHT: right of the bounding box
and the following optional indices, used in some KerasCV components:
- CLASS: class of the object contained in the bounding box
- CONFIDENCE: confidence that the box is valid, used in predictions
"""
TOP = 0
LEFT = 1
BOTTOM = 2
RIGHT = 3
CLASS = 4
CONFIDENCE = 5
class REL_YXYX:
"""REL_YXYX contains axis indices for the REL_YXYX format.
REL_YXYX is like YXYX, but each value is relative to the width and height of the
origin image. Values are percentages of the origin images' width and height
respectively.
The REL_YXYX format consists of the following required indices:
- TOP: top of the bounding box
- LEFT: left hand side of the bounding box
- BOTTOM: bottom of the bounding box
- RIGHT: right of the bounding box
and the following optional indices, used in some KerasCV components:
- CLASS: class of the object contained in the bounding box
- CONFIDENCE: confidence that the box is valid, used in predictions
"""
TOP = 0
LEFT = 1
BOTTOM = 2
RIGHT = 3
CLASS = 4
CONFIDENCE = 5
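# Illustrative usage sketch (not part of the library source): the axis
# constants above can replace hard-coded indices when slicing box tensors.
#
#   import tensorflow as tf
#
#   boxes = tf.constant([[10.0, 20.0, 110.0, 120.0, 1.0]])   # xyxy + class id
#   widths = boxes[..., XYXY.RIGHT] - boxes[..., XYXY.LEFT]  # -> [100.0]
#   class_ids = boxes[..., XYXY.CLASS]                       # -> [1.0]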
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains functions to compute ious of bounding boxes."""
import tensorflow as tf
from keras_cv import bounding_box
def _compute_area(box):
"""Computes area for bounding boxes
Args:
box: [N, 4] or [batch_size, N, 4] float Tensor, either batched
or unbatched boxes.
Returns:
a float Tensor of [N] or [batch_size, N]
"""
y_min, x_min, y_max, x_max = tf.split(box[..., :4], num_or_size_splits=4, axis=-1)
return tf.squeeze((y_max - y_min) * (x_max - x_min), axis=-1)
def _compute_intersection(boxes1, boxes2):
"""Computes intersection area between two sets of boxes.
Args:
boxes1: [N, 4] or [batch_size, N, 4] float Tensor boxes.
boxes2: [M, 4] or [batch_size, M, 4] float Tensor boxes.
Returns:
a [N, M] or [batch_size, N, M] float Tensor.
"""
y_min1, x_min1, y_max1, x_max1 = tf.split(
boxes1[..., :4], num_or_size_splits=4, axis=-1
)
y_min2, x_min2, y_max2, x_max2 = tf.split(
boxes2[..., :4], num_or_size_splits=4, axis=-1
)
boxes2_rank = len(boxes2.shape)
perm = [1, 0] if boxes2_rank == 2 else [0, 2, 1]
# [N, M] or [batch_size, N, M]
intersect_ymax = tf.minimum(y_max1, tf.transpose(y_max2, perm))
intersect_ymin = tf.maximum(y_min1, tf.transpose(y_min2, perm))
intersect_xmax = tf.minimum(x_max1, tf.transpose(x_max2, perm))
intersect_xmin = tf.maximum(x_min1, tf.transpose(x_min2, perm))
intersect_height = intersect_ymax - intersect_ymin
intersect_width = intersect_xmax - intersect_xmin
zeros_t = tf.cast(0, intersect_height.dtype)
intersect_height = tf.maximum(zeros_t, intersect_height)
intersect_width = tf.maximum(zeros_t, intersect_width)
return intersect_height * intersect_width
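# Worked sketch of the broadcasting above (assumed values, for illustration):
# the per-coordinate splits of boxes1 have shape [N, 1], the transposed splits
# of boxes2 have shape [1, M], so the element-wise min/max produce an [N, M]
# grid of pairwise intersection areas.
#
#   boxes1 = tf.constant([[0.0, 0.0, 10.0, 10.0]])            # N=1, yxyx
#   boxes2 = tf.constant([[5.0, 5.0, 15.0, 15.0],
#                         [20.0, 20.0, 30.0, 30.0]])          # M=2, yxyx
#   _compute_intersection(boxes1, boxes2)                     # -> [[25., 0.]]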
def compute_iou(
boxes1,
boxes2,
bounding_box_format,
use_masking=False,
mask_val=-1,
):
"""Computes a lookup table vector containing the ious for a given set boxes.
The lookup vector is to be indexed by [`boxes1_index`,`boxes2_index`] if boxes
are unbatched and by [`batch`, `boxes1_index`,`boxes2_index`] if the boxes are
batched.
Users can pass `boxes1` and `boxes2` with different ranks. For example:
1) `boxes1`: [batch_size, M, 4], `boxes2`: [batch_size, N, 4] -> return [batch_size, M, N].
2) `boxes1`: [batch_size, M, 4], `boxes2`: [N, 4] -> return [batch_size, M, N]
3) `boxes1`: [M, 4], `boxes2`: [batch_size, N, 4] -> return [batch_size, M, N]
4) `boxes1`: [M, 4], `boxes2`: [N, 4] -> return [M, N]
Args:
boxes1: a list of bounding boxes in 'corners' format. Can be batched or unbatched.
boxes2: a list of bounding boxes in 'corners' format. Can be batched or unbatched.
bounding_box_format: a case-insensitive string which is one of `"xyxy"`,
`"rel_xyxy"`, `"xyWH"`, `"center_xyWH"`, `"yxyx"`, `"rel_yxyx"`.
For detailed information on the supported format, see the
[KerasCV bounding box documentation](https://keras.io/api/keras_cv/bounding_box/formats/).
use_masking: whether masking will be applied. This will mask all `boxes1` or `boxes2` that
have values less than 0 in all 4 of their coordinates. Defaults to `False`.
mask_val: int value used to mask the returned IoUs when masking is enabled. Defaults to -1.
Returns:
iou_lookup_table: a vector containing the pairwise ious of boxes1 and
boxes2.
"""
boxes1_rank = len(boxes1.shape)
boxes2_rank = len(boxes2.shape)
if boxes1_rank not in [2, 3]:
raise ValueError(
"compute_iou() expects boxes1 to be batched or unbatched. "
f"Received len(boxes1.shape)={boxes1_rank}, "
f"len(boxes2.shape)={boxes2_rank}. Expected either "
"len(boxes1.shape)=2 or len(boxes1.shape)=3."
)
if boxes2_rank not in [2, 3]:
raise ValueError(
"compute_iou() expects boxes2 to be batched or unbatched. "
f"Received len(boxes1.shape)={boxes1_rank}, "
f"len(boxes2.shape)={boxes2_rank}. Expected either "
"len(boxes2.shape)=2 or len(boxes2.shape)=3."
)
if bounding_box_format.startswith("rel"):
target = "rel_yxyx"
else:
target = "yxyx"
boxes1 = bounding_box.convert_format(
boxes1, source=bounding_box_format, target=target
)
boxes2 = bounding_box.convert_format(
boxes2, source=bounding_box_format, target=target
)
intersect_area = _compute_intersection(boxes1, boxes2)
boxes1_area = _compute_area(boxes1)
boxes2_area = _compute_area(boxes2)
boxes2_area_rank = len(boxes2_area.shape)
boxes2_axis = 1 if (boxes2_area_rank == 2) else 0
boxes1_area = tf.expand_dims(boxes1_area, axis=-1)
boxes2_area = tf.expand_dims(boxes2_area, axis=boxes2_axis)
union_area = boxes1_area + boxes2_area - intersect_area
res = tf.math.divide_no_nan(intersect_area, union_area)
if boxes1_rank == 2:
perm = [1, 0]
else:
perm = [0, 2, 1]
if not use_masking:
return res
mask_val_t = tf.cast(mask_val, res.dtype) * tf.ones_like(res)
boxes1_mask = tf.less(tf.reduce_max(boxes1, axis=-1, keepdims=True), 0.0)
boxes2_mask = tf.less(tf.reduce_max(boxes2, axis=-1, keepdims=True), 0.0)
background_mask = tf.logical_or(boxes1_mask, tf.transpose(boxes2_mask, perm))
iou_lookup_table = tf.where(background_mask, mask_val_t, res)
return iou_lookup_table
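# Illustrative usage sketch (not part of the library source; values assumed):
# identical boxes yield an IoU of 1.0 and disjoint boxes yield 0.0, returned
# as an [N, M] lookup table indexed by [boxes1_index, boxes2_index].
#
#   import tensorflow as tf
#   from keras_cv.bounding_box import iou as iou_lib
#
#   boxes1 = tf.constant([[0.0, 0.0, 10.0, 10.0]])
#   boxes2 = tf.constant([[0.0, 0.0, 10.0, 10.0],
#                         [100.0, 100.0, 110.0, 110.0]])
#   iou_lib.compute_iou(boxes1, boxes2, bounding_box_format="xyxy")
#   # -> [[1., 0.]]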
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for iou functions."""
import numpy as np
import tensorflow as tf
from keras_cv.bounding_box import iou as iou_lib
class IoUTest(tf.test.TestCase):
def test_compute_single_iou(self):
bb1 = tf.constant([[100, 101, 200, 201]], dtype=tf.float32)
bb1_off_by_1 = tf.constant([[101, 102, 201, 202]], dtype=tf.float32)
# area of bb1 and bb1_off_by_1 are each 10000.
# intersection area is 99*99=9801
# iou=9801/(2*10000 - 9801)=0.96097656633
print(iou_lib.compute_iou(bb1, bb1_off_by_1, "yxyx"))
self.assertAlmostEqual(
iou_lib.compute_iou(bb1, bb1_off_by_1, "yxyx")[0], 0.96097656633
)
def test_compute_iou(self):
bb1 = [100, 101, 200, 201]
bb1_off_by_1_pred = [101, 102, 201, 202]
iou_bb1_bb1_off = 0.96097656633
top_left_bounding_box = [0, 2, 1, 3]
far_away_box = [1300, 1400, 1500, 1401]
another_far_away_pred = [1000, 1400, 1200, 1401]
# Rows represent predictions, columns ground truths
expected_result = np.array(
[[iou_bb1_bb1_off, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]],
dtype=np.float32,
)
sample_y_true = tf.constant(
[bb1, top_left_bounding_box, far_away_box], dtype=tf.float32
)
sample_y_pred = tf.constant(
[bb1_off_by_1_pred, top_left_bounding_box, another_far_away_pred],
dtype=tf.float32,
)
result = iou_lib.compute_iou(sample_y_true, sample_y_pred, "yxyx")
self.assertAllClose(expected_result, result.numpy())
def test_batched_compute_iou(self):
bb1 = [100, 101, 200, 201]
bb1_off_by_1_pred = [101, 102, 201, 202]
iou_bb1_bb1_off = 0.96097656633
top_left_bounding_box = [0, 2, 1, 3]
far_away_box = [1300, 1400, 1500, 1401]
another_far_away_pred = [1000, 1400, 1200, 1401]
# Rows represent predictions, columns ground truths
expected_result = np.array(
[
[[iou_bb1_bb1_off, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]],
[[iou_bb1_bb1_off, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]],
],
dtype=np.float32,
)
sample_y_true = tf.constant(
[
[bb1, top_left_bounding_box, far_away_box],
[bb1, top_left_bounding_box, far_away_box],
],
dtype=tf.float32,
)
sample_y_pred = tf.constant(
[
[bb1_off_by_1_pred, top_left_bounding_box, another_far_away_pred],
[bb1_off_by_1_pred, top_left_bounding_box, another_far_away_pred],
],
dtype=tf.float32,
)
result = iou_lib.compute_iou(sample_y_true, sample_y_pred, "yxyx")
self.assertAllClose(expected_result, result.numpy())
def test_batched_boxes1_unbatched_boxes2(self):
bb1 = [100, 101, 200, 201]
bb1_off_by_1_pred = [101, 102, 201, 202]
iou_bb1_bb1_off = 0.96097656633
top_left_bounding_box = [0, 2, 1, 3]
far_away_box = [1300, 1400, 1500, 1401]
another_far_away_pred = [1000, 1400, 1200, 1401]
# Rows represent predictions, columns ground truths
expected_result = np.array(
[
[[iou_bb1_bb1_off, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]],
[[iou_bb1_bb1_off, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]],
],
dtype=np.float32,
)
sample_y_true = tf.constant(
[
[bb1, top_left_bounding_box, far_away_box],
[bb1, top_left_bounding_box, far_away_box],
],
dtype=tf.float32,
)
sample_y_pred = tf.constant(
[bb1_off_by_1_pred, top_left_bounding_box, another_far_away_pred],
dtype=tf.float32,
)
result = iou_lib.compute_iou(sample_y_true, sample_y_pred, "yxyx")
self.assertAllClose(expected_result, result.numpy())
def test_unbatched_boxes1_batched_boxes2(self):
bb1 = [100, 101, 200, 201]
bb1_off_by_1_pred = [101, 102, 201, 202]
iou_bb1_bb1_off = 0.96097656633
top_left_bounding_box = [0, 2, 1, 3]
far_away_box = [1300, 1400, 1500, 1401]
another_far_away_pred = [1000, 1400, 1200, 1401]
# Rows represent predictions, columns ground truths
expected_result = np.array(
[
[[iou_bb1_bb1_off, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]],
[[iou_bb1_bb1_off, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]],
],
dtype=np.float32,
)
sample_y_true = tf.constant(
[
[bb1, top_left_bounding_box, far_away_box],
],
dtype=tf.float32,
)
sample_y_pred = tf.constant(
[
[bb1_off_by_1_pred, top_left_bounding_box, another_far_away_pred],
[bb1_off_by_1_pred, top_left_bounding_box, another_far_away_pred],
],
dtype=tf.float32,
)
result = iou_lib.compute_iou(sample_y_true, sample_y_pred, "yxyx")
self.assertAllClose(expected_result, result.numpy())
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
def pad_batch_to_shape(bounding_boxes, target_shape, padding_values=-1):
"""Pads a list of bounding boxes with -1s.
Boxes represented by all -1s are ignored by COCO metrics.
Sample usage:
bounding_box = [[1, 2, 3, 4], [5, 6, 7, 8]] # 2 bounding boxes in xywh or
corners format.
target_shape = [3, 4] # Add 1 more dummy bounding_box
result = pad_batch_to_shape(bounding_box, target_shape)
# result == [[1, 2, 3, 4], [5, 6, 7, 8], [-1, -1, -1, -1]]
target_shape = [2, 5] # Add 1 more index after the current 4 coordinates.
result = pad_batch_to_shape(bounding_box, target_shape)
# result == [[1, 2, 3, 4, -1], [5, 6, 7, 8, -1]]
Args:
bounding_boxes: tf.Tensor of bounding boxes in any format.
target_shape: Target shape to pad bounding box to. This should have the same
rank as the bounding_boxes. Note that if the target_shape contains any
dimension that is smaller than the bounding box shape, then no value will be
padded.
padding_values: value to pad, defaults to -1 to mask out in coco metrics.
Returns:
bounding_boxes padded to target shape.
Raises:
ValueError: when the target shape has a smaller rank than the bounding box
shape, or any dimension smaller than the corresponding bounding box dimension.
"""
bounding_box_shape = tf.shape(bounding_boxes)
if len(bounding_box_shape) != len(target_shape):
raise ValueError(
"Target shape should have same rank as the bounding box. "
f"Got bounding_box shape = {bounding_box_shape}, "
f"target_shape = {target_shape}"
)
for dim in range(len(target_shape)):
if bounding_box_shape[dim] > target_shape[dim]:
raise ValueError(
"Target shape should be larger than bounding box shape "
"in all dimensions. "
f"Got bounding_box shape = {bounding_box_shape}, "
f"target_shape = {target_shape}"
)
paddings = [
[0, target_shape[dim] - bounding_box_shape[dim]]
for dim in range(len(target_shape))
]
return tf.pad(
bounding_boxes, paddings, mode="CONSTANT", constant_values=padding_values
)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from keras_cv import bounding_box
class PadBatchToShapeTestCase(tf.test.TestCase):
def test_bounding_box_padding(self):
bounding_boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
target_shape = [3, 4]
result = bounding_box.pad_batch_to_shape(bounding_boxes, target_shape)
self.assertAllClose(result, [[1, 2, 3, 4], [5, 6, 7, 8], [-1, -1, -1, -1]])
target_shape = [2, 5]
result = bounding_box.pad_batch_to_shape(bounding_boxes, target_shape)
self.assertAllClose(result, [[1, 2, 3, 4, -1], [5, 6, 7, 8, -1]])
# Make sure to raise error if the rank is different between bounding_box and
# target shape
with self.assertRaisesRegex(ValueError, "Target shape should have same rank"):
bounding_box.pad_batch_to_shape(bounding_boxes, [1, 2, 3])
# Make sure raise error if the target shape is smaller
target_shape = [3, 2]
with self.assertRaisesRegex(
ValueError, "Target shape should be larger than bounding box shape"
):
bounding_box.pad_batch_to_shape(bounding_boxes, target_shape)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for working with bounding boxes."""
import tensorflow as tf
from keras_cv import bounding_box
from keras_cv.bounding_box.formats import XYWH
def _relative_area(bounding_boxes, bounding_box_format, images):
bounding_boxes = bounding_box.convert_format(
bounding_boxes,
source=bounding_box_format,
target="rel_xywh",
images=images,
)
widths = bounding_boxes[..., XYWH.WIDTH]
heights = bounding_boxes[..., XYWH.HEIGHT]
# handle corner case where shear performs a full inversion.
return tf.where(tf.math.logical_and(widths > 0, heights > 0), widths * heights, 0.0)
def clip_to_image(bounding_boxes, images, bounding_box_format):
"""clips bounding boxes to image boundaries.
`clip_to_image()` clips bounding boxes that have coordinates out of bounds of an
image down to the boundaries of the image. This is done by converting the bounding
box to relative formats, then clipping them to the `[0, 1]` range. Additionally,
bounding boxes that end up with a zero area have their class ID set to -1,
indicating that there is no object present in them.
Args:
bounding_boxes: bounding box tensor to clip.
images: list of images to clip the bounding boxes to.
bounding_box_format: the KerasCV bounding box format the bounding boxes are in.
"""
if bounding_boxes.shape[-1] < 5:
raise ValueError(
"`bounding_boxes` must include a class_id index on the final "
"axis. This is used to set `bounding_boxes` that are fully outside of the "
"provided image to the background class, -1."
)
bounding_boxes = bounding_box.convert_format(
bounding_boxes,
source=bounding_box_format,
target="rel_xyxy",
images=images,
)
bounding_boxes, images, squeeze = _format_inputs(bounding_boxes, images)
x1, y1, x2, y2, rest = tf.split(
bounding_boxes, [1, 1, 1, 1, bounding_boxes.shape[-1] - 4], axis=-1
)
clipped_bounding_boxes = tf.concat(
[
tf.clip_by_value(x1, clip_value_min=0, clip_value_max=1),
tf.clip_by_value(y1, clip_value_min=0, clip_value_max=1),
tf.clip_by_value(x2, clip_value_min=0, clip_value_max=1),
tf.clip_by_value(y2, clip_value_min=0, clip_value_max=1),
rest,
],
axis=-1,
)
areas = _relative_area(
clipped_bounding_boxes, bounding_box_format="rel_xyxy", images=images
)
clipped_bounding_boxes = bounding_box.convert_format(
clipped_bounding_boxes,
source="rel_xyxy",
target=bounding_box_format,
images=images,
)
clipped_bounding_boxes = tf.where(
tf.expand_dims(areas > 0.0, axis=-1), clipped_bounding_boxes, -1.0
)
nan_indices = tf.math.reduce_any(tf.math.is_nan(clipped_bounding_boxes), axis=-1)
clipped_bounding_boxes = tf.where(
tf.expand_dims(nan_indices, axis=-1), -1.0, clipped_bounding_boxes
)
clipped_bounding_boxes = _format_outputs(clipped_bounding_boxes, squeeze)
return clipped_bounding_boxes
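# Illustrative usage sketch (not part of the library source; values assumed,
# and it is assumed clip_to_image is exported from keras_cv.bounding_box like
# the other utilities here): a box extending past the edge of a 100x100 image
# is clipped to the image boundary, while a box lying fully outside the image
# is set to -1 (background) in every index, including its class id.
#
#   import tensorflow as tf
#   from keras_cv import bounding_box
#
#   images = tf.zeros([100, 100, 3])
#   boxes = tf.constant([[50.0, 50.0, 150.0, 80.0, 1.0],       # partly outside
#                        [200.0, 200.0, 300.0, 300.0, 1.0]])   # fully outside
#   bounding_box.clip_to_image(boxes, images=images, bounding_box_format="xyxy")
#   # -> [[50., 50., 100., 80., 1.], [-1., -1., -1., -1., -1.]]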
# TODO (tanzhenyu): merge with clip_to_image
def _clip_boxes(boxes, box_format, image_shape):
"""Clip boxes to the boundaries of the image shape"""
if boxes.shape[-1] != 4:
raise ValueError(
"boxes.shape[-1] is {:d}, but must be 4.".format(boxes.shape[-1])
)
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width, _ = image_shape
max_length = [height, width, height, width]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width, _ = tf.unstack(image_shape, axis=-1)
max_length = tf.stack([height, width, height, width], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def _format_inputs(boxes, images):
boxes_rank = len(boxes.shape)
if boxes_rank > 3:
raise ValueError(
"Expected len(boxes.shape)=2, or len(boxes.shape)=3, got "
f"len(boxes.shape)={boxes_rank}"
)
boxes_includes_batch = boxes_rank == 3
# Determine if images needs an expand_dims() call
if images is not None:
images_rank = len(images.shape)
if images_rank > 4:
raise ValueError(
"Expected len(images.shape)=2, or len(images.shape)=3, got "
f"len(images.shape)={images_rank}"
)
images_include_batch = images_rank == 4
if boxes_includes_batch != images_include_batch:
raise ValueError(
"clip_to_image() expects both boxes and images to be batched, or both "
f"boxes and images to be unbatched. Received len(boxes.shape)={boxes_rank}, "
f"len(images.shape)={images_rank}. Expected either len(boxes.shape)=2 AND "
"len(images.shape)=3, or len(boxes.shape)=3 AND len(images.shape)=4."
)
if not images_include_batch:
images = tf.expand_dims(images, axis=0)
if not boxes_includes_batch:
return tf.expand_dims(boxes, axis=0), images, True
return boxes, images, False
def _format_outputs(boxes, squeeze):
if squeeze:
return tf.squeeze(boxes, axis=0)
return boxes
def pad_with_sentinels(bounding_boxes, sentinel_value=-1):
"""Pads the given bounding box tensor with sentinel_value.
This is done to convert RaggedTensors into standard Dense
tensors, which have better performance and compatibility
within the TensorFlow ecosystem.
Args:
bounding_boxes: a ragged tensor of bounding boxes.
Can be batched or unbatched.
sentinel_value: Value to set for indices not specified
in bounding_boxes. Defaults to -1.
Returns:
a Tensor containing the sentinel_value padded bounding boxes.
"""
return bounding_boxes.to_tensor(default_value=sentinel_value)
def filter_sentinels(bounding_boxes, sentinel_value=-1):
"""converts a Dense padded bounding box `tf.Tensor` to a `tf.RaggedTensor`.
Bounding boxes are ragged tensors in most use cases. Converting them to a dense
tensor makes it easier to work with Tensorflow ecosystem.
This function can be used to filter out the padded bounding boxes by
checking for padded sentinel value of the class_id axis of the bounding_boxes.
Args:
bounding_boxes: a Tensor of bounding boxes. May be batched, or unbatched.
sentinel_value: Value used to filter dense bounding box tensor.
bounding_boxes with class_id equal to sentinel_value will be dropped.
Returns:
`tf.RaggedTensor` or `tf.Tensor` containing the filtered bounding boxes.
"""
is_ragged = isinstance(bounding_boxes, tf.RaggedTensor)
if is_ragged:
bounding_boxes = bounding_box.pad_with_sentinels(
bounding_boxes, sentinel_value=sentinel_value
)
mask = bounding_boxes[..., 4] != sentinel_value
filtered_bounding_boxes = tf.ragged.boolean_mask(bounding_boxes, mask)
return filtered_bounding_boxes
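# Illustrative round-trip sketch (not part of the library source; values
# assumed, and it is assumed both utilities are exported from
# keras_cv.bounding_box, as this module itself calls
# bounding_box.pad_with_sentinels): padding a ragged batch to a dense tensor
# with -1 sentinels and filtering them back out recovers the ragged structure.
#
#   import tensorflow as tf
#   from keras_cv import bounding_box
#
#   ragged_boxes = tf.ragged.constant(
#       [[[10.0, 10.0, 20.0, 20.0, 1.0]], []],
#       ragged_rank=1,
#       inner_shape=(5,),
#   )
#   dense = bounding_box.pad_with_sentinels(ragged_boxes)   # shape [2, 1, 5]
#   restored = bounding_box.filter_sentinels(dense)         # ragged again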
def add_class_id(bounding_boxes, class_id=0):
"""Add class ID to a new dimension of the final axis of a bounding box Tensor.
Bounding box utilities in KerasCV expect bounding boxes to have class IDs.
This utility adds a class ID to a new axis of the provided tf.Tensor.
Usage:
```python
bounding_boxes = tf.random.uniform(shape=[2, 2, 4])
bounding_boxes_with_class_id = keras_cv.bounding_box.add_class_id(
bounding_boxes, class_id=1)
# bounding_boxes_with_class_id is a Tensor of shape [2, 2, 5]
```
Args:
bounding_boxes: a `tf.Tensor` of bounding_boxes, may be batched or unbatched.
class_id: (Optional) The value of class id that needs to be padded.
Defaults to 0.
Returns:
`tf.Tensor` with an additional class id padded to the original bounding boxes.
"""
# format input bounding boxes
is_ragged = isinstance(bounding_boxes, tf.RaggedTensor)
if is_ragged:
row_lengths = list(bounding_boxes.nested_row_lengths())
# increase row length to account for class-id addition
row_lengths[1] = row_lengths[1] + 1
bounding_boxes = bounding_boxes.to_tensor()
# pad input bounding boxes
if bounding_boxes.shape[-1] != 4:
raise ValueError(
"The number of values along the final axis of `bounding_boxes` is "
"expected to be 4. But got {}.".format(bounding_boxes.shape[-1])
)
bounding_box_rank = len(tf.shape(bounding_boxes))
if bounding_box_rank == 2:
paddings = tf.constant([[0, 0], [0, 1]])
elif bounding_box_rank == 3:
paddings = tf.constant([[0, 0], [0, 0], [0, 1]])
else:
raise ValueError(
f"`bounding_boxes` should be of rank 2 or 3. However "
f"add_class_id received `bounding_boxes` of rank={bounding_box_rank}"
)
bounding_boxes = tf.pad(
bounding_boxes,
paddings=paddings,
mode="CONSTANT",
constant_values=class_id,
)
# format output bounding boxes
if is_ragged:
bounding_boxes = tf.RaggedTensor.from_tensor(
bounding_boxes,
lengths=row_lengths,
)
return bounding_boxes