# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from keras_cv.datasets.pascal_voc.load import load
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from keras_cv import bounding_box
def curry_map_function(bounding_box_format, img_size):
"""Mapping function to create batched image and bbox coordinates"""
if img_size is not None:
resizing = keras.layers.Resizing(
height=img_size[0], width=img_size[1], crop_to_aspect_ratio=False
)
# TODO(lukewood): update `keras.layers.Resizing` to support bounding boxes.
def apply(inputs):
# Support image size none.
if img_size is not None:
inputs["image"] = resizing(inputs["image"])
inputs["objects"]["bbox"] = bounding_box.convert_format(
inputs["objects"]["bbox"],
images=inputs["image"],
source="rel_yxyx",
target=bounding_box_format,
)
bounding_boxes = inputs["objects"]["bbox"]
labels = tf.cast(inputs["objects"]["label"], tf.float32)
labels = tf.expand_dims(labels, axis=-1)
bounding_boxes = tf.concat([bounding_boxes, labels], axis=-1)
return {"images": inputs["image"], "bounding_boxes": bounding_boxes}
return apply
def load(
split,
bounding_box_format,
batch_size=None,
shuffle_buffer=None,
shuffle_files=True,
img_size=None,
):
"""Loads the PascalVOC 2007 dataset.
Usage:
```python
dataset, ds_info = keras_cv.datasets.pascal_voc.load(
split="train", bounding_box_format="xywh", batch_size=9
)
```
Args:
split: the split string passed to the `tensorflow_datasets.load()` call. Should
be one of "train", "test", or "validation."
        bounding_box_format: the keras_cv bounding box format to load the boxes into.
            Refer [to the keras.io docs](https://keras.io/api/keras_cv/bounding_box/formats/)
            for more details on supported bounding box formats.
batch_size: (Optional) how many instances to include in batches after loading. If
not provided, no batching will occur.
shuffle_buffer: (Optional) the size of the buffer to use in shuffling.
shuffle_files: (Optional) whether or not to shuffle files, defaults to True.
        img_size: (Optional) size to resize the images to. By default, images are not
            resized, and `tf.RaggedTensor` batches are produced if batching occurs.
Returns:
tf.data.Dataset containing PascalVOC. Each entry is a dictionary containing
keys {"images": images, "bounding_boxes": bounding_boxes} where images is a
Tensor of shape [batch, H, W, 3] and bounding_boxes is a `tf.RaggedTensor` of
shape [batch, None, 5].
"""
dataset, dataset_info = tfds.load(
"voc/2007", split=split, shuffle_files=shuffle_files, with_info=True
)
dataset = dataset.map(
curry_map_function(bounding_box_format=bounding_box_format, img_size=img_size),
num_parallel_calls=tf.data.AUTOTUNE,
)
if shuffle_buffer:
dataset = dataset.shuffle(shuffle_buffer, reshuffle_each_iteration=True)
if batch_size is not None:
dataset = dataset.apply(
tf.data.experimental.dense_to_ragged_batch(batch_size=batch_size)
)
return dataset, dataset_info
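

# The snippet below is a minimal usage sketch (not part of the original module):
# it loads a batched "xywh" dataset and inspects one batch. It assumes the
# `voc/2007` TFDS dataset can be downloaded in the current environment.
if __name__ == "__main__":
    demo_dataset, demo_info = load(
        split="train", bounding_box_format="xywh", batch_size=8, img_size=(512, 512)
    )
    for example in demo_dataset.take(1):
        # With `img_size` set, images are dense; bounding boxes stay ragged
        # because each image has a different number of objects.
        print(example["images"].shape)          # (8, 512, 512, 3)
        print(example["bounding_boxes"].shape)  # (8, None, 5)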
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data loader for Pascal VOC 2012 segmentation dataset.
The image classification and object detection (bounding box) data is covered by existing
TF datasets in https://www.tensorflow.org/datasets/catalog/voc. The segmentation data
(both class segmentation and instance segmentation) is included in VOC 2012, but is not
offered by TF-DS yet. This module fills that gap until the TFDS team addresses the
feature (b/252870855, https://github.com/tensorflow/datasets/issues/27 and
https://github.com/tensorflow/datasets/pull/1198).
The schema design is similar to the existing design of TFDS, but trimmed to fit the needs
of KerasCV models.
This module contains the following functionality:
1. Download and unpack the original data from Pascal VOC.
2. Reprocess and build up a dataset that includes image, class label, object bounding
   boxes, and class and instance segmentation masks.
3. Produce tfrecords from the dataset.
4. Load existing tfrecords produced in step 3.
"""
import logging
import multiprocessing
import os.path
import tarfile
import xml
import tensorflow as tf
DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar"
# Note that this list doesn't contain the background class. In the classification use
# case, the label is 0-based (aeroplane -> 0), whereas in the segmentation use case, 0 is
# reserved for background, so aeroplane maps to 1.
CLASSES = [
"aeroplane",
"bicycle",
"bird",
"boat",
"bottle",
"bus",
"car",
"cat",
"chair",
"cow",
"diningtable",
"dog",
"horse",
"motorbike",
"person",
"pottedplant",
"sheep",
"sofa",
"train",
"tvmonitor",
]
# This is used to map between string class to index.
CLASS_TO_INDEX = {name: index for index, name in enumerate(CLASSES)}
# For the mask data in the PNG file, the encoded raw pixel value needs to be converted
# to the proper class index. In the following map, [0, 0, 0] is converted to 0,
# [128, 0, 0] is converted to 1, and so on. Also note that the mask classes are 1-based
# since class 0 is reserved for the background. The [128, 0, 0] (class 1) is mapped
# to `aeroplane`.
VOC_PNG_COLOR_VALUE = [
[0, 0, 0],
[128, 0, 0],
[0, 128, 0],
[128, 128, 0],
[0, 0, 128],
[128, 0, 128],
[0, 128, 128],
[128, 128, 128],
[64, 0, 0],
[192, 0, 0],
[64, 128, 0],
[192, 128, 0],
[64, 0, 128],
[192, 0, 128],
[64, 128, 128],
[192, 128, 128],
[0, 64, 0],
[128, 64, 0],
[0, 192, 0],
[128, 192, 0],
[0, 64, 128],
]
# Will be populated by _maybe_populate_voc_color_mapping() below.
VOC_PNG_COLOR_MAPPING = None
def _maybe_populate_voc_color_mapping():
# Lazy creation of VOC_PNG_COLOR_MAPPING, which could take 64M memory.
global VOC_PNG_COLOR_MAPPING
if VOC_PNG_COLOR_MAPPING is None:
VOC_PNG_COLOR_MAPPING = [0] * (256**3)
for i, colormap in enumerate(VOC_PNG_COLOR_VALUE):
VOC_PNG_COLOR_MAPPING[
(colormap[0] * 256 + colormap[1]) * 256 + colormap[2]
] = i
# There is a special mapping with [224, 224, 192] -> 255
VOC_PNG_COLOR_MAPPING[224 * 256 * 256 + 224 * 256 + 192] = 255
VOC_PNG_COLOR_MAPPING = tf.constant(VOC_PNG_COLOR_MAPPING)
return VOC_PNG_COLOR_MAPPING
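

def _example_color_to_class(color):
    """Illustrative helper (an assumption, not part of the original API).

    Shows how a single RGB triplet is flattened into the index used by
    `VOC_PNG_COLOR_MAPPING`: e.g. [128, 0, 0] -> 1 ("aeroplane"),
    [0, 0, 0] -> 0 (background), and [224, 224, 192] -> 255 (boundary).
    """
    mapping = _maybe_populate_voc_color_mapping()
    index = (color[0] * 256 + color[1]) * 256 + color[2]
    return int(mapping[index])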
def _download_pascal_voc_2012(data_url, local_dir_path=None, override_extract=False):
"""Fetch the original Pascal VOC 2012 from remote URL.
Args:
data_url: string, the URL for the Pascal VOC data, should be in a tar package.
        local_dir_path: string, the local directory path to save the data.
        override_extract: bool, whether to re-extract the data even if the extracted
            `VOCdevkit` folder already exists.
Returns:
the path to the folder of extracted Pascal VOC data.
"""
if not local_dir_path:
fname = "pascal_voc_2012/data.tar"
else:
# Make sure the directory exists
if not os.path.exists(local_dir_path):
os.makedirs(local_dir_path, exist_ok=True)
fname = os.path.join(local_dir_path, "data.tar")
data_file_path = tf.keras.utils.get_file(fname=fname, origin=data_url)
logging.info("Received data file from %s", data_file_path)
    # Extract the data into the same directory as the tar file.
data_directory = os.path.dirname(data_file_path)
# Note that the extracted data will be located in a folder `VOCdevkit` (from tar).
# If the folder is already there and `override_extract` is False, then we will skip
# extracting the folder again.
if override_extract or not os.path.exists(
os.path.join(data_directory, "VOCdevkit")
):
logging.info("Extract data into %s", data_directory)
with tarfile.open(data_file_path) as f:
f.extractall(data_directory)
return os.path.join(data_directory, "VOCdevkit", "VOC2012")
def _parse_annotation_data(annotation_file_path):
"""Parse the annotation XML file for the image.
The annotation contains the metadata, as well as the object bounding box information.
"""
with tf.io.gfile.GFile(annotation_file_path, "r") as f:
root = xml.etree.ElementTree.parse(f).getroot()
size = root.find("size")
width = int(size.find("width").text)
height = int(size.find("height").text)
objects = []
for obj in root.findall("object"):
# Get object's label name.
label = CLASS_TO_INDEX[obj.find("name").text.lower()]
        # Get the object's pose name.
pose = obj.find("pose").text.lower()
is_truncated = obj.find("truncated").text == "1"
is_difficult = obj.find("difficult").text == "1"
bndbox = obj.find("bndbox")
xmax = int(bndbox.find("xmax").text)
xmin = int(bndbox.find("xmin").text)
ymax = int(bndbox.find("ymax").text)
ymin = int(bndbox.find("ymin").text)
objects.append(
{
"label": label,
"pose": pose,
"bbox": [ymin, xmin, ymax, xmax],
"is_truncated": is_truncated,
"is_difficult": is_difficult,
}
)
return {"width": width, "height": height, "objects": objects}
def _get_image_ids(data_dir, split):
data_file_mapping = {"train": "train.txt", "eval": "val.txt", None: "trainval.txt"}
with tf.io.gfile.GFile(
os.path.join(data_dir, "ImageSets", "Segmentation", data_file_mapping[split]),
"r",
) as f:
image_ids = f.read().splitlines()
logging.info(f"Received {len(image_ids)} images for {split} dataset.")
return image_ids
def _parse_single_image(image_file_path):
data_dir, image_file_name = os.path.split(image_file_path)
data_dir = os.path.normpath(os.path.join(data_dir, os.path.pardir))
image_id, _ = os.path.splitext(image_file_name)
class_segmentation_file_path = os.path.join(
data_dir, "SegmentationClass", image_id + ".png"
)
object_segmentation_file_path = os.path.join(
data_dir, "SegmentationObject", image_id + ".png"
)
annotation_file_path = os.path.join(data_dir, "Annotations", image_id + ".xml")
image_annotations = _parse_annotation_data(annotation_file_path)
result = {
"image/filename": image_id + ".jpg",
"image/file_path": image_file_path,
"segmentation/class/file_path": class_segmentation_file_path,
"segmentation/object/file_path": object_segmentation_file_path,
}
result.update(image_annotations)
    # The labels field should be the same as the set of 'objects.label' values.
labels = list(set([o["label"] for o in result["objects"]]))
result["labels"] = sorted(labels)
return result
def _build_metadata(data_dir, image_ids):
# Parallel process all the images.
image_file_paths = [
os.path.join(data_dir, "JPEGImages", i + ".jpg") for i in image_ids
]
pool_size = 10 if len(image_ids) > 10 else len(image_ids)
with multiprocessing.Pool(pool_size) as p:
metadata = p.map(_parse_single_image, image_file_paths)
    # Transpose the metadata, converting from a list of dicts to a dict of lists.
keys = [
"image/filename",
"image/file_path",
"segmentation/class/file_path",
"segmentation/object/file_path",
"labels",
"width",
"height",
]
result = {}
for key in keys:
values = [value[key] for value in metadata]
result[key] = values
# The ragged objects need some special handling
for key in ["label", "pose", "bbox", "is_truncated", "is_difficult"]:
values = []
objects = [value["objects"] for value in metadata]
        for objects_in_image in objects:
            values.append([o[key] for o in objects_in_image])
result["objects/" + key] = values
return result
# With jit_compile=True, there is about 0.4 sec of compilation overhead, but it saves
# about 0.2 sec per 1000 images. See
# https://github.com/keras-team/keras-cv/pull/943#discussion_r1001092882 for more details.
@tf.function(jit_compile=True)
def _decode_png_mask(mask):
"""Decode the raw PNG image and convert it to 2D tensor with probably class."""
# Cast the mask to int32 since the original uint8 will overflow when multiple with 256
mask = tf.cast(mask, tf.int32)
mask = mask[:, :, 0] * 256 * 256 + mask[:, :, 1] * 256 + mask[:, :, 2]
mask = tf.expand_dims(tf.gather(VOC_PNG_COLOR_MAPPING, mask), -1)
mask = tf.cast(mask, tf.uint8)
return mask
def _load_images(example):
image_file_path = example.pop("image/file_path")
segmentation_class_file_path = example.pop("segmentation/class/file_path")
segmentation_object_file_path = example.pop("segmentation/object/file_path")
image = tf.io.read_file(image_file_path)
image = tf.image.decode_jpeg(image)
segmentation_class_mask = tf.io.read_file(segmentation_class_file_path)
segmentation_class_mask = tf.image.decode_png(segmentation_class_mask)
segmentation_class_mask = _decode_png_mask(segmentation_class_mask)
segmentation_object_mask = tf.io.read_file(segmentation_object_file_path)
segmentation_object_mask = tf.image.decode_png(segmentation_object_mask)
segmentation_object_mask = _decode_png_mask(segmentation_object_mask)
example.update(
{
"image": image,
"class_segmentation": segmentation_class_mask,
"object_segmentation": segmentation_object_mask,
}
)
return example
def _build_dataset_from_metadata(metadata):
# The objects need some manual conversion to ragged tensor.
metadata["labels"] = tf.ragged.constant(metadata["labels"])
metadata["objects/label"] = tf.ragged.constant(metadata["objects/label"])
metadata["objects/pose"] = tf.ragged.constant(metadata["objects/pose"])
metadata["objects/is_truncated"] = tf.ragged.constant(
metadata["objects/is_truncated"]
)
metadata["objects/is_difficult"] = tf.ragged.constant(
metadata["objects/is_difficult"]
)
metadata["objects/bbox"] = tf.ragged.constant(
metadata["objects/bbox"], ragged_rank=1
)
dataset = tf.data.Dataset.from_tensor_slices(metadata)
dataset = dataset.map(_load_images, num_parallel_calls=tf.data.AUTOTUNE)
return dataset
def load(
split="train",
data_dir=None,
):
"""Load the Pacal VOC 2012 dataset.
This function will download the data tar file from remote if needed, and untar to
the local `data_dir`, and build dataset from it.
Args:
split: string, can be 'train', 'eval', or None. When None, both train and eval data
will be loaded. Default to `train`
data_dir: string, local directory path for the loaded data. This will be used to
download the data file, and unzip. It will be used as a cach directory.
Default to None, and `~/.keras/pascal_voc_2012` will be used.
"""
supported_split_value = ["train", "eval", None]
if split not in supported_split_value:
raise ValueError(
f"The support value for `split` are {supported_split_value}. "
f"Got: {split}"
)
if data_dir is not None:
data_dir = os.path.expanduser(data_dir)
data_dir = _download_pascal_voc_2012(DATA_URL, local_dir_path=data_dir)
image_ids = _get_image_ids(data_dir, split)
metadata = _build_metadata(data_dir, image_ids)
_maybe_populate_voc_color_mapping()
dataset = _build_dataset_from_metadata(metadata)
return dataset
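

# Minimal usage sketch (not part of the original module): build the training
# split and look at one example. This assumes the full VOC 2012 tarball can be
# downloaded from DATA_URL in the current environment.
if __name__ == "__main__":
    train_ds = load(split="train")
    example = next(iter(train_ds.take(1)))
    print(example["image"].shape)               # (H, W, 3) uint8 image
    print(example["class_segmentation"].shape)  # (H, W, 1) class mask, 255 = boundary
    print(example["objects/bbox"])              # [num_objects, 4] boxes in yxyx order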
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pathlib
import sys
import tensorflow as tf
from absl import flags
from keras_cv.datasets.pascal_voc import segmentation
class PascalVocSegmentationDataTest(tf.test.TestCase):
def setUp(self):
super().setUp()
self.tempdir = self.get_tempdir()
        # Note that this will not work with bazel; it needs to be rewritten to rely on
        # FLAGS.test_srcdir.
self.test_data_tar_path = os.path.abspath(
os.path.join(
os.path.abspath(__file__), os.path.pardir, "test_data", "VOC_mini.tar"
)
)
def get_tempdir(self):
try:
flags.FLAGS.test_tmpdir
except flags.UnparsedFlagAccessError:
# Need to initialize flags when running `pytest`.
flags.FLAGS(sys.argv, known_only=True)
return self.create_tempdir().full_path
def test_download_data(self):
# Since the original data package is too large, we use a small package as a
# replacement.
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
test_data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
self.assertTrue(os.path.exists(test_data_dir))
# Make sure the data is unzipped correctly and populated with correct content
expected_subdirs = [
"Annotations",
"ImageSets",
"JPEGImages",
"SegmentationClass",
"SegmentationObject",
]
for sub_dir in expected_subdirs:
self.assertTrue(os.path.exists(os.path.join(test_data_dir, sub_dir)))
def test_skip_download_and_override(self):
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
test_data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
        # Touch a file in the test_data_dir and make sure it still exists (is not
        # overridden) when _download_pascal_voc_2012 is invoked again.
os.makedirs(os.path.join(test_data_dir, "Annotations", "dummy_dir"))
segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
override_extract=False,
)
self.assertTrue(
os.path.exists(os.path.join(test_data_dir, "Annotations", "dummy_dir"))
)
def test_get_image_ids(self):
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
train_ids = ["2007_000032", "2007_000039", "2007_000063"]
eval_ids = ["2007_000033"]
train_eval_ids = train_ids + eval_ids
self.assertEquals(segmentation._get_image_ids(data_dir, "train"), train_ids)
self.assertEquals(segmentation._get_image_ids(data_dir, "eval"), eval_ids)
self.assertEquals(segmentation._get_image_ids(data_dir, None), train_eval_ids)
def test_parse_annotation_file(self):
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
        # One of the train files.
annotation_file = os.path.join(data_dir, "Annotations", "2007_000032.xml")
metadata = segmentation._parse_annotation_data(annotation_file)
expected_result = {
"height": 281,
"width": 500,
"objects": [
{
"label": 0,
"pose": "frontal",
"bbox": [78, 104, 183, 375],
"is_truncated": False,
"is_difficult": False,
},
{
"label": 0,
"pose": "left",
"bbox": [88, 133, 123, 197],
"is_truncated": False,
"is_difficult": False,
},
{
"label": 14,
"pose": "rear",
"bbox": [180, 195, 229, 213],
"is_truncated": False,
"is_difficult": False,
},
{
"label": 14,
"pose": "rear",
"bbox": [189, 26, 238, 44],
"is_truncated": False,
"is_difficult": False,
},
],
}
self.assertEquals(metadata, expected_result)
def test_decode_png_mask(self):
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
mask_file = os.path.join(data_dir, "SegmentationClass", "2007_000032.png")
mask = tf.io.decode_png(tf.io.read_file(mask_file))
segmentation._maybe_populate_voc_color_mapping()
mask = segmentation._decode_png_mask(mask)
self.assertEquals(mask.shape, (281, 500, 1))
self.assertEquals(tf.reduce_max(mask), 255) # The 255 value is for the boundary
self.assertEquals(tf.reduce_min(mask), 0) # The 0 value is for the background
# The mask contains two classes, 1 and 15, see the label section in the previous
# test case.
self.assertEquals(tf.reduce_sum(tf.cast(tf.equal(mask, 1), tf.int32)), 4734)
self.assertEquals(tf.reduce_sum(tf.cast(tf.equal(mask, 15), tf.int32)), 866)
def test_parse_single_image(self):
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
image_file = os.path.join(data_dir, "JPEGImages", "2007_000032.jpg")
result_dict = segmentation._parse_single_image(image_file)
expected_result = {
"image/filename": "2007_000032.jpg",
"image/file_path": image_file,
"height": 281,
"width": 500,
"objects": [
{
"label": 0,
"pose": "frontal",
"bbox": [78, 104, 183, 375],
"is_truncated": False,
"is_difficult": False,
},
{
"label": 0,
"pose": "left",
"bbox": [88, 133, 123, 197],
"is_truncated": False,
"is_difficult": False,
},
{
"label": 14,
"pose": "rear",
"bbox": [180, 195, 229, 213],
"is_truncated": False,
"is_difficult": False,
},
{
"label": 14,
"pose": "rear",
"bbox": [189, 26, 238, 44],
"is_truncated": False,
"is_difficult": False,
},
],
"labels": [0, 14],
"segmentation/class/file_path": os.path.join(
data_dir, "SegmentationClass", "2007_000032.png"
),
"segmentation/object/file_path": os.path.join(
data_dir, "SegmentationObject", "2007_000032.png"
),
}
self.assertEquals(result_dict, expected_result)
def test_build_metadata(self):
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
image_ids = segmentation._get_image_ids(data_dir, None)
metadata = segmentation._build_metadata(data_dir, image_ids)
self.assertEquals(
metadata["image/filename"],
[
"2007_000032.jpg",
"2007_000039.jpg",
"2007_000063.jpg",
"2007_000033.jpg",
],
)
expected_keys = [
"image/filename",
"image/file_path",
"segmentation/class/file_path",
"segmentation/object/file_path",
"labels",
"width",
"height",
"objects/label",
"objects/pose",
"objects/bbox",
"objects/is_truncated",
"objects/is_difficult",
]
for key in expected_keys:
self.assertLen(metadata[key], 4)
def test_build_dataset(self):
local_data_dir = os.path.join(self.tempdir, "pascal_voc_2012/")
data_dir = segmentation._download_pascal_voc_2012(
data_url=pathlib.Path(self.test_data_tar_path).as_uri(),
local_dir_path=local_data_dir,
)
image_ids = segmentation._get_image_ids(data_dir, None)
metadata = segmentation._build_metadata(data_dir, image_ids)
segmentation._maybe_populate_voc_color_mapping()
dataset = segmentation._build_dataset_from_metadata(metadata)
entry = next(dataset.take(1).as_numpy_iterator())
self.assertEquals(entry["image/filename"], b"2007_000032.jpg")
expected_keys = [
"image",
"image/filename",
"labels",
"width",
"height",
"objects/label",
"objects/pose",
"objects/bbox",
"objects/is_truncated",
"objects/is_difficult",
"class_segmentation",
"object_segmentation",
]
for key in expected_keys:
self.assertIn(key, entry)
# Check the mask png content
png = entry["class_segmentation"]
self.assertEquals(png.shape, (281, 500, 1))
self.assertEquals(tf.reduce_max(png), 255) # The 255 value is for the boundary
self.assertEquals(tf.reduce_min(png), 0) # The 0 value is for the background
# The mask contains two classes, 1 and 15, see the label section in the previous
# test case.
self.assertEquals(tf.reduce_sum(tf.cast(tf.equal(png, 1), tf.int32)), 4734)
self.assertEquals(tf.reduce_sum(tf.cast(tf.equal(png, 15), tf.int32)), 866)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from keras_cv.keypoint.converters import convert_format
from keras_cv.keypoint.formats import REL_XY
from keras_cv.keypoint.formats import XY
from keras_cv.keypoint.utils import filter_out_of_image
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converter functions for working with keypoints formats."""
import tensorflow as tf
# Internal exception
class _RequiresImagesException(Exception):
pass
def _rel_xy_to_xy(keypoints, images=None):
if images is None:
raise _RequiresImagesException()
shape = tf.cast(tf.shape(images), keypoints.dtype)
h, w = shape[1], shape[2]
x, y, rest = tf.split(keypoints, [1, 1, keypoints.shape[-1] - 2], axis=-1)
return tf.concat([x * w, y * h, rest], axis=-1)
def _xy_to_rel_xy(keypoints, images=None):
if images is None:
raise _RequiresImagesException()
shape = tf.cast(tf.shape(images), keypoints.dtype)
h, w = shape[1], shape[2]
x, y, rest = tf.split(keypoints, [1, 1, keypoints.shape[-1] - 2], axis=-1)
return tf.concat([x / w, y / h, rest], axis=-1)
def _xy_noop(keypoints, images=None):
return keypoints
TO_XY_CONVERTERS = {
"xy": _xy_noop,
"rel_xy": _rel_xy_to_xy,
}
FROM_XY_CONVERTERS = {
"xy": _xy_noop,
"rel_xy": _xy_to_rel_xy,
}
def convert_format(keypoints, source, target, images=None, dtype=None):
"""Converts keypoints from one format to another.
    Supported formats are:
    - `"xy"`, absolute pixel positions.
    - `"rel_xy"`, relative pixel positions.
    Formats are case insensitive.
    Relative formats, abbreviated `rel`, make use of the shapes of the
    `images` passed. In these formats, the coordinates are specified as
    fractions of the host image's width and height.
    `images` may be a ragged Tensor. Note that using a ragged Tensor
    for images may cause a substantial performance loss, as each image
    will need to be processed separately due to the mismatching image
    shapes.
Usage:
```python
images, keypoints = load_my_dataset()
keypoints_in_rel = keras_cv.keypoint.convert_format(
        keypoints,
source='xy',
target='rel_xy',
images=images,
)
```
Args:
keypoints: tf.Tensor or tf.RaggedTensor representing keypoints
in the format specified in the `source` parameter.
`keypoints` can optionally have extra dimensions stacked
on the final axis to store metadata. keypoints should
have a rank between 2 and 4, with the shape
`[num_boxes,*]`, `[batch_size, num_boxes, *]` or
`[batch_size, num_groups, num_keypoints,*]`.
        source: One of `"xy"` or `"rel_xy"`. Used to specify the original
            format of the `keypoints` parameter.
        target: One of `"xy"` or `"rel_xy"`. Used to specify the
            destination format of the `keypoints` parameter.
        images: (Optional) a batch of images aligned with `keypoints` on
            the first axis. Should be rank 3 (`HWC` format) or 4
            (`BHWC` format). Used in some converters to compute
            relative pixel values of the keypoint coordinates.
            Required when transforming between relative and absolute
            formats.
        dtype: the data type to use when transforming the keypoints.
            Defaults to None, i.e. the `keypoints` dtype.
"""
source = source.lower()
target = target.lower()
if source not in TO_XY_CONVERTERS:
raise ValueError(
f"convert_format() received an unsupported format for the argument "
f"`source`. `source` should be one of {TO_XY_CONVERTERS.keys()}. "
f"Got source={source}"
)
if target not in FROM_XY_CONVERTERS:
raise ValueError(
f"convert_format() received an unsupported format for the argument "
f"`target`. `target` should be one of {FROM_XY_CONVERTERS.keys()}. "
f"Got target={target}"
)
if dtype:
keypoints = tf.cast(keypoints, dtype)
if source == target:
return keypoints
keypoints, images, squeeze_axis = _format_inputs(keypoints, images)
try:
in_xy = TO_XY_CONVERTERS[source](keypoints, images=images)
result = FROM_XY_CONVERTERS[target](in_xy, images=images)
except _RequiresImagesException:
raise ValueError(
"convert_format() must receive `images` when transforming "
f"between relative and absolute formats. "
f"convert_format() received source=`{source}`, target=`{target}`, "
f"but images={images}"
)
return _format_outputs(result, squeeze_axis)
def _format_inputs(keypoints, images):
keypoints_rank = len(keypoints.shape)
if keypoints_rank > 4:
raise ValueError(
"Expected keypoints rank to be in [2, 4], got "
f"len(keypoints.shape)={keypoints_rank}."
)
keypoints_includes_batch = keypoints_rank > 2
keypoints_are_grouped = keypoints_rank == 4
if images is not None:
images_rank = len(images.shape)
if images_rank > 4 or images_rank < 3:
raise ValueError(
"Expected images rank to be 3 or 4, got "
f"len(images.shape)={images_rank}."
)
images_include_batch = images_rank == 4
if keypoints_includes_batch != images_include_batch:
raise ValueError(
"convert_format() expects both `keypoints` and `images` to be batched "
f"or both unbatched. Received len(keypoints.shape)={keypoints_rank}, "
f"len(images.shape)={images_rank}. Expected either "
"len(keypoints.shape)=2 and len(images.shape)=3, or "
"len(keypoints.shape)>=3 and len(images.shape)=4."
)
if not images_include_batch:
images = tf.expand_dims(images, axis=0)
squeeze_axis = []
if not keypoints_includes_batch:
keypoints = tf.expand_dims(keypoints, axis=0)
squeeze_axis.append(0)
if not keypoints_are_grouped:
keypoints = tf.expand_dims(keypoints, axis=1)
squeeze_axis.append(1)
return keypoints, images, squeeze_axis
def _format_outputs(result, squeeze_axis):
if len(squeeze_axis) == 0:
return result
return tf.squeeze(result, axis=squeeze_axis)
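

# Minimal sketch of `convert_format` (not part of the original module): a single
# set of "xy" keypoints on a 100x200 (height x width) image converted to
# "rel_xy"; x is divided by the image width and y by the image height.
if __name__ == "__main__":
    demo_image = tf.zeros([100, 200, 3])
    demo_points = tf.constant([[20.0, 50.0], [180.0, 90.0]])
    print(
        convert_format(demo_points, source="xy", target="rel_xy", images=demo_image)
    )
    # -> [[0.1, 0.5], [0.9, 0.9]]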
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import tensorflow as tf
from absl.testing import parameterized
from keras_cv import keypoint
xy_keypoints = tf.constant(
[[[10, 20], [110, 120], [210, 220]], [[20, 30], [120, 130], [220, 230]]],
dtype=tf.float32,
)
rel_xy_keypoints = tf.constant(
[
[[0.01, 0.04], [0.11, 0.24], [0.21, 0.44]],
[[0.02, 0.06], [0.12, 0.26], [0.22, 0.46]],
],
dtype=tf.float32,
)
images = tf.ones([2, 500, 1000, 3])
keypoints = {
"xy": xy_keypoints,
"rel_xy": rel_xy_keypoints,
}
test_cases = [
(f"{source}_{target}", source, target)
for (source, target) in itertools.permutations(keypoints.keys(), 2)
] + [("xy_xy", "xy", "xy")]
class ConvertersTestCase(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(*test_cases)
def test_converters(self, source, target):
source_keypoints = keypoints[source]
target_keypoints = keypoints[target]
self.assertAllClose(
keypoint.convert_format(
source_keypoints, source=source, target=target, images=images
),
target_keypoints,
)
@parameterized.named_parameters(*test_cases)
def test_converters_unbatched(self, source, target):
source_keypoints = keypoints[source][0]
target_keypoints = keypoints[target][0]
self.assertAllClose(
keypoint.convert_format(
source_keypoints, source=source, target=target, images=images[0]
),
target_keypoints,
)
@parameterized.named_parameters(*test_cases)
def test_converters_ragged_groups(self, source, target):
source_keypoints = keypoints[source]
target_keypoints = keypoints[target]
def create_ragged_group(ins):
res = []
for b, groups in zip(ins, [[1, 2], [0, 3]]):
res.append(tf.RaggedTensor.from_row_lengths(b, groups))
return tf.stack(res, axis=0)
source_keypoints = create_ragged_group(source_keypoints)
target_keypoints = create_ragged_group(target_keypoints)
self.assertAllClose(
keypoint.convert_format(
source_keypoints, source=source, target=target, images=images
),
target_keypoints,
)
@parameterized.named_parameters(*test_cases)
def test_converters_with_metadata(self, source, target):
source_keypoints = keypoints[source]
target_keypoints = keypoints[target]
def add_metadata(ins):
return tf.concat([ins, tf.ones([2, 3, 5])], axis=-1)
source_keypoints = add_metadata(source_keypoints)
target_keypoints = add_metadata(target_keypoints)
self.assertAllClose(
keypoint.convert_format(
source_keypoints, source=source, target=target, images=images
),
target_keypoints,
)
def test_raise_errors_when_missing_shape(self):
with self.assertRaises(ValueError) as e:
keypoint.convert_format(keypoints["xy"], source="xy", target="rel_xy")
self.assertEqual(
str(e.exception),
"convert_format() must receive `images` when transforming "
"between relative and absolute formats. convert_format() "
"received source=`xy`, target=`rel_xy`, but images=None",
)
@parameterized.named_parameters(
(
"keypoint_rank",
tf.ones([2, 3, 4, 2, 1]),
None,
"Expected keypoints rank to be in [2, 4], got len(keypoints.shape)=5.",
),
(
"images_rank",
tf.ones([4, 2]),
tf.ones([35, 35]),
"Expected images rank to be 3 or 4, got len(images.shape)=2.",
),
(
"batch_mismatch",
tf.ones([2, 4, 2]),
tf.ones([35, 35, 3]),
"convert_format() expects both `keypoints` and `images` to be batched or "
"both unbatched. Received len(keypoints.shape)=3, len(images.shape)=3. "
"Expected either len(keypoints.shape)=2 and len(images.shape)=3, or "
"len(keypoints.shape)>=3 and len(images.shape)=4.",
),
)
def test_input_format_exception(self, keypoints, images, expected):
with self.assertRaises(ValueError) as e:
keypoint.convert_format(
keypoints, source="xy", target="rel_xy", images=images
)
self.assertEqual(str(e.exception), expected)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
formats.py contains axis information for each supported format.
"""
class XY:
"""XY contains axis indices for the XY format.
All values in the XY format should be absolute pixel values.
The XY format consists of the following required indices:
- X: the width position
- Y: the height position
and the following optional indices, used in some KerasCV components:
- CLASS: class of the keypoints
- CONFIDENCE: confidence of the keypoints
"""
X = 0
Y = 1
CLASS = 2
CONFIDENCE = 3
class REL_XY:
"""REL_XY contains axis indices for the REL_XY format.
    REL_XY is like XY, but each value is relative to the width and height of the
    original image. Values are fractions of the original image's width and height,
    respectively.
The REL_XY format consists of the following required indices:
- X: the width position
- Y: the height position
and the following optional indices, used in some KerasCV components:
- CLASS: class of the keypoints
- CONFIDENCE: confidence of the keypoints
"""
X = 0
Y = 1
CLASS = 2
CONFIDENCE = 3
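

# Minimal sketch (not part of the original module): the classes above only
# carry axis indices, so they can be used to index the last dimension of a
# keypoint tensor.
if __name__ == "__main__":
    import tensorflow as tf

    demo_keypoints = tf.constant([[10.0, 20.0, 3.0, 0.9]])  # x, y, class, confidence
    print(float(demo_keypoints[0, XY.X]))           # 10.0
    print(float(demo_keypoints[0, XY.CONFIDENCE]))  # 0.9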
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for keypoint transformation."""
import tensorflow as tf
H_AXIS = -3
W_AXIS = -2
def filter_out_of_image(keypoints, image):
"""Discards keypoints if falling outside of the image.
Args:
keypoints: a, possibly ragged, 2D (ungrouped), 3D (grouped)
keypoint data in the 'xy' format.
image: a 3D tensor in the HWC format.
Returns:
tf.RaggedTensor: a 2D or 3D ragged tensor with at least one
ragged rank containing only keypoint in the image.
"""
image_shape = tf.cast(tf.shape(image), keypoints.dtype)
mask = tf.math.logical_and(
tf.math.logical_and(
keypoints[..., 0] >= 0, keypoints[..., 0] < image_shape[W_AXIS]
),
tf.math.logical_and(
keypoints[..., 1] >= 0, keypoints[..., 1] < image_shape[H_AXIS]
),
)
masked = tf.ragged.boolean_mask(keypoints, mask)
if isinstance(masked, tf.RaggedTensor):
return masked
return tf.RaggedTensor.from_tensor(masked)
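

# Minimal sketch (not part of the original module): keypoints outside a 50x50
# image are dropped and a ragged tensor is returned.
if __name__ == "__main__":
    demo_points = tf.constant([[10.0, 20.0], [60.0, 10.0]])
    demo_image = tf.zeros([50, 50, 3])
    print(filter_out_of_image(demo_points, demo_image))  # [[10.0, 20.0]]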
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from absl.testing import parameterized
from keras_cv.keypoint.utils import filter_out_of_image
class UtilsTestCase(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(
(
"all inside",
tf.constant([[10.0, 20.0], [30.0, 40.0], [50.0, 50.0]]),
tf.zeros([100, 100, 3]),
tf.ragged.constant([[10.0, 20.0], [30.0, 40.0], [50.0, 50.0]]),
),
(
"some inside",
tf.constant([[10.0, 20.0], [30.0, 40.0], [50.0, 50.0]]),
tf.zeros([50, 50, 3]),
tf.ragged.constant([[10.0, 20.0], [30.0, 40.0]]),
),
(
"ragged input",
tf.RaggedTensor.from_row_lengths(
[[10.0, 20.0], [30.0, 40.0], [50.0, 50.0]], [2, 1]
),
tf.zeros([50, 50, 3]),
tf.RaggedTensor.from_row_lengths([[10.0, 20.0], [30.0, 40.0]], [2, 0]),
),
(
"height - width confusion",
tf.constant([[[10.0, 20.0]], [[40.0, 30.0]], [[30.0, 40.0]]]),
tf.zeros((50, 40, 3)),
tf.ragged.constant([[[10.0, 20.0]], [], [[30.0, 40.0]]], ragged_rank=1),
),
)
def test_result(self, keypoints, image, expected):
self.assertAllClose(filter_out_of_image(keypoints, image), expected)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorflow.keras.layers import CenterCrop
from tensorflow.keras.layers import RandomBrightness
from tensorflow.keras.layers import RandomContrast
from tensorflow.keras.layers import RandomCrop
from tensorflow.keras.layers import RandomHeight
from tensorflow.keras.layers import RandomRotation
from tensorflow.keras.layers import RandomTranslation
from tensorflow.keras.layers import RandomWidth
from tensorflow.keras.layers import RandomZoom
from tensorflow.keras.layers import Rescaling
from tensorflow.keras.layers import Resizing
from keras_cv.layers.feature_pyramid import FeaturePyramid
from keras_cv.layers.object_detection.anchor_generator import AnchorGenerator
from keras_cv.layers.object_detection.nms_prediction_decoder import NmsPredictionDecoder
from keras_cv.layers.object_detection.non_max_suppression import NonMaxSuppression
from keras_cv.layers.object_detection.retina_net_label_encoder import (
RetinaNetLabelEncoder,
)
from keras_cv.layers.preprocessing.aug_mix import AugMix
from keras_cv.layers.preprocessing.augmenter import Augmenter
from keras_cv.layers.preprocessing.auto_contrast import AutoContrast
from keras_cv.layers.preprocessing.base_image_augmentation_layer import (
BaseImageAugmentationLayer,
)
from keras_cv.layers.preprocessing.channel_shuffle import ChannelShuffle
from keras_cv.layers.preprocessing.cut_mix import CutMix
from keras_cv.layers.preprocessing.equalization import Equalization
from keras_cv.layers.preprocessing.fourier_mix import FourierMix
from keras_cv.layers.preprocessing.grayscale import Grayscale
from keras_cv.layers.preprocessing.grid_mask import GridMask
from keras_cv.layers.preprocessing.maybe_apply import MaybeApply
from keras_cv.layers.preprocessing.mix_up import MixUp
from keras_cv.layers.preprocessing.mosaic import Mosaic
from keras_cv.layers.preprocessing.posterization import Posterization
from keras_cv.layers.preprocessing.rand_augment import RandAugment
from keras_cv.layers.preprocessing.random_augmentation_pipeline import (
RandomAugmentationPipeline,
)
from keras_cv.layers.preprocessing.random_channel_shift import RandomChannelShift
from keras_cv.layers.preprocessing.random_choice import RandomChoice
from keras_cv.layers.preprocessing.random_color_degeneration import (
RandomColorDegeneration,
)
from keras_cv.layers.preprocessing.random_color_jitter import RandomColorJitter
from keras_cv.layers.preprocessing.random_crop_and_resize import RandomCropAndResize
from keras_cv.layers.preprocessing.random_cutout import RandomCutout
from keras_cv.layers.preprocessing.random_flip import RandomFlip
from keras_cv.layers.preprocessing.random_gaussian_blur import RandomGaussianBlur
from keras_cv.layers.preprocessing.random_hue import RandomHue
from keras_cv.layers.preprocessing.random_jpeg_quality import RandomJpegQuality
from keras_cv.layers.preprocessing.random_saturation import RandomSaturation
from keras_cv.layers.preprocessing.random_sharpness import RandomSharpness
from keras_cv.layers.preprocessing.random_shear import RandomShear
from keras_cv.layers.preprocessing.randomly_zoomed_crop import RandomlyZoomedCrop
from keras_cv.layers.preprocessing.solarization import Solarization
from keras_cv.layers.regularization.drop_path import DropPath
from keras_cv.layers.regularization.dropblock_2d import DropBlock2D
from keras_cv.layers.regularization.squeeze_excite import SqueezeAndExcite2D
from keras_cv.layers.regularization.stochastic_depth import StochasticDepth
from keras_cv.layers.spatial_pyramid import SpatialPyramidPooling
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
# TODO(scottzhu): Register it later due to the conflict in the retina_net
# @tf.keras.utils.register_keras_serializable(package="keras_cv")
class FeaturePyramid(tf.keras.layers.Layer):
"""Implements a Feature Pyramid Network.
This implements the paper:
Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan, and
Serge Belongie.
Feature Pyramid Networks for Object Detection.
(https://arxiv.org/pdf/1612.03144)
Feature Pyramid Networks (FPNs) are basic components that are added to an
existing feature extractor (CNN) to combine features at different scales. For the
basic FPN, the inputs are features `Ci` from different levels of a CNN, which is
usually the last block for each level, where the feature is scaled from the image
by a factor of `1/2^i`.
There is an output associated with each level in the basic FPN. The output Pi
at level `i` (corresponding to Ci) is given by performing a merge operation on
the outputs of:
1) a lateral operation on Ci (usually a conv2D layer with kernel = 1 and strides = 1)
2) a top-down upsampling operation from Pi+1 (except for the top most level)
The final output of each level will also have a conv2D operation
(usually with kernel = 3 and strides = 1).
    The inputs to the layer should be a dict with int keys that match the
    pyramid_levels, e.g. for `pyramid_levels` = [2,3,4,5], the expected input dict
    should be `{2:c2, 3:c3, 4:c4, 5:c5}`.
    The output of the layer will have the same structure as the inputs: a dict with
    int keys and one value for each level.
Args:
min_level: a python int for the lowest level of the pyramid for
feature extraction.
max_level: a python int for the highest level of the pyramid for
feature extraction.
num_channels: an integer representing the number of channels for the FPN
operations. Defaults to 256.
        lateral_layers: a python dict with int keys that match the pyramid
            levels. The values of the dict should be `keras.Layer`s, which will be
            called with the feature activation outputs from the backbone at each
            level. Defaults to None, in which case a `keras.Conv2D` layer with a 1x1
            kernel will be created for each pyramid level.
        output_layers: a python dict with int keys that match the pyramid
            levels. The values of the dict should be `keras.Layer`s, which will be
            called with the feature inputs and merged result from upstream levels.
            Defaults to None, in which case a `keras.Conv2D` layer with a 3x3 kernel
            will be created for each pyramid level.
Sample Usage:
```python
inp = tf.keras.layers.Input((384, 384, 3))
backbone = tf.keras.applications.EfficientNetB0(input_tensor=inp, include_top=False)
layer_names = ['block2b_add', 'block3b_add', 'block5c_add', 'top_activation']
backbone_outputs = {}
for i, layer_name in enumerate(layer_names):
backbone_outputs[i+2] = backbone.get_layer(layer_name).output
# output_dict is a dict with 2, 3, 4, 5 as keys
output_dict = keras_cv.layers.FeaturePyramid(min_level=2, max_level=5)(backbone_outputs)
```
"""
def __init__(
self,
min_level,
max_level,
num_channels=256,
lateral_layers=None,
output_layers=None,
**kwargs,
):
super().__init__(**kwargs)
self.min_level = min_level
self.max_level = max_level
self.pyramid_levels = list(range(min_level, max_level + 1))
self.num_channels = num_channels
# required for successful serialization
self.lateral_layers_passed = lateral_layers
self.output_layers_passed = output_layers
if not lateral_layers:
# populate self.lateral_ops with default FPN Conv2D 1X1 layers
self.lateral_layers = {}
for i in self.pyramid_levels:
self.lateral_layers[i] = tf.keras.layers.Conv2D(
self.num_channels,
kernel_size=1,
strides=1,
padding="same",
name=f"lateral_P{i}",
)
else:
self._validate_user_layers(lateral_layers, "lateral_layers")
self.lateral_layers = lateral_layers
# Output conv2d layers.
if not output_layers:
self.output_layers = {}
for i in self.pyramid_levels:
self.output_layers[i] = tf.keras.layers.Conv2D(
self.num_channels,
kernel_size=3,
strides=1,
padding="same",
name=f"output_P{i}",
)
else:
self._validate_user_layers(output_layers, "output_layers")
self.output_layers = output_layers
# the same upsampling layer is used for all levels
self.top_down_op = tf.keras.layers.UpSampling2D(size=2)
# the same merge layer is used for all levels
self.merge_op = tf.keras.layers.Add()
def _validate_user_layers(self, user_input, param_name):
if (
not isinstance(user_input, dict)
or sorted(user_input.keys()) != self.pyramid_levels
):
raise ValueError(
f"Expect {param_name} to be a dict with keys as "
f"{self.pyramid_levels}, got {user_input}"
)
def call(self, features):
        # Note that this assertion might not be true for all subclasses. It is
        # possible to have an FPN with higher levels than the backbone outputs provide.
if (
not isinstance(features, dict)
or sorted(features.keys()) != self.pyramid_levels
):
raise ValueError(
"FeaturePyramid expects input features to be a dict with int keys "
"that match the values provided in pyramid_levels. "
f"Expect feature keys: {self.pyramid_levels}, got: {features}"
)
return self.build_feature_pyramid(features)
def build_feature_pyramid(self, input_features):
# To illustrate the connection/topology, the basic flow for a FPN with level
# 3, 4, 5 is like below:
#
# input_l5 -> conv2d_1x1_l5 ----V---> conv2d_3x3_l5 -> output_l5
# V
# upsample2d
# V
# input_l4 -> conv2d_1x1_l4 -> Add -> conv2d_3x3_l4 -> output_l4
# V
# upsample2d
# V
# input_l3 -> conv2d_1x1_l3 -> Add -> conv2d_3x3_l3 -> output_l3
output_features = {}
reversed_levels = list(sorted(input_features.keys(), reverse=True))
top_level = reversed_levels[0]
for level in reversed_levels:
output = self.lateral_layers[level](input_features[level])
if level < top_level:
                # The topmost output doesn't need to be merged with any upstream
                # output; every other level merges with the upsampled level above it.
upstream_output = self.top_down_op(output_features[level + 1])
output = self.merge_op([output, upstream_output])
output_features[level] = output
        # Apply the output layers afterwards so that we don't leak them to the downstream levels.
for level in reversed_levels:
output_features[level] = self.output_layers[level](output_features[level])
return output_features
def get_config(self):
config = {
"min_level": self.min_level,
"max_level": self.max_level,
"num_channels": self.num_channels,
"lateral_layers": self.lateral_layers_passed,
"output_layers": self.output_layers_passed,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
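

# Minimal sketch (not part of the original module): run the layer on dummy
# backbone features for levels 2-5; each output keeps its input's spatial size
# and has `num_channels` (256 by default) channels.
if __name__ == "__main__":
    fpn = FeaturePyramid(min_level=2, max_level=5)
    features = {
        2: tf.ones([1, 64, 64, 3]),
        3: tf.ones([1, 32, 32, 3]),
        4: tf.ones([1, 16, 16, 3]),
        5: tf.ones([1, 8, 8, 3]),
    }
    outputs = fpn(features)
    print({level: out.shape for level, out in outputs.items()})
    # -> {5: (1, 8, 8, 256), 4: (1, 16, 16, 256), 3: (1, 32, 32, 256), 2: (1, 64, 64, 256)}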
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from keras_cv.layers import FeaturePyramid
class FeaturePyramidTest(tf.test.TestCase):
def test_return_type_dict(self):
layer = FeaturePyramid(min_level=2, max_level=5)
c2 = tf.ones([2, 64, 64, 3])
c3 = tf.ones([2, 32, 32, 3])
c4 = tf.ones([2, 16, 16, 3])
c5 = tf.ones([2, 8, 8, 3])
inputs = {2: c2, 3: c3, 4: c4, 5: c5}
output = layer(inputs)
self.assertTrue(isinstance(output, dict))
self.assertEquals(sorted(output.keys()), [2, 3, 4, 5])
def test_result_shapes(self):
layer = FeaturePyramid(min_level=2, max_level=5)
c2 = tf.ones([2, 64, 64, 3])
c3 = tf.ones([2, 32, 32, 3])
c4 = tf.ones([2, 16, 16, 3])
c5 = tf.ones([2, 8, 8, 3])
inputs = {2: c2, 3: c3, 4: c4, 5: c5}
output = layer(inputs)
for level in inputs.keys():
self.assertEquals(output[level].shape[1], inputs[level].shape[1])
self.assertEquals(output[level].shape[2], inputs[level].shape[2])
self.assertEquals(output[level].shape[3], layer.num_channels)
# Test with different resolution and channel size
c2 = tf.ones([2, 64, 128, 4])
c3 = tf.ones([2, 32, 64, 8])
c4 = tf.ones([2, 16, 32, 16])
c5 = tf.ones([2, 8, 16, 32])
inputs = {2: c2, 3: c3, 4: c4, 5: c5}
layer = FeaturePyramid(min_level=2, max_level=5)
output = layer(inputs)
for level in inputs.keys():
self.assertEquals(output[level].shape[1], inputs[level].shape[1])
self.assertEquals(output[level].shape[2], inputs[level].shape[2])
self.assertEquals(output[level].shape[3], layer.num_channels)
def test_with_keras_input_tensor(self):
# This mimic the model building with Backbone network
layer = FeaturePyramid(min_level=2, max_level=5)
c2 = tf.keras.layers.Input([64, 64, 3])
c3 = tf.keras.layers.Input([32, 32, 3])
c4 = tf.keras.layers.Input([16, 16, 3])
c5 = tf.keras.layers.Input([8, 8, 3])
inputs = {2: c2, 3: c3, 4: c4, 5: c5}
output = layer(inputs)
for level in inputs.keys():
self.assertEquals(output[level].shape[1], inputs[level].shape[1])
self.assertEquals(output[level].shape[2], inputs[level].shape[2])
self.assertEquals(output[level].shape[3], layer.num_channels)
def test_invalid_lateral_layers(self):
lateral_layers = [tf.keras.layers.Conv2D(256, 1)] * 3
with self.assertRaisesRegexp(ValueError, "Expect lateral_layers to be a dict"):
_ = FeaturePyramid(min_level=2, max_level=5, lateral_layers=lateral_layers)
lateral_layers = {
2: tf.keras.layers.Conv2D(256, 1),
3: tf.keras.layers.Conv2D(256, 1),
4: tf.keras.layers.Conv2D(256, 1),
}
with self.assertRaisesRegexp(ValueError, "with keys as .* [2, 3, 4, 5]"):
_ = FeaturePyramid(min_level=2, max_level=5, lateral_layers=lateral_layers)
def test_invalid_output_layers(self):
output_layers = [tf.keras.layers.Conv2D(256, 3)] * 3
with self.assertRaisesRegexp(ValueError, "Expect output_layers to be a dict"):
_ = FeaturePyramid(min_level=2, max_level=5, output_layers=output_layers)
output_layers = {
2: tf.keras.layers.Conv2D(256, 3),
3: tf.keras.layers.Conv2D(256, 3),
4: tf.keras.layers.Conv2D(256, 3),
}
with self.assertRaisesRegexp(ValueError, "with keys as .* [2, 3, 4, 5]"):
_ = FeaturePyramid(min_level=2, max_level=5, output_layers=output_layers)
def test_invalid_input_features(self):
layer = FeaturePyramid(min_level=2, max_level=5)
c2 = tf.ones([2, 64, 64, 3])
c3 = tf.ones([2, 32, 32, 3])
c4 = tf.ones([2, 16, 16, 3])
c5 = tf.ones([2, 8, 8, 3])
list_input = [c2, c3, c4, c5]
with self.assertRaisesRegexp(ValueError, "expects input features to be a dict"):
layer(list_input)
dict_input_with_missing_feature = {2: c2, 3: c3, 4: c4}
with self.assertRaisesRegexp(ValueError, "Expect feature keys.*[2, 3, 4, 5]"):
layer(dict_input_with_missing_feature)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from tensorflow import keras
from keras_cv import bounding_box
class AnchorGenerator(keras.layers.Layer):
"""AnchorGenerator generates anchors for multiple feature maps.
AnchorGenerator takes multiple scales and generates anchor boxes based on the anchor
sizes, scales, aspect ratios, and strides provided. To invoke AnchorGenerator, call
it on the image that needs anchor boxes.
    `sizes` and `strides` must match structurally - they are pairs. Scales and
    aspect ratios can either be a list, which is then used for all of the sizes
    (aka levels), or a dictionary of the form `{'level_{number}': [parameters at scale...]}`.
Args:
bounding_box_format: The format of bounding boxes to generate. Refer
[to the keras.io docs](https://keras.io/api/keras_cv/bounding_box/formats/)
for more details on supported bounding box formats.
sizes: A list of integers that represent the anchor sizes for each level,
or a dictionary of integer lists with each key representing a level.
For each anchor size, anchor height will be `anchor_size / sqrt(aspect_ratio)`,
and anchor width will be `anchor_size * sqrt(aspect_ratio)`. This is repeated
for each scale and aspect ratio.
scales: A list of floats corresponding to multipliers that will be
multiplied by each `anchor_size` to generate a level.
aspect_ratios: A list of floats representing the ratio of anchor width to height.
strides: iterable of ints that represent the anchor stride size between
center of anchors at each scale.
clip_boxes: Whether or not to clip generated anchor boxes to the image size.
Defaults to `False`.
Usage:
```python
strides = [8, 16, 32]
scales = [1, 1.2599210498948732, 1.5874010519681994]
sizes = [32.0, 64.0, 128.0]
aspect_ratios = [0.5, 1.0, 2.0]
image = tf.random.uniform((512, 512, 3))
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="rel_yxyx",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
clip_boxes=True,
)
anchors = anchor_generator(image)
print(anchors)
# > {0: ..., 1: ..., 2: ...}
```
Input shape: an image with shape `[H, W, C]`
Output: a dictionary with integer keys corresponding to each level of the feature
pyramid. The size of the anchors at each level will be
`(H/strides[i] * W/strides[i] * len(scales) * len(aspect_ratios), 4)`.
"""
def __init__(
self,
bounding_box_format,
sizes,
scales,
aspect_ratios,
strides,
clip_boxes=False,
**kwargs,
):
super().__init__(**kwargs)
self.bounding_box_format = bounding_box_format
# aspect_ratio is a single list that is the same across all levels.
sizes, strides = self._format_sizes_and_strides(sizes, strides)
aspect_ratios = self._match_param_structure_to_sizes(aspect_ratios, sizes)
scales = self._match_param_structure_to_sizes(scales, sizes)
self.anchor_generators = {}
for k in sizes.keys():
self.anchor_generators[k] = _SingleAnchorGenerator(
bounding_box_format,
sizes[k],
scales[k],
aspect_ratios[k],
strides[k],
clip_boxes,
dtype=self.compute_dtype,
)
self.built = True
@staticmethod
def _format_sizes_and_strides(sizes, strides):
result_sizes = AnchorGenerator._ensure_param_is_levels_dict(sizes, "sizes")
result_strides = AnchorGenerator._ensure_param_is_levels_dict(
strides, "strides"
)
if sorted(result_strides.keys()) != sorted(result_sizes.keys()):
            raise ValueError(
                "Expected sizes and strides to be either lists of "
                "the same length, or dictionaries with the same keys. Received "
                f"sizes={sizes}, strides={strides}"
            )
return result_sizes, result_strides
@staticmethod
def _ensure_param_is_levels_dict(param, param_name):
"""Takes a param and its name, converts lists to dictionaries of levels.
For example, the list [1, 2] is converted to {0: 1, 1: 2}.
Raises:
ValueError: when param is not a dict, list or tuple.
"""
if isinstance(param, dict):
return param
if not isinstance(param, (list, tuple)):
raise ValueError(
f"Expected {param_name} to be a dict, list or tuple, received "
f"{param_name}={param}"
)
result = {}
for i in range(len(param)):
result[i] = param[i]
return result
@staticmethod
    def _match_param_structure_to_sizes(params, sizes):
        """Broadcast `params` to match the structure of `sizes`."""
if not isinstance(sizes, dict):
raise ValueError(
"the structure of `sizes` must be a dict, " f"received sizes={sizes}"
)
return tf.nest.map_structure(lambda _: params, sizes)
def __call__(self, image=None, image_shape=None):
if image is None and image_shape is None:
raise ValueError("AnchorGenerator() requires `images` or `image_shape`.")
if image is not None:
if image.shape.rank != 3:
raise ValueError(
"Expected `image` to be a Tensor of rank 3. Got "
f"image.shape.rank={image.shape.rank}"
)
image_shape = tf.shape(image)
anchor_generators = tf.nest.flatten(self.anchor_generators)
results = [anchor_gen(image_shape) for anchor_gen in anchor_generators]
results = tf.nest.pack_sequence_as(self.anchor_generators, results)
for key in results:
results[key] = bounding_box.convert_format(
results[key],
source="yxyx",
target=self.bounding_box_format,
image_shape=image_shape,
)
return results
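# Illustrative sketch, not part of the layer: a quick check of the per-level
# anchor count stated in the docstring,
# (H / strides[i]) * (W / strides[i]) * len(scales) * len(aspect_ratios).
# The parameter values below are made up for the example.
_example_generator = AnchorGenerator(
    bounding_box_format="yxyx",
    sizes=[32.0, 64.0, 128.0],
    scales=[1.0, 2.0],
    aspect_ratios=[0.5, 1.0, 2.0],
    strides=[8, 16, 32],
)
_example_anchors = _example_generator(image_shape=(512, 512, 3))
# Level 0 uses stride 8, so it contains (512 / 8) * (512 / 8) * 2 * 3 = 24576
# anchors, i.e. _example_anchors[0].shape == (24576, 4).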
# TODO(tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
"""Internal utility to generate anchors for a single feature map in `yxyx` format.
Example:
```python
    anchor_gen = _SingleAnchorGenerator(
        bounding_box_format="yxyx",
        sizes=32,
        scales=[1.0],
        aspect_ratios=[0.5, 1.0, 2.0],
        stride=16,
    )
anchors = anchor_gen([512, 512, 3])
```
Input shape: the size of the image, `[H, W, C]`
Output shape: the size of anchors, `[(H / stride) * (W / stride), 4]`
Args:
        sizes: A single int representing the base anchor size. The anchor
            height will be `anchor_size / sqrt(aspect_ratio)`, and the anchor
            width will be `anchor_size * sqrt(aspect_ratio)`.
        scales: A list/tuple of positive floats representing the multipliers
            applied to the base `anchor_size` to produce the actual anchor sizes.
        aspect_ratios: a list/tuple of positive floats representing the ratio of
            anchor width to anchor height.
        stride: A single int representing the stride between the centers of
            adjacent anchors.
clip_boxes: Boolean to represent whether the anchor coordinates should be
clipped to the image size. Defaults to `False`.
dtype: (Optional) The data type to use for the output anchors. Defaults to
'float32'.
"""
def __init__(
self,
bounding_box_format,
sizes,
scales,
aspect_ratios,
stride,
clip_boxes=False,
dtype="float32",
):
self.sizes = sizes
self.scales = scales
self.aspect_ratios = aspect_ratios
self.stride = stride
self.clip_boxes = clip_boxes
self.dtype = dtype
def __call__(self, image_size):
image_height = tf.cast(image_size[0], tf.float32)
image_width = tf.cast(image_size[1], tf.float32)
aspect_ratios = tf.cast(self.aspect_ratios, tf.float32)
aspect_ratios_sqrt = tf.cast(tf.sqrt(aspect_ratios), dtype=tf.float32)
anchor_size = tf.cast(self.sizes, tf.float32)
# [K]
anchor_heights = []
anchor_widths = []
for scale in self.scales:
anchor_size_t = anchor_size * scale
anchor_height = anchor_size_t / aspect_ratios_sqrt
anchor_width = anchor_size_t * aspect_ratios_sqrt
anchor_heights.append(anchor_height)
anchor_widths.append(anchor_width)
anchor_heights = tf.concat(anchor_heights, axis=0)
anchor_widths = tf.concat(anchor_widths, axis=0)
half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, -1])
half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, -1])
stride = tf.cast(self.stride, tf.float32)
# [W]
cx = tf.range(0.5 * stride, image_width + 1, stride)
# [H]
cy = tf.range(0.5 * stride, image_height + 1, stride)
# [H, W]
cx_grid, cy_grid = tf.meshgrid(cx, cy)
# [H, W, 1]
cx_grid = tf.expand_dims(cx_grid, axis=-1)
cy_grid = tf.expand_dims(cy_grid, axis=-1)
y_min = tf.reshape(cy_grid - half_anchor_heights, (-1,))
y_max = tf.reshape(cy_grid + half_anchor_heights, (-1,))
x_min = tf.reshape(cx_grid - half_anchor_widths, (-1,))
x_max = tf.reshape(cx_grid + half_anchor_widths, (-1,))
# [H * W * K, 1]
y_min = tf.expand_dims(y_min, axis=-1)
y_max = tf.expand_dims(y_max, axis=-1)
x_min = tf.expand_dims(x_min, axis=-1)
x_max = tf.expand_dims(x_max, axis=-1)
if self.clip_boxes:
y_min = tf.maximum(tf.minimum(y_min, image_height), 0.0)
y_max = tf.maximum(tf.minimum(y_max, image_height), 0.0)
x_min = tf.maximum(tf.minimum(x_min, image_width), 0.0)
x_max = tf.maximum(tf.minimum(x_max, image_width), 0.0)
# [H * W * K, 4]
return tf.cast(tf.concat([y_min, x_min, y_max, x_max], axis=-1), self.dtype)
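# Illustrative sketch, not part of the module: a hand-checkable case of the grid
# construction above. With stride=4, sizes=4, scales=[1.0] and
# aspect_ratios=[1.0] on an 8x8 image, anchor centers land at (2, 2), (2, 6),
# (6, 2) and (6, 6), and each center gets one 4x4 box in `yxyx` order.
_single_gen = _SingleAnchorGenerator(
    bounding_box_format="yxyx",
    sizes=4,
    scales=[1.0],
    aspect_ratios=[1.0],
    stride=4,
)
_single_anchors = _single_gen([8, 8, 3])
# _single_anchors is a [4, 4] tensor:
# [[0., 0., 4., 4.],
#  [0., 4., 4., 8.],
#  [4., 0., 8., 4.],
#  [4., 4., 8., 8.]]
# which matches test_hand_crafted_strides in the anchor generator tests below
# (there reported in "xyxy" after format conversion).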
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from absl.testing import parameterized
from keras_cv import layers as cv_layers
class AnchorGeneratorTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(
("unequal_lists", [0, 1, 2], [1]),
("unequal_levels_dicts", {"level_1": [0, 1, 2]}, {"1": [0, 1, 2]}),
)
def test_raises_when_strides_not_equal_to_sizes(self, sizes, strides):
with self.assertRaises(ValueError):
cv_layers.AnchorGenerator(
bounding_box_format="xyxy",
sizes=sizes,
strides=strides,
aspect_ratios=[3 / 4, 1, 4 / 3],
scales=[0.5, 1.0, 1.5],
)
def test_raises_batched_images(self):
strides = [4]
scales = [1.0]
sizes = [4]
aspect_ratios = [1.0]
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="xyxy",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
)
image = tf.random.uniform((4, 8, 8, 3))
with self.assertRaisesRegex(ValueError, "rank"):
_ = anchor_generator(image=image)
def test_output_shapes_image(self):
strides = [2**i for i in range(3, 8)]
scales = [2**x for x in [0, 1 / 3, 2 / 3]]
sizes = [x**2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]]
aspect_ratios = [0.5, 1.0, 2.0]
image_shape = (512, 512, 3)
image = tf.random.uniform(image_shape)
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="yxyx",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
)
boxes = anchor_generator(image=image)
boxes = tf.concat(list(boxes.values()), axis=0)
# 49104 is a number found by using the previous internal anchor generator from
# PR https://github.com/keras-team/keras-cv/pull/609
# This unit test was written to ensure compatibility with the existing model.
self.assertEqual(boxes.shape, [49104, 4])
def test_output_shapes_image_shape(self):
strides = [2**i for i in range(3, 8)]
scales = [2**x for x in [0, 1 / 3, 2 / 3]]
sizes = [x**2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]]
aspect_ratios = [0.5, 1.0, 2.0]
image_shape = (512, 512, 3)
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="yxyx",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
)
boxes = anchor_generator(image_shape=image_shape)
boxes = tf.concat(list(boxes.values()), axis=0)
# 49104 is a number found by using the previous internal anchor generator from
# PR https://github.com/keras-team/keras-cv/pull/609
# This unit test was written to ensure compatibility with the existing model.
self.assertEqual(boxes.shape, [49104, 4])
def test_hand_crafted_aspect_ratios(self):
strides = [4]
scales = [1.0]
sizes = [4]
aspect_ratios = [3 / 4, 1.0, 4 / 3]
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="xyxy",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
)
image = tf.random.uniform((8, 8, 3))
boxes = anchor_generator(image=image)
level_0 = boxes[0]
        # width/4 * height/4 * len(aspect_ratios) = 2 * 2 * 3 = 12 anchors
self.assertAllEqual(level_0.shape, [12, 4])
image = tf.random.uniform((4, 4, 3))
boxes = anchor_generator(image=image)
level_0 = boxes[0]
expected_boxes = [
[0.267949224, -0.309401035, 3.7320509, 4.30940104],
[0, 0, 4, 4],
[-0.309401035, 0.267949104, 4.30940104, 3.7320509],
]
self.assertAllClose(level_0, expected_boxes)
def test_hand_crafted_strides(self):
strides = [4]
scales = [1.0]
sizes = [4]
aspect_ratios = [1.0]
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="xyxy",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
)
image = tf.random.uniform((8, 8, 3))
boxes = anchor_generator(image=image)
level_0 = boxes[0]
expected_boxes = [
[0, 0, 4, 4],
[4, 0, 8, 4],
[0, 4, 4, 8],
[4, 4, 8, 8],
]
self.assertAllClose(level_0, expected_boxes)
def test_relative_generation(self):
strides = [8, 16, 32]
# 0, 1 / 3, 2 / 3
scales = [2**x for x in [0, 1 / 3, 2 / 3]]
sizes = [32.0, 64.0, 128.0]
aspect_ratios = [0.5, 1.0, 2.0]
image = tf.random.uniform((512, 512, 3))
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="rel_yxyx",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
clip_boxes=False,
)
boxes = anchor_generator(image=image)
boxes = tf.concat(list(boxes.values()), axis=0)
self.assertAllLessEqual(boxes, 1.5)
self.assertAllGreaterEqual(boxes, -0.50)
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from keras_cv import bounding_box
from keras_cv import layers as cv_layers
class NmsPredictionDecoder(tf.keras.layers.Layer):
"""A Keras layer that decodes predictions of an object detection model.
By default, NmsPredictionDecoder uses a
`keras_cv.layers.NonMaxSuppression` layer to perform box pruning. The layer may
optionally take a `suppression_layer`, which can perform an alternative suppression
operation, such as SoftNonMaxSuppression.
Arguments:
classes: Number of classes in the dataset.
bounding_box_format: The format of bounding boxes of input dataset. Refer
[to the keras.io docs](https://keras.io/api/keras_cv/bounding_box/formats/)
for more details on supported bounding box formats.
anchor_generator: a `keras_cv.layers.AnchorGenerator`.
suppression_layer: (Optional) a `keras.layers.Layer` that follows the same API
signature of the `keras_cv.layers.NonMaxSuppression` layer. This layer should
perform a suppression operation such as NonMaxSuppression, or
SoftNonMaxSuppression.
box_variance: (Optional) The scaling factors used to scale the bounding box
targets. Defaults to `(0.1, 0.1, 0.2, 0.2)`. **Important Note:**
`box_variance` is applied to the boxes in `xywh` format.
"""
def __init__(
self,
bounding_box_format,
anchor_generator,
classes=None,
suppression_layer=None,
box_variance=(0.1, 0.1, 0.2, 0.2),
**kwargs,
):
super().__init__(**kwargs)
if not suppression_layer and not classes:
raise ValueError(
"NmsPredictionDecoder() requires either `suppression_layer` "
f"or `classes`. Received `suppression_layer={suppression_layer} and "
f"classes={classes}`"
)
self.bounding_box_format = bounding_box_format
self.suppression_layer = suppression_layer or cv_layers.NonMaxSuppression(
classes=classes,
bounding_box_format=bounding_box_format,
confidence_threshold=0.5,
iou_threshold=0.5,
max_detections=100,
max_detections_per_class=100,
)
if self.suppression_layer.bounding_box_format != self.bounding_box_format:
raise ValueError(
"`suppression_layer` must have the same `bounding_box_format` "
"as the `NmsPredictionDecoder()` layer. "
"Received `NmsPredictionDecoder.bounding_box_format="
f"{self.bounding_box_format}`, `suppression_layer={suppression_layer}`."
)
self.anchor_generator = anchor_generator
self.box_variance = tf.convert_to_tensor(box_variance, dtype=tf.float32)
self.built = True
# TODO(lukewood): provide this as general utility on top of bounding_box_format.
def _decode_box_predictions(self, anchor_boxes, box_predictions):
boxes = box_predictions * self.box_variance
boxes = tf.concat(
[
boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2],
tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:],
],
axis=-1,
)
return boxes
def call(self, images, predictions):
"""Accepts images and raw predictions, and returns bounding box predictions.
Args:
images: Tensor of shape [batch, height, width, channels].
            predictions: Dense Tensor of shape [batch, num_anchor_boxes, 4 + classes],
                where the first 4 values of each row are box coordinates in the
                `bounding_box_format` specified in the constructor.
"""
if isinstance(images, tf.RaggedTensor):
            raise ValueError(
                "NmsPredictionDecoder() does not support tf.RaggedTensor inputs. "
                f"Received images={images}."
            )
anchor_boxes = self.anchor_generator(images[0])
anchor_boxes = tf.concat(list(anchor_boxes.values()), axis=0)
anchor_boxes = bounding_box.convert_format(
anchor_boxes,
source=self.anchor_generator.bounding_box_format,
target="xywh",
images=images[0],
)
predictions = bounding_box.convert_format(
predictions, source=self.bounding_box_format, target="xywh", images=images
)
box_predictions = predictions[:, :, :4]
cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:])
classes = tf.math.argmax(cls_predictions, axis=-1)
classes = tf.cast(classes, box_predictions.dtype)
confidence = tf.math.reduce_max(cls_predictions, axis=-1)
classes = tf.expand_dims(classes, axis=-1)
confidence = tf.expand_dims(confidence, axis=-1)
boxes = self._decode_box_predictions(anchor_boxes[None, ...], box_predictions)
boxes = tf.concat([boxes, classes, confidence], axis=-1)
boxes = bounding_box.convert_format(
boxes,
source="xywh",
target=self.suppression_layer.bounding_box_format,
images=images,
)
return self.suppression_layer(boxes, images=images)
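# Illustrative sketch, not part of the layer: the delta decoding performed by
# `_decode_box_predictions` above. Raw network outputs are scaled by
# `box_variance`, center offsets are expressed in units of the anchor's
# width/height, and sizes are decoded through `exp`. The numbers are made up.
_anchor_boxes = tf.constant([[[50.0, 50.0, 20.0, 20.0]]])  # xywh, shape [1, 1, 4]
_raw_preds = tf.constant([[[1.0, 2.0, 0.0, 0.0]]])  # network deltas
_box_variance = tf.constant([0.1, 0.1, 0.2, 0.2])
_deltas = _raw_preds * _box_variance
_decoded = tf.concat(
    [
        _deltas[:, :, :2] * _anchor_boxes[:, :, 2:] + _anchor_boxes[:, :, :2],
        tf.math.exp(_deltas[:, :, 2:]) * _anchor_boxes[:, :, 2:],
    ],
    axis=-1,
)
# _decoded == [[[52., 54., 20., 20.]]]: the center moves by (0.1 * 20, 0.2 * 20)
# and the width/height stay at exp(0) * 20 = 20.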
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from keras_cv import layers as cv_layers
class NmsPredictionDecoderTest(tf.test.TestCase):
def test_decode_predictions_output_shapes(self):
classes = 10
images_shape = (8, 512, 1024, 3)
predictions_shape = (8, 98208, 4 + classes)
images = tf.random.uniform(shape=images_shape)
predictions = tf.random.uniform(
shape=predictions_shape, minval=0.0, maxval=1.0, dtype=tf.float32
)
strides = [2**i for i in range(3, 8)]
scales = [2**x for x in [0, 1 / 3, 2 / 3]]
sizes = [x**2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]]
aspect_ratios = [0.5, 1.0, 2.0]
anchor_generator = cv_layers.AnchorGenerator(
bounding_box_format="yxyx",
sizes=sizes,
aspect_ratios=aspect_ratios,
scales=scales,
strides=strides,
)
layer = cv_layers.NmsPredictionDecoder(
anchor_generator=anchor_generator,
classes=classes,
bounding_box_format="rel_xyxy",
)
result = layer(images=images, predictions=predictions)
self.assertEqual(result.shape, [8, None, 6])
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from keras_cv import bounding_box
@tf.keras.utils.register_keras_serializable(package="keras_cv")
class NonMaxSuppression(tf.keras.layers.Layer):
"""
Implements the non-max suppression layer.
Non-maximal suppression is used to suppress potentially repeated boxes by:
1) picking the highest ranked boxes
2) pruning away all boxes that have a high IoU with the chosen boxes.
References:
- [Yolo paper](https://arxiv.org/pdf/1506.02640)
Args:
classes: an integer representing the number of classes that a bounding
box can belong to.
bounding_box_format: a case-insensitive string which is one of `"xyxy"`,
`"rel_xyxy"`, `"xyWH"`, `"center_xyWH"`, `"yxyx"`, `"rel_yxyx"`. The
position and shape of the bounding box will be followed by the class and
confidence values (in that order). This is required for proper ranking of
the bounding boxes. Therefore, each bounding box is defined by 6 values.
For detailed information on the supported format, see the
[KerasCV bounding box documentation](https://keras.io/api/keras_cv/bounding_box/formats/).
confidence_threshold: a float value in the range [0, 1]. All boxes with
confidence below this value will be discarded. Defaults to 0.05.
iou_threshold: a float value in the range [0, 1] representing the minimum
IoU threshold for two boxes to be considered same for suppression. Defaults
to 0.5.
max_detections: the maximum detections to consider after nms is applied. A large
number may trigger significant memory overhead. Defaults to 100.
max_detections_per_class: the maximum detections to consider per class after
nms is applied. Defaults to 100.
Usage:
```python
images = np.zeros((2, 480, 480, 3), dtype = np.float32)
ex_boxes = np.array([
[
[0, 0, 1, 1, 4, 0.9],
[0, 0, 2, 3, 4, 0.76],
[4, 5, 3, 6, 3, 0.89],
[2, 2, 3, 3, 6, 0.04],
],
[
[0, 0, 5, 6, 4, 0.9],
[0, 0, 7, 3, 1, 0.76],
[4, 5, 5, 6, 4, 0.04],
[2, 1, 3, 3, 7, 0.48],
],
], dtype = np.float32)
nms = NonMaxSuppression(
classes=8,
bounding_box_format="center_xyWH",
iou_threshold=0.1
)
    boxes = nms(ex_boxes, images)
```
"""
def __init__(
self,
classes,
bounding_box_format,
confidence_threshold=0.05,
iou_threshold=0.5,
max_detections=100,
max_detections_per_class=100,
**kwargs,
):
super().__init__(**kwargs)
self.classes = classes
self.bounding_box_format = bounding_box_format
self.confidence_threshold = confidence_threshold
self.iou_threshold = iou_threshold
self.max_detections = max_detections
self.max_detections_per_class = max_detections_per_class
self.built = True
def call(self, predictions, images=None):
if predictions.shape[-1] != 6:
raise ValueError(
"keras_cv.layers.NonMaxSuppression() expects `call()` "
"argument `predictions` to be of shape (None, None, 6). Received "
f"predictions.shape={tuple(predictions.shape)}."
)
# convert to yxyx for the TF NMS operation
predictions = bounding_box.convert_format(
predictions,
source=self.bounding_box_format,
target="yxyx",
images=images,
)
# preparing the predictions for TF NMS op
boxes = tf.expand_dims(predictions[..., :4], axis=2)
class_predictions = tf.cast(predictions[..., 4], tf.int32)
scores = predictions[..., 5]
class_predictions = tf.one_hot(class_predictions, self.classes)
scores = tf.expand_dims(scores, axis=-1) * class_predictions
# applying the NMS operation
nmsed_boxes = tf.image.combined_non_max_suppression(
boxes,
scores,
self.max_detections_per_class,
self.max_detections,
self.iou_threshold,
self.confidence_threshold,
clip_boxes=False,
)
# output will be a ragged tensor because num_boxes will change across the batch
boxes = self._decode_nms_boxes_to_tensor(nmsed_boxes)
# converting all boxes to the original format
boxes = self._encode_to_ragged(boxes, nmsed_boxes.valid_detections)
return bounding_box.convert_format(
boxes,
source="yxyx",
target=self.bounding_box_format,
images=images,
)
def _decode_nms_boxes_to_tensor(self, nmsed_boxes):
boxes = tf.TensorArray(
tf.float32, size=0, infer_shape=False, element_shape=(6,), dynamic_size=True
)
for i in tf.range(tf.shape(nmsed_boxes.nmsed_boxes)[0]):
num_detections = nmsed_boxes.valid_detections[i]
# recombining with classes and scores
boxes_recombined = tf.concat(
[
nmsed_boxes.nmsed_boxes[i][:num_detections],
tf.expand_dims(
nmsed_boxes.nmsed_classes[i][:num_detections], axis=-1
),
tf.expand_dims(
nmsed_boxes.nmsed_scores[i][:num_detections], axis=-1
),
],
axis=-1,
)
# iterate through the boxes and append it to TensorArray
for j in range(nmsed_boxes.valid_detections[i]):
boxes = boxes.write(boxes.size(), boxes_recombined[j])
# stacking to create a tensor
return boxes.stack()
def _encode_to_ragged(self, boxes, valid_detections):
# using cumulative sum to calculate row_limits for ragged tensor
row_limits = tf.cumsum(valid_detections)
# creating the output RaggedTensor by splitting boxes at row_limits
result = tf.RaggedTensor.from_row_limits(values=boxes, row_limits=row_limits)
return result
def get_config(self):
config = {
"classes": self.classes,
"bounding_box_format": self.bounding_box_format,
"confidence_threshold": self.confidence_threshold,
"iou_threshold": self.iou_threshold,
"max_detections": self.max_detections,
"max_detections_per_class": self.max_detections_per_class,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
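# Illustrative sketch, not part of the layer: NonMaxSuppression consumes dense
# predictions of shape [batch, num_boxes, 6] (4 box values, class, confidence)
# and returns a tf.RaggedTensor because the number of surviving boxes varies per
# image. The boxes below are made up; "xyxy" is just one supported format.
import numpy as np

_images = np.zeros((1, 480, 480, 3), dtype=np.float32)
_predictions = np.array(
    [
        [
            [10, 10, 50, 50, 2, 0.95],  # kept: highest score for class 2
            [12, 12, 52, 52, 2, 0.60],  # suppressed: high IoU with the box above
            [200, 200, 260, 260, 5, 0.80],  # kept: different location and class
        ]
    ],
    dtype=np.float32,
)
_nms = NonMaxSuppression(classes=8, bounding_box_format="xyxy", iou_threshold=0.5)
_pruned = _nms(_predictions, images=_images)
# _pruned is a RaggedTensor; for this image it should keep two rows of 6 values.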