Commit cc748b2a authored by Abdullah Rashwan, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 329754787
parent 2f788e1d
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The generic parser interface."""
import abc
class Parser(metaclass=abc.ABCMeta):
  """Parses data and produces tensors to be consumed by models."""
@abc.abstractmethod
def _parse_train_data(self, decoded_tensors):
"""Generates images and labels that are usable for model training.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
@abc.abstractmethod
def _parse_eval_data(self, decoded_tensors):
"""Generates images and labels that are usable for model evaluation.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
def parse_fn(self, is_training):
"""Returns a parse fn that reads and parses raw tensors from the decoder.
Args:
is_training: a `bool` to indicate whether it is in training mode.
Returns:
      parse: a `callable` that takes the decoded tensors and generates the
        (images, labels) tuple, where labels is a dict of Tensors that
        contains labels.
"""
def parse(decoded_tensors):
"""Parses the serialized example data."""
if is_training:
return self._parse_train_data(decoded_tensors)
else:
return self._parse_eval_data(decoded_tensors)
return parse
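
For illustration only, a minimal sketch of a concrete subclass and how its `parse_fn` plugs into a `tf.data` pipeline; `PassthroughParser` and the tensor keys below are hypothetical, not part of this change.

class PassthroughParser(Parser):
  """A hypothetical parser that forwards decoded tensors unchanged."""

  def _parse_train_data(self, decoded_tensors):
    # A real parser would augment the image and build training targets here.
    return decoded_tensors['image'], {'classes': decoded_tensors['classes']}

  def _parse_eval_data(self, decoded_tensors):
    return decoded_tensors['image'], {'classes': decoded_tensors['classes']}

# Usage: `dataset` elements are dicts of decoded tensors from a decoder.
# dataset = dataset.map(PassthroughParser().parse_fn(is_training=True))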
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data parser and processing for RetinaNet.
Parses images and ground truths in a dataset into training targets and packages
them into (image, labels) tuples for RetinaNet.
"""
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import anchor
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
match_threshold=0.5,
unmatched_threshold=0.5,
aug_rand_hflip=False,
aug_scale_min=1.0,
aug_scale_max=1.0,
use_autoaugment=False,
autoaugment_policy_name='v0',
skip_crowd_during_training=True,
max_num_instances=100,
dtype='bfloat16',
mode=None):
"""Initializes parameters for parsing annotations in the dataset.
Args:
      output_size: `Tensor` or `list` for [height, width] of the output image.
        The output_size should be divisible by the largest feature stride
        2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio of
        anchors added on each level. Each number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of the base anchor
        size relative to the feature stride 2^level.
match_threshold: `float` number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
during training.
autoaugment_policy_name: `string` that specifies the name of the
AutoAugment policy that will be used during training.
      skip_crowd_during_training: `bool`, if True, skip annotations whose
        `is_crowd` equals 1 during training.
      max_num_instances: `int`, the maximum number of instances in an image.
        The groundtruth data will be padded to `max_num_instances`.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with groundtruths in the outputs.
"""
self._mode = mode
self._max_num_instances = max_num_instances
self._skip_crowd_during_training = skip_crowd_during_training
# Anchor.
self._output_size = output_size
self._min_level = min_level
self._max_level = max_level
self._num_scales = num_scales
self._aspect_ratios = aspect_ratios
self._anchor_size = anchor_size
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
self._aug_scale_min = aug_scale_min
self._aug_scale_max = aug_scale_max
# Data Augmentation with AutoAugment.
self._use_autoaugment = use_autoaugment
self._autoaugment_policy_name = autoaugment_policy_name
# Device.
    self._use_bfloat16 = (dtype == 'bfloat16')
def _parse_train_data(self, data):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
is_crowds = data['groundtruth_is_crowd']
# Skips annotations with `is_crowd` = True.
if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
classes = tf.gather(classes, indices)
boxes = tf.gather(boxes, indices)
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Flips image randomly during training.
if self._aug_rand_hflip:
image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes,
tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
# If bfloat16 is used, casts input image to tf.bfloat16.
if self._use_bfloat16:
image = tf.cast(image, dtype=tf.bfloat16)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
}
return image, labels
def _parse_eval_data(self, data):
"""Parses data for training and evaluation."""
groundtruths = {}
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=1.0,
aug_scale_max=1.0)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes,
tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
# If bfloat16 is used, casts input image to tf.bfloat16.
if self._use_bfloat16:
image = tf.cast(image, dtype=tf.bfloat16)
# Sets up groundtruth data for evaluation.
groundtruths = {
'source_id': data['source_id'],
'height': data['height'],
'width': data['width'],
'num_detections': tf.shape(data['groundtruth_classes']),
'image_info': image_info,
'boxes': box_ops.denormalize_boxes(
data['groundtruth_boxes'], image_shape),
'classes': data['groundtruth_classes'],
'areas': data['groundtruth_area'],
'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
}
groundtruths['source_id'] = utils.process_source_id(
groundtruths['source_id'])
groundtruths = utils.pad_groundtruths_to_fixed_size(
groundtruths, self._max_num_instances)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
'groundtruths': groundtruths,
}
return image, labels
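
A minimal sketch of wiring this parser into a plain `tf.data` pipeline; the file pattern and anchor settings below are illustrative assumptions (the test in the next file exercises the same flow through `input_reader.InputReader`).

import tensorflow as tf
from official.vision.beta.dataloaders import retinanet_input
from official.vision.beta.dataloaders import tf_example_decoder

decoder = tf_example_decoder.TfExampleDecoder()
retinanet_parser = retinanet_input.Parser(
    output_size=[640, 640], min_level=3, max_level=7, num_scales=3,
    aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0, dtype='float32')

# Hypothetical TFRecord file pattern.
files = tf.io.gfile.glob('/tmp/coco/val*.tfrecord')
dataset = (tf.data.TFRecordDataset(files)
           .map(decoder.decode)
           .map(retinanet_parser.parse_fn(is_training=True))
           .batch(2, drop_remainder=True))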
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for retinanet_parser.py."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.core import input_reader
from official.modeling.hyperparams import config_definitions as cfg
from official.vision.beta.dataloaders import retinanet_input
from official.vision.beta.dataloaders import tf_example_decoder
class RetinaNetInputTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
([512, 640], True, True, True),
([640, 640], False, False, False),
)
def testRetinanetInputReader(self,
output_size,
skip_crowd_during_training,
use_autoaugment,
is_training):
batch_size = 2
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [0.5, 1.0, 2.0]
anchor_size = 3
max_num_instances = 100
params = cfg.DataConfig(
input_path='/placer/prod/home/snaggletooth/test/data/coco/val*',
global_batch_size=batch_size,
is_training=is_training)
decoder = tf_example_decoder.TfExampleDecoder()
parser = retinanet_input.Parser(
output_size=output_size,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
skip_crowd_during_training=skip_crowd_during_training,
use_autoaugment=use_autoaugment,
max_num_instances=max_num_instances,
dtype='bfloat16')
reader = input_reader.InputReader(
params,
dataset_fn=tf.data.TFRecordDataset,
decoder_fn=decoder.decode,
parser_fn=parser.parse_fn(params.is_training))
dataset = reader.read()
iterator = iter(dataset)
image, labels = next(iterator)
np_image = image.numpy()
np_labels = tf.nest.map_structure(lambda x: x.numpy(), labels)
# Checks image shape.
self.assertEqual(list(np_image.shape),
[batch_size, output_size[0], output_size[1], 3])
# Checks keys in labels.
if is_training:
self.assertCountEqual(
np_labels.keys(),
['cls_targets', 'box_targets', 'anchor_boxes', 'cls_weights',
'box_weights', 'image_info'])
else:
self.assertCountEqual(
np_labels.keys(),
['cls_targets', 'box_targets', 'anchor_boxes', 'cls_weights',
'box_weights', 'groundtruths', 'image_info'])
# Checks shapes of `image_info` and `anchor_boxes`.
self.assertEqual(np_labels['image_info'].shape, (batch_size, 4, 2))
n_anchors = 0
for level in range(min_level, max_level + 1):
stride = 2 ** level
output_size_l = [output_size[0] / stride, output_size[1] / stride]
anchors_per_location = num_scales * len(aspect_ratios)
self.assertEqual(
list(np_labels['anchor_boxes'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
4 * anchors_per_location])
n_anchors += output_size_l[0] * output_size_l[1] * anchors_per_location
# Checks shapes of training objectives.
self.assertEqual(np_labels['cls_weights'].shape, (batch_size, n_anchors))
for level in range(min_level, max_level + 1):
stride = 2 ** level
output_size_l = [output_size[0] / stride, output_size[1] / stride]
anchors_per_location = num_scales * len(aspect_ratios)
self.assertEqual(
list(np_labels['cls_targets'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
anchors_per_location])
self.assertEqual(
list(np_labels['box_targets'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
4 * anchors_per_location])
# Checks shape of groundtruths for eval.
if not is_training:
self.assertEqual(np_labels['groundtruths']['source_id'].shape,
(batch_size,))
self.assertEqual(np_labels['groundtruths']['classes'].shape,
(batch_size, max_num_instances))
self.assertEqual(np_labels['groundtruths']['boxes'].shape,
(batch_size, max_num_instances, 4))
self.assertEqual(np_labels['groundtruths']['areas'].shape,
(batch_size, max_num_instances))
self.assertEqual(np_labels['groundtruths']['is_crowds'].shape,
(batch_size, max_num_instances))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import csv
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import decoder
def _generate_source_id(image_bytes):
return tf.strings.as_string(
tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 63 - 1))
class TfExampleDecoder(decoder.Decoder):
"""Tensorflow Example proto decoder."""
def __init__(self,
include_mask=False,
regenerate_source_id=False):
self._include_mask = include_mask
self._regenerate_source_id = regenerate_source_id
self._keys_to_features = {
'image/encoded': tf.io.FixedLenFeature((), tf.string),
'image/source_id': tf.io.FixedLenFeature((), tf.string),
'image/height': tf.io.FixedLenFeature((), tf.int64),
'image/width': tf.io.FixedLenFeature((), tf.int64),
'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
'image/object/class/label': tf.io.VarLenFeature(tf.int64),
'image/object/area': tf.io.VarLenFeature(tf.float32),
'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
}
if include_mask:
self._keys_to_features.update({
'image/object/mask': tf.io.VarLenFeature(tf.string),
})
def _decode_image(self, parsed_tensors):
"""Decodes the image and set its static shape."""
image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
image.set_shape([None, None, 3])
return image
def _decode_boxes(self, parsed_tensors):
"""Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
return tf.stack([ymin, xmin, ymax, xmax], axis=-1)
def _decode_classes(self, parsed_tensors):
return parsed_tensors['image/object/class/label']
def _decode_areas(self, parsed_tensors):
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32)
width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32)
return tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
lambda: parsed_tensors['image/object/area'],
lambda: (xmax - xmin) * (ymax - ymin) * height * width)
def _decode_masks(self, parsed_tensors):
"""Decode a set of PNG masks to the tf.float32 tensors."""
def _decode_png_mask(png_bytes):
mask = tf.squeeze(
tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
mask = tf.cast(mask, dtype=tf.float32)
mask.set_shape([None, None])
return mask
height = parsed_tensors['image/height']
width = parsed_tensors['image/width']
masks = parsed_tensors['image/object/mask']
return tf.cond(
pred=tf.greater(tf.size(input=masks), 0),
true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))
def decode(self, serialized_example):
"""Decode the serialized example.
Args:
serialized_example: a single serialized tf.Example string.
Returns:
decoded_tensors: a dictionary of tensors with the following fields:
- source_id: a string scalar tensor.
- image: a uint8 tensor of shape [None, None, 3].
- height: an integer scalar tensor.
- width: an integer scalar tensor.
        - groundtruth_classes: an int64 tensor of shape [None].
- groundtruth_is_crowd: a bool tensor of shape [None].
- groundtruth_area: a float32 tensor of shape [None].
- groundtruth_boxes: a float32 tensor of shape [None, 4].
- groundtruth_instance_masks: a float32 tensor of shape
[None, None, None].
- groundtruth_instance_masks_png: a string tensor of shape [None].
"""
parsed_tensors = tf.io.parse_single_example(
serialized=serialized_example, features=self._keys_to_features)
for k in parsed_tensors:
if isinstance(parsed_tensors[k], tf.SparseTensor):
if parsed_tensors[k].dtype == tf.string:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value='')
else:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value=0)
if self._regenerate_source_id:
source_id = _generate_source_id(parsed_tensors['image/encoded'])
else:
source_id = tf.cond(
tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0),
lambda: parsed_tensors['image/source_id'],
lambda: _generate_source_id(parsed_tensors['image/encoded']))
image = self._decode_image(parsed_tensors)
boxes = self._decode_boxes(parsed_tensors)
classes = self._decode_classes(parsed_tensors)
areas = self._decode_areas(parsed_tensors)
is_crowds = tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
lambda: tf.zeros_like(classes, dtype=tf.bool))
if self._include_mask:
masks = self._decode_masks(parsed_tensors)
decoded_tensors = {
'source_id': source_id,
'image': image,
'height': parsed_tensors['image/height'],
'width': parsed_tensors['image/width'],
'groundtruth_classes': classes,
'groundtruth_is_crowd': is_crowds,
'groundtruth_area': areas,
'groundtruth_boxes': boxes,
}
if self._include_mask:
decoded_tensors.update({
'groundtruth_instance_masks': masks,
'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
})
return decoded_tensors
class TfExampleDecoderLabelMap(TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, label_map, include_mask=False, regenerate_source_id=False):
super(TfExampleDecoderLabelMap, self).__init__(
include_mask=include_mask, regenerate_source_id=regenerate_source_id)
self._keys_to_features.update({
'image/object/class/text': tf.io.VarLenFeature(tf.string),
})
name_to_id = self._process_label_map(label_map)
self._name_to_id_table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(
keys=tf.constant(list(name_to_id.keys()), dtype=tf.string),
values=tf.constant(list(name_to_id.values()), dtype=tf.int64)),
default_value=-1)
def _process_label_map(self, label_map):
if label_map.endswith('.csv'):
name_to_id = self._process_csv(label_map)
else:
      raise ValueError('The label map file is in an incorrect format; only '
                       '.csv is supported.')
return name_to_id
def _process_csv(self, label_map):
name_to_id = {}
with tf.io.gfile.GFile(label_map, 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if len(row) != 2:
raise ValueError('Each row of the csv label map file must be in '
'`id,name` format. length = {}'.format(len(row)))
id_index = int(row[0])
name = row[1]
name_to_id[name] = id_index
return name_to_id
def _decode_classes(self, parsed_tensors):
return self._name_to_id_table.lookup(
parsed_tensors['image/object/class/text'])
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_example_decoder.py."""
import io
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
DUMP_SOURCE_ID = b'123'
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
(100, 100, 0, True),
(100, 100, 1, True),
(100, 100, 2, True),
(100, 100, 0, False),
(100, 100, 1, False),
(100, 100, 2, False),
)
def test_result_shape(self,
image_height,
image_width,
num_instances,
regenerate_source_id):
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=True, regenerate_source_id=regenerate_source_id)
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt='JPEG')
if num_instances == 0:
xmins = []
xmaxs = []
ymins = []
ymaxs = []
labels = []
areas = []
is_crowds = []
masks = []
else:
xmins = list(np.random.rand(num_instances))
xmaxs = list(np.random.rand(num_instances))
ymins = list(np.random.rand(num_instances))
ymaxs = list(np.random.rand(num_instances))
labels = list(np.random.randint(100, size=num_instances))
areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
is_crowds = [0] * num_instances
masks = []
for _ in range(num_instances):
mask = _encode_image(
np.uint8(np.random.rand(image_height, image_width) * 255),
fmt='PNG')
masks.append(mask)
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
if not regenerate_source_id:
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
def test_handling_missing_fields(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[False, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import csv
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
class TfExampleDecoderLabelMap(tf_example_decoder.TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, label_map, include_mask=False, regenerate_source_id=False):
super(TfExampleDecoderLabelMap, self).__init__(
include_mask=include_mask, regenerate_source_id=regenerate_source_id)
self._keys_to_features.update({
'image/object/class/text': tf.io.VarLenFeature(tf.string),
})
name_to_id = self._process_label_map(label_map)
self._name_to_id_table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(
keys=tf.constant(list(name_to_id.keys()), dtype=tf.string),
values=tf.constant(list(name_to_id.values()), dtype=tf.int64)),
default_value=-1)
def _process_label_map(self, label_map):
if label_map.endswith('.csv'):
name_to_id = self._process_csv(label_map)
else:
      raise ValueError('The label map file is in an incorrect format; only '
                       '.csv is supported.')
return name_to_id
def _process_csv(self, label_map):
name_to_id = {}
with tf.io.gfile.GFile(label_map, 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if len(row) != 2:
raise ValueError('Each row of the csv label map file must be in '
'`id,name` format. length = {}'.format(len(row)))
id_index = int(row[0])
name = row[1]
name_to_id[name] = id_index
return name_to_id
def _decode_classes(self, parsed_tensors):
return self._name_to_id_table.lookup(
parsed_tensors['image/object/class/text'])
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_example_label_map_decoder.py."""
import io
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_label_map_decoder
DUMP_SOURCE_ID = b'123'
LABEL_MAP_CSV_CONTENT = '0,class_0\n1,class_1\n2,class_2'
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
  @parameterized.parameters(
      (100, 100, 0),
      (100, 100, 1),
      (100, 100, 2),
  )
def test_result_shape(self, image_height, image_width, num_instances):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt='JPEG')
if num_instances == 0:
xmins = []
xmaxs = []
ymins = []
ymaxs = []
labels = []
areas = []
is_crowds = []
masks = []
else:
xmins = list(np.random.rand(num_instances))
xmaxs = list(np.random.rand(num_instances))
ymins = list(np.random.rand(num_instances))
ymaxs = list(np.random.rand(num_instances))
areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
is_crowds = [0] * num_instances
masks = []
labels = [b'class_1'] * num_instances
for _ in range(num_instances):
mask = _encode_image(
np.uint8(np.random.rand(image_height, image_width) * 255),
fmt='PNG')
masks.append(mask)
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/text': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [b'class_2', b'class_0']
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/text': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[2, 0], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader utils."""
# Import libraries
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int32)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int32),
false_fn=lambda: tf.identity(source_id))
return source_id
def pad_groundtruths_to_fixed_size(groundtruths, size):
"""Pads the first dimension of groundtruths labels to the fixed size."""
groundtruths['boxes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['boxes'], size, -1)
groundtruths['is_crowds'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['is_crowds'], size, 0)
groundtruths['areas'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['areas'], size, -1)
groundtruths['classes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['classes'], size, -1)
return groundtruths
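
A short sketch of what these helpers do on concrete values; the shapes below are illustrative assumptions.

import tensorflow as tf
from official.vision.beta.dataloaders import utils

# Numeric-string ids are converted to int32 scalars.
source_id = utils.process_source_id(tf.constant('123'))  # -> 123 (tf.int32)

# The leading dimension of each groundtruth field is clipped or padded to
# `size`; boxes/areas/classes pad with -1 and is_crowds pads with 0.
groundtruths = {
    'boxes': tf.zeros([2, 4]),
    'is_crowds': tf.zeros([2], tf.int32),
    'areas': tf.zeros([2]),
    'classes': tf.zeros([2], tf.int32),
}
groundtruths = utils.pad_groundtruths_to_fixed_size(groundtruths, size=100)
# groundtruths['boxes'].shape == [100, 4]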
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Parser for video and label datasets."""
from typing import Dict, Optional, Tuple
from absl import logging
import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import preprocess_ops_3d
IMAGE_KEY = 'image/encoded'
LABEL_KEY = 'clip/label/index'
def _process_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
stride: int = 1,
num_test_clips: int = 1,
min_resize: int = 224,
crop_size: int = 200,
zero_centering_image: bool = False,
seed: Optional[int] = None) -> tf.Tensor:
"""Processes a serialized image tensor.
Args:
image: Input Tensor of shape [timesteps] and type tf.string of serialized
frames.
is_training: Whether or not in training mode. If True, random sample, crop
and left right flip is used.
num_frames: Number of frames per subclip.
stride: Temporal stride to sample frames.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
      are aggregated in the batch dimension.
min_resize: Frames are resized so that min(height, width) is min_resize.
crop_size: Final size of the frame after cropping the resized frames. Both
height and width are the same.
zero_centering_image: If True, frames are normalized to values in [-1, 1].
If False, values in [0, 1].
seed: A deterministic seed to use when sampling.
Returns:
Processed frames. Tensor of shape
[num_frames * num_test_clips, crop_size, crop_size, 3].
"""
# Validate parameters.
if is_training and num_test_clips != 1:
logging.warning(
'`num_test_clips` %d is ignored since `is_training` is `True`.',
num_test_clips)
# Temporal sampler.
if is_training:
# Sample random clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride,
seed)
elif num_test_clips > 1:
# Sample linspace clips.
image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips,
num_frames, stride)
else:
# Sample middle clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride)
# Decode JPEG string to tf.uint8.
image = preprocess_ops_3d.decode_jpeg(image, 3)
# Resize images (resize happens only if necessary to save compute).
image = preprocess_ops_3d.resize_smallest(image, min_resize)
if is_training:
# Standard image data augmentation: random crop and random flip.
image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, True,
seed)
image = preprocess_ops_3d.random_flip_left_right(image, seed)
else:
# Central crop of the frames.
image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False)
  # Casts the frames to float32 and normalizes per zero_centering_image.
return preprocess_ops_3d.normalize_image(image, zero_centering_image)
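# For example (illustrative values): in eval mode with num_test_clips=2,
# num_frames=32, stride=1 and crop_size=200, two linearly spaced 32-frame
# clips are sampled, so the returned tensor has shape [64, 200, 200, 3].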
def _postprocess_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
num_test_clips: int = 1) -> tf.Tensor:
"""Processes a batched Tensor of frames.
  The same parameters used in `_process_image` should be used here.
Args:
image: Input Tensor of shape [batch, timesteps, height, width, 3].
is_training: Whether or not in training mode. If True, random sample, crop
and left right flip is used.
num_frames: Number of frames per subclip.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
      are aggregated in the batch dimension.
Returns:
Processed frames. Tensor of shape
[batch * num_test_clips, num_frames, height, width, 3].
"""
  if num_test_clips > 1 and not is_training:
    # In this case, multiple clips are merged together in the batch dimension,
    # which becomes B * num_test_clips.
image = tf.reshape(
image, (-1, num_frames, image.shape[2], image.shape[3], image.shape[4]))
return image
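# For example (illustrative values): a batch of 4 videos processed with
# num_test_clips=2 and num_frames=32 arrives here as [4, 64, H, W, 3] and is
# reshaped to [8, 32, H, W, 3] so that each clip is scored independently.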
def _process_label(label: tf.Tensor,
one_hot_label: bool = True,
num_classes: Optional[int] = None) -> tf.Tensor:
"""Processes label Tensor."""
# Validate parameters.
if one_hot_label and not num_classes:
raise ValueError(
'`num_classes` should be given when requesting one hot label.')
# Cast to tf.int32.
label = tf.cast(label, dtype=tf.int32)
if one_hot_label:
# Replace label index by one hot representation.
label = tf.one_hot(label, num_classes)
return label
class Decoder(decoder.Decoder):
"""A tf.Example decoder for classification task."""
  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
    self._image_key = image_key
    self._label_key = label_key
self._context_description = {
# One integer stored in context.
self._label_key: tf.io.FixedLenFeature((), tf.int64),
}
self._sequence_description = {
# Each image is a string encoding JPEG.
self._image_key: tf.io.FixedLenSequenceFeature((), tf.string),
}
def decode(self, serialized_example):
"""Parses a single tf.Example into image and label tensors."""
context, sequences = tf.io.parse_single_sequence_example(
serialized_example, self._context_description,
self._sequence_description)
return {
self._image_key: sequences[self._image_key],
self._label_key: context[self._label_key]
}
class Parser(parser.Parser):
"""Parses a video and label dataset."""
def __init__(self,
input_params: exp_cfg.DataConfig,
image_key: str = IMAGE_KEY,
label_key: str = LABEL_KEY):
self._num_frames = input_params.feature_shape[0]
self._stride = input_params.temporal_stride
self._num_test_clips = input_params.num_test_clips
self._min_resize = input_params.min_image_size
self._crop_size = input_params.feature_shape[1]
self._one_hot_label = input_params.one_hot
self._num_classes = input_params.num_classes
self._image_key = image_key
self._label_key = label_key
def _parse_train_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for training."""
# Process image and label.
image = decoded_tensors[self._image_key]
label = decoded_tensors[self._label_key]
image = _process_image(
image=image,
is_training=True,
num_frames=self._num_frames,
stride=self._stride,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size)
label = _process_label(label, self._one_hot_label, self._num_classes)
return {'image': image}, label
def _parse_eval_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for evaluation."""
image = decoded_tensors[self._image_key]
label = decoded_tensors[self._label_key]
image = _process_image(
image=image,
is_training=False,
num_frames=self._num_frames,
stride=self._stride,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size)
label = _process_label(label, self._one_hot_label, self._num_classes)
return {'image': image}, label
class PostBatchProcessor(object):
"""Processes a video and label dataset which is batched."""
def __init__(self, input_params: exp_cfg.DataConfig):
self._is_training = input_params.is_training
self._num_frames = input_params.feature_shape[0]
self._num_test_clips = input_params.num_test_clips
def __call__(
self,
image: Dict[str, tf.Tensor],
label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses a single tf.Example into image and label tensors."""
image = image['image']
image = _postprocess_image(
image=image,
is_training=self._is_training,
num_frames=self._num_frames,
num_test_clips=self._num_test_clips)
return {'image': image}, label
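
A minimal sketch of the intended end-to-end flow for this video pipeline; the config values and batch size are illustrative (the test in the next file runs the same path on fake data).

import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import video_input

params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)  # Illustrative small clip.
params.min_image_size = 224
decoder = video_input.Decoder()
video_parser = video_input.Parser(params)
postprocess = video_input.PostBatchProcessor(params)

# Given a dataset of serialized tf.SequenceExample strings:
# dataset = (dataset.map(decoder.decode)
#            .map(video_parser.parse_fn(params.is_training))
#            .batch(8)
#            .map(postprocess))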
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_input.py."""
import io
# Import libraries
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import video_input
class DecoderTest(tf.test.TestCase):
  """Tests the video classification tf.SequenceExample decoder."""
def test_decoder(self):
decoder = video_input.Decoder()
# Create fake data.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
label = 42
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
seq_example = tf.train.SequenceExample()
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [
label
]
serialized_example = seq_example.SerializeToString()
decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertCountEqual([video_input.IMAGE_KEY, video_input.LABEL_KEY],
results.keys())
self.assertEqual(label, results[video_input.LABEL_KEY])
class VideoAndLabelParserTest(tf.test.TestCase):
def test_video_input(self):
params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)
params.min_image_size = 224
decoder = video_input.Decoder()
parser = video_input.Parser(params).parse_fn(params.is_training)
# Create fake data.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
seq_example = tf.train.SequenceExample()
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [
42
]
input_tensor = tf.constant(seq_example.SerializeToString())
decoded_tensors = decoder.decode(input_tensor)
output_tensor = parser(decoded_tensors)
image_features, label = output_tensor
image = image_features['image']
self.assertAllEqual(image.shape, (2, 224, 224, 3))
self.assertAllEqual(label.shape, (600,))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The COCO-style evaluator.
The following snippet demonstrates the use of interfaces:
evaluator = COCOEvaluator(...)
for _ in range(num_evals):
for _ in range(num_batches_per_eval):
predictions, groundtruths = predictor.predict(...)  # Pop a batch.
evaluator.update_state(groundtruths, predictions)
evaluator.result() # finish one full eval and reset states.
See also: https://github.com/cocodataset/cocoapi/
"""
import atexit
import tempfile
# Import libraries
from absl import logging
import numpy as np
from pycocotools import cocoeval
import six
import tensorflow as tf
from official.vision.beta.evaluation import coco_utils
class COCOEvaluator(object):
"""COCO evaluation metric class."""
def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True):
"""Constructs COCO evaluation class.
The class provides the interface to COCO metrics. `update_state()` takes
detections from each image and pushes them to `self._predictions`.
`evaluate()` loads a JSON file in COCO annotation format as the groundtruths
and runs COCO evaluation.
Args:
annotation_file: a JSON file that stores annotations of the eval dataset.
If `annotation_file` is None, groundtruth annotations will be loaded
from the dataloader.
include_mask: a boolean to indicate whether or not to include the mask
eval.
need_rescale_bboxes: If true bboxes in `predictions` will be rescaled back
to absolute values (`image_info` is needed in this case).
"""
if annotation_file:
if annotation_file.startswith('gs://'):
_, local_val_json = tempfile.mkstemp(suffix='.json')
tf.io.gfile.remove(local_val_json)
tf.io.gfile.copy(annotation_file, local_val_json)
atexit.register(tf.io.gfile.remove, local_val_json)
else:
local_val_json = annotation_file
self._coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if include_mask else 'box'),
annotation_file=local_val_json)
self._annotation_file = annotation_file
self._include_mask = include_mask
self._metric_names = [
'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10',
'ARmax100', 'ARs', 'ARm', 'ARl'
]
self._required_prediction_fields = [
'source_id', 'num_detections', 'detection_classes', 'detection_scores',
'detection_boxes'
]
self._need_rescale_bboxes = need_rescale_bboxes
if self._need_rescale_bboxes:
self._required_prediction_fields.append('image_info')
self._required_groundtruth_fields = [
'source_id', 'height', 'width', 'classes', 'boxes'
]
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self._metric_names]
self._metric_names.extend(mask_metric_names)
self._required_prediction_fields.extend(['detection_masks'])
self._required_groundtruth_fields.extend(['masks'])
self.reset_states()
@property
def name(self):
return 'coco_metric'
def reset_states(self):
"""Resets internal states for a fresh run."""
self._predictions = {}
if not self._annotation_file:
self._groundtruths = {}
def result(self):
"""Evaluates detection results, and reset_states."""
metric_dict = self.evaluate()
# Cleans up the internal variables in order for a fresh eval next time.
self.reset_states()
return metric_dict
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
coco_metric: float numpy array with shape [24] representing the
coco-style evaluation metrics (box and mask).
"""
if not self._annotation_file:
logging.info('There is no annotation_file in COCOEvaluator.')
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
logging.info('Using annotation file: %s', self._annotation_file)
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
return metrics_dict
def _process_predictions(self, predictions):
image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
predictions['detection_boxes'] = (
predictions['detection_boxes'].astype(np.float32))
predictions['detection_boxes'] /= image_scale
if 'detection_outer_boxes' in predictions:
predictions['detection_outer_boxes'] = (
predictions['detection_outer_boxes'].astype(np.float32))
predictions['detection_outer_boxes'] /= image_scale
def _convert_to_numpy(self, groundtruths, predictions):
"""Converts tesnors to numpy arrays."""
if groundtruths:
labels = tf.nest.map_structure(lambda x: x.numpy(), groundtruths)
numpy_groundtruths = {}
for key, val in labels.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_groundtruths[key] = val
else:
numpy_groundtruths = groundtruths
if predictions:
outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions)
numpy_predictions = {}
for key, val in outputs.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_predictions[key] = val
else:
numpy_predictions = predictions
return numpy_groundtruths, numpy_predictions
def update_state(self, groundtruths, predictions):
"""Update and aggregate detection results and groundtruth data.
Args:
groundtruths: a dictionary of Tensors including the fields below.
See also different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- height: a numpy array of int of shape [batch_size].
- width: a numpy array of int of shape [batch_size].
- num_detections: a numpy array of int of shape [batch_size].
- boxes: a numpy array of float of shape [batch_size, K, 4].
- classes: a numpy array of int of shape [batch_size, K].
Optional fields:
- is_crowds: a numpy array of int of shape [batch_size, K]. If the
field is absent, it is assumed that this instance is not crowd.
- areas: a numpy array of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a numpy array of float of shape
[batch_size, K, mask_height, mask_width].
predictions: a dictionary of tensors including the fields below.
See different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- image_info [if `need_rescale_bboxes` is True]: a numpy array of
float of shape [batch_size, 4, 2].
- num_detections: a numpy array of
int of shape [batch_size].
- detection_boxes: a numpy array of float of shape [batch_size, K, 4].
- detection_classes: a numpy array of int of shape [batch_size, K].
- detection_scores: a numpy array of float of shape [batch_size, K].
Optional fields:
- detection_masks: a numpy array of float of shape
[batch_size, K, mask_height, mask_width].
Raises:
ValueError: if the required prediction or groundtruth fields are not
present in the incoming `predictions` or `groundtruths`.
"""
groundtruths, predictions = self._convert_to_numpy(groundtruths,
predictions)
for k in self._required_prediction_fields:
if k not in predictions:
raise ValueError(
'Missing the required key `{}` in predictions!'.format(k))
if self._need_rescale_bboxes:
self._process_predictions(predictions)
for k, v in six.iteritems(predictions):
if k not in self._predictions:
self._predictions[k] = [v]
else:
self._predictions[k].append(v)
if not self._annotation_file:
assert groundtruths
for k in self._required_groundtruth_fields:
if k not in groundtruths:
raise ValueError(
'Missing the required key `{}` in groundtruths!'.format(k))
for k, v in six.iteritems(groundtruths):
if k not in self._groundtruths:
self._groundtruths[k] = [v]
else:
self._groundtruths[k].append(v)
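# Editor's example: a minimal, hypothetical usage sketch, not part of the
# original file. The arrays are placeholders shaped [batch=1, K=1, ...]; boxes
# are [y1, x1, y2, x2]. With `need_rescale_bboxes=False`, `image_info` is not
# required. `result()` runs pycocotools evaluation and resets state.
def _example_coco_evaluator_usage(annotation_file):
  evaluator = COCOEvaluator(
      annotation_file=annotation_file,
      include_mask=False,
      need_rescale_bboxes=False)
  predictions = tf.nest.map_structure(tf.convert_to_tensor, {
      'source_id': np.array([1]),
      'num_detections': np.array([1]),
      'detection_boxes': np.array([[[10., 10., 50., 50.]]], np.float32),
      'detection_classes': np.array([[1]]),
      'detection_scores': np.array([[0.9]], np.float32),
  })
  evaluator.update_state(groundtruths=None, predictions=predictions)
  return evaluator.result()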
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for coco_evaluator."""
import io
import os
# Import libraries
from absl import logging
from absl.testing import absltest
from absl.testing import parameterized
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
import six
import tensorflow as tf
from official.vision.beta.evaluation import coco_evaluator
from official.vision.beta.evaluation import coco_utils
_COCO_JSON_FILE = '/placer/prod/home/snaggletooth/test/data/coco/instances_val2017.json'
_SAVED_COCO_JSON_FILE = 'tmp.json'
def get_groundtruth_annotations(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
image = coco.loadImgs([image_id])[0]
groundtruths = {
'boxes': [],
'classes': [],
'is_crowds': [],
'areas': [],
}
if include_mask:
groundtruths['masks'] = []
for ann in anns:
# Creates detections from groundtruths.
# Converts [x, y, w, h] to [y1, x1, y2, x2] box format.
box = [ann['bbox'][1],
ann['bbox'][0],
(ann['bbox'][1] + ann['bbox'][3]),
(ann['bbox'][0] + ann['bbox'][2])]
# Creates groundtruths.
groundtruths['boxes'].append(box)
groundtruths['classes'].append(ann['category_id'])
groundtruths['is_crowds'].append(ann['iscrowd'])
groundtruths['areas'].append(ann['area'])
if include_mask:
mask_img = Image.fromarray(coco.annToMask(ann).astype(np.uint8))
with io.BytesIO() as stream:
mask_img.save(stream, format='PNG')
mask_bytes = stream.getvalue()
groundtruths['masks'].append(mask_bytes)
for key, val in groundtruths.items():
groundtruths[key] = np.stack(val, axis=0)
groundtruths['source_id'] = image['id']
groundtruths['height'] = image['height']
groundtruths['width'] = image['width']
groundtruths['num_detections'] = len(anns)
for k, v in six.iteritems(groundtruths):
groundtruths[k] = np.expand_dims(v, axis=0)
return groundtruths
def get_predictions(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
image = coco.loadImgs([image_id])[0]
predictions = {
'detection_boxes': [],
'detection_classes': [],
'detection_scores': [],
}
if include_mask:
predictions['detection_masks'] = []
for ann in anns:
# Creates detections from groundtruths.
# Converts [x, y, w, h] to [y1, x1, y2, x2] box format and
# does the denormalization.
box = [ann['bbox'][1],
ann['bbox'][0],
(ann['bbox'][1] + ann['bbox'][3]),
(ann['bbox'][0] + ann['bbox'][2])]
predictions['detection_boxes'].append(box)
predictions['detection_classes'].append(ann['category_id'])
predictions['detection_scores'].append(1)
if include_mask:
mask = coco.annToMask(ann)
predictions['detection_masks'].append(mask)
for key, val in predictions.items():
predictions[key] = np.expand_dims(np.stack(val, axis=0), axis=0)
predictions['source_id'] = np.array([image['id']])
predictions['num_detections'] = np.array([len(anns)])
predictions['image_info'] = np.array(
[[[image['height'], image['width']],
[image['height'], image['width']],
[1, 1],
[0, 0]]], dtype=np.float32)
return predictions
def get_fake_predictions(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
label_id_max = max([ann['category_id'] for ann in anns])
image = coco.loadImgs([image_id])[0]
num_detections = 100
xmin = np.random.randint(
low=0, high=int(image['width'] / 2), size=(1, num_detections))
xmax = np.random.randint(
low=int(image['width'] / 2), high=image['width'],
size=(1, num_detections))
ymin = np.random.randint(
low=0, high=int(image['height'] / 2), size=(1, num_detections))
ymax = np.random.randint(
low=int(image['height'] / 2), high=image['height'],
size=(1, num_detections))
predictions = {
'detection_boxes': np.stack([ymin, xmin, ymax, xmax], axis=-1),
'detection_classes': np.random.randint(
low=0, high=(label_id_max + 1), size=(1, num_detections)),
'detection_scores': np.random.random(size=(1, num_detections)),
}
if include_mask:
predictions['detection_masks'] = np.random.randint(
1, size=(1, num_detections, image['height'], image['width']))
predictions['source_id'] = np.array([image['id']])
predictions['num_detections'] = np.array([num_detections])
predictions['image_info'] = np.array(
[[[image['height'], image['width']],
[image['height'], image['width']],
[1, 1],
[0, 0]]], dtype=np.float32)
return predictions
class DummyGroundtruthGenerator(object):
def __init__(self, include_mask, image_id, coco):
self._include_mask = include_mask
self._image_id = image_id
self._coco = coco
def __call__(self):
yield get_groundtruth_annotations(
self._image_id, self._coco, self._include_mask)
class COCOEvaluatorTest(parameterized.TestCase, absltest.TestCase):
def setUp(self):
super(COCOEvaluatorTest, self).setUp()
temp = self.create_tempdir()
self._saved_coco_json_file = os.path.join(temp.full_path,
_SAVED_COCO_JSON_FILE)
def tearDown(self):
super(COCOEvaluatorTest, self).tearDown()
@parameterized.parameters(
(False, False), (False, True), (True, False), (True, True))
def testEval(self, include_mask, use_fake_predictions):
coco = COCO(annotation_file=_COCO_JSON_FILE)
index = np.random.randint(len(coco.dataset['images']))
image_id = coco.dataset['images'][index]['id']
# image_id = 26564
# image_id = 324158
if use_fake_predictions:
predictions = get_fake_predictions(
image_id, coco, include_mask=include_mask)
else:
predictions = get_predictions(image_id, coco, include_mask=include_mask)
if not predictions:
logging.info('Empty predictions for index=%d', index)
return
predictions = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
predictions)
evaluator_w_json = coco_evaluator.COCOEvaluator(
annotation_file=_COCO_JSON_FILE, include_mask=include_mask)
evaluator_w_json.update_state(groundtruths=None, predictions=predictions)
results_w_json = evaluator_w_json.result()
dummy_generator = DummyGroundtruthGenerator(
include_mask=include_mask, image_id=image_id, coco=coco)
coco_utils.generate_annotation_file(dummy_generator,
self._saved_coco_json_file)
evaluator_no_json = coco_evaluator.COCOEvaluator(
annotation_file=self._saved_coco_json_file, include_mask=include_mask)
evaluator_no_json.update_state(groundtruths=None, predictions=predictions)
results_no_json = evaluator_no_json.result()
for k, v in results_w_json.items():
self.assertEqual(v, results_no_json[k])
@parameterized.parameters(
(False, False), (False, True), (True, False), (True, True))
def testEvalOnTheFly(self, include_mask, use_fake_predictions):
coco = COCO(annotation_file=_COCO_JSON_FILE)
index = np.random.randint(len(coco.dataset['images']))
image_id = coco.dataset['images'][index]['id']
# image_id = 26564
# image_id = 324158
if use_fake_predictions:
predictions = get_fake_predictions(
image_id, coco, include_mask=include_mask)
else:
predictions = get_predictions(image_id, coco, include_mask=include_mask)
if not predictions:
logging.info('Empty predictions for index=%d', index)
return
predictions = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
predictions)
evaluator_w_json = coco_evaluator.COCOEvaluator(
annotation_file=_COCO_JSON_FILE, include_mask=include_mask)
evaluator_w_json.update_state(groundtruths=None, predictions=predictions)
results_w_json = evaluator_w_json.result()
groundtruths = get_groundtruth_annotations(image_id, coco, include_mask)
groundtruths = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
groundtruths)
evaluator_no_json = coco_evaluator.COCOEvaluator(
annotation_file=None, include_mask=include_mask)
evaluator_no_json.update_state(groundtruths, predictions)
results_no_json = evaluator_no_json.result()
for k, v in results_w_json.items():
self.assertEqual(v, results_no_json[k])
if __name__ == '__main__':
absltest.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Util functions related to pycocotools and COCO eval."""
import copy
import json
# Import libraries
from absl import logging
import numpy as np
from PIL import Image
from pycocotools import coco
from pycocotools import mask as mask_api
import six
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import mask_ops
class COCOWrapper(coco.COCO):
"""COCO wrapper class.
This class wraps COCO API object, which provides the following additional
functionalities:
1. Support string type image id.
2. Support loading the groundtruth dataset using the external annotation
dictionary.
3. Support loading the prediction results using the external annotation
dictionary.
"""
def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None):
"""Instantiates a COCO-style API object.
Args:
eval_type: either 'box' or 'mask'.
annotation_file: a JSON file that stores annotations of the eval dataset.
This is required if `gt_dataset` is not provided.
gt_dataset: the groundtruth eval datatset in COCO API format.
"""
if ((annotation_file and gt_dataset) or
((not annotation_file) and (not gt_dataset))):
raise ValueError('One and only one of `annotation_file` and `gt_dataset` '
'needs to be specified.')
if eval_type not in ['box', 'mask']:
raise ValueError('The `eval_type` can only be either `box` or `mask`.')
coco.COCO.__init__(self, annotation_file=annotation_file)
self._eval_type = eval_type
if gt_dataset:
self.dataset = gt_dataset
self.createIndex()
def loadRes(self, predictions):
"""Loads result file and return a result api object.
Args:
predictions: a list of dictionary each representing an annotation in COCO
format. The required fields are `image_id`, `category_id`, `score`,
`bbox`, `segmentation`.
Returns:
res: result COCO api object.
Raises:
ValueError: if the set of image ids from predictions is not a subset of
the set of image ids of the groundtruth dataset.
"""
res = coco.COCO()
res.dataset['images'] = copy.deepcopy(self.dataset['images'])
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
image_ids = [ann['image_id'] for ann in predictions]
if set(image_ids) != (set(image_ids) & set(self.getImgIds())):
raise ValueError('Results do not correspond to the current dataset!')
for ann in predictions:
x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2],
ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]]
if self._eval_type == 'box':
ann['area'] = ann['bbox'][2] * ann['bbox'][3]
ann['segmentation'] = [
[x1, y1, x1, y2, x2, y2, x2, y1]]
elif self._eval_type == 'mask':
ann['area'] = mask_api.area(ann['segmentation'])
res.dataset['annotations'] = copy.deepcopy(predictions)
res.createIndex()
return res
def convert_predictions_to_coco_annotations(predictions):
"""Converts a batch of predictions to annotations in COCO format.
Args:
predictions: a dictionary of lists of numpy arrays including the following
fields. K below denotes the maximum number of instances per image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- detection_boxes: a list of numpy arrays of float of shape
[batch_size, K, 4], where coordinates are in the original image
space (not the scaled image space).
- detection_classes: a list of numpy arrays of int of shape
[batch_size, K].
- detection_scores: a list of numpy arrays of float of shape
[batch_size, K].
Optional fields:
- detection_masks: a list of numpy arrays of float of shape
[batch_size, K, mask_height, mask_width].
Returns:
coco_predictions: prediction in COCO annotation format.
"""
coco_predictions = []
num_batches = len(predictions['source_id'])
batch_size = predictions['source_id'][0].shape[0]
max_num_detections = predictions['detection_classes'][0].shape[1]
use_outer_box = 'detection_outer_boxes' in predictions
for i in range(num_batches):
predictions['detection_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_boxes'][i])
if use_outer_box:
predictions['detection_outer_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_outer_boxes'][i])
mask_boxes = predictions['detection_outer_boxes']
else:
mask_boxes = predictions['detection_boxes']
for j in range(batch_size):
if 'detection_masks' in predictions:
image_masks = mask_ops.paste_instance_masks(
predictions['detection_masks'][i][j],
mask_boxes[i][j],
int(predictions['image_info'][i][j, 0, 0]),
int(predictions['image_info'][i][j, 0, 1]))
binary_masks = (image_masks > 0.0).astype(np.uint8)
encoded_masks = [
mask_api.encode(np.asfortranarray(binary_mask))
for binary_mask in list(binary_masks)]
for k in range(max_num_detections):
ann = {}
ann['image_id'] = predictions['source_id'][i][j]
ann['category_id'] = predictions['detection_classes'][i][j, k]
ann['bbox'] = predictions['detection_boxes'][i][j, k]
ann['score'] = predictions['detection_scores'][i][j, k]
if 'detection_masks' in predictions:
ann['segmentation'] = encoded_masks[k]
coco_predictions.append(ann)
for i, ann in enumerate(coco_predictions):
ann['id'] = i + 1
return coco_predictions
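# Editor's example: a hypothetical single-image sketch, not part of the
# original file. Each value is a list over batches of numpy arrays, as the
# docstring above describes; boxes enter as [y1, x1, y2, x2] and leave in
# COCO [x, y, w, h] order via `box_ops.yxyx_to_xywh`.
def _example_convert_predictions():
  predictions = {
      'source_id': [np.array([1])],
      'num_detections': [np.array([1])],
      'detection_boxes': [np.array([[[10., 20., 30., 60.]]], np.float32)],
      'detection_classes': [np.array([[5]])],
      'detection_scores': [np.array([[0.8]], np.float32)],
  }
  # Yields one annotation with bbox [20., 10., 40., 20.] and id 1.
  return convert_predictions_to_coco_annotations(predictions)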
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
"""Converts groundtruths to the dataset in COCO format.
Args:
groundtruths: a dictionary of lists of numpy arrays including the fields
below. Note that each element in a list represents the data for a single
batch. K below denotes the actual number of instances for each image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- height: a list of numpy arrays of int of shape [batch_size].
- width: a list of numpy arrays of int of shape [batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
where coordinates are in the original image space (not the
normalized coordinates).
- classes: a list of numpy arrays of int of shape [batch_size, K].
Optional fields:
- is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If
the field is absent, it is assumed that this instance is not crowd.
- areas: a list of numpy arrays of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a list of numpy arrays of string of shape [batch_size, K].
label_map: (optional) a dictionary that maps the category id to the
category name. If `None`, the category mapping is collected from the
`groundtruths`.
Returns:
coco_groundtruths: the groundtruth dataset in COCO format.
"""
source_ids = np.concatenate(groundtruths['source_id'], axis=0)
heights = np.concatenate(groundtruths['height'], axis=0)
widths = np.concatenate(groundtruths['width'], axis=0)
gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)} for i, h, w
in zip(source_ids, heights, widths)]
gt_annotations = []
num_batches = len(groundtruths['source_id'])
batch_size = groundtruths['source_id'][0].shape[0]
for i in range(num_batches):
for j in range(batch_size):
num_instances = groundtruths['num_detections'][i][j]
for k in range(int(num_instances)):
ann = {}
ann['image_id'] = int(groundtruths['source_id'][i][j])
if 'is_crowds' in groundtruths:
ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
else:
ann['iscrowd'] = 0
ann['category_id'] = int(groundtruths['classes'][i][j, k])
boxes = groundtruths['boxes'][i]
ann['bbox'] = [
float(boxes[j, k, 1]),
float(boxes[j, k, 0]),
float(boxes[j, k, 3] - boxes[j, k, 1]),
float(boxes[j, k, 2] - boxes[j, k, 0])]
if 'areas' in groundtruths:
ann['area'] = float(groundtruths['areas'][i][j, k])
else:
ann['area'] = float(
(boxes[j, k, 3] - boxes[j, k, 1]) *
(boxes[j, k, 2] - boxes[j, k, 0]))
if 'masks' in groundtruths:
mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
np_mask[np_mask > 0] = 255
encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
ann['segmentation'] = encoded_mask
if 'areas' not in groundtruths:
ann['area'] = mask_api.area(encoded_mask)
gt_annotations.append(ann)
for i, ann in enumerate(gt_annotations):
ann['id'] = i + 1
if label_map:
gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
else:
category_ids = [gt['category_id'] for gt in gt_annotations]
gt_categories = [{'id': i} for i in set(category_ids)]
gt_dataset = {
'images': gt_images,
'categories': gt_categories,
'annotations': copy.deepcopy(gt_annotations),
}
return gt_dataset
class COCOGroundtruthGenerator:
"""Generates the groundtruth annotations from a single example."""
def __init__(self, file_pattern, num_examples, include_mask):
self._file_pattern = file_pattern
self._num_examples = num_examples
self._include_mask = include_mask
self._dataset_fn = tf.data.TFRecordDataset
def _parse_single_example(self, example):
"""Parses a single serialized tf.Example proto.
Args:
example: a serialized tf.Example proto string.
Returns:
A dictionary of groundtruth with the following fields:
source_id: a scalar tensor of int64 representing the image source_id.
height: a scalar tensor of int64 representing the image height.
width: a scalar tensor of int64 representing the image width.
boxes: a float tensor of shape [K, 4], representing the groundtruth
boxes in absolute coordinates with respect to the original image size.
classes: a int64 tensor of shape [K], representing the class labels of
each instances.
is_crowds: a bool tensor of shape [K], indicating whether the instance
is crowd.
areas: a float tensor of shape [K], indicating the area of each
instance.
masks: a string tensor of shape [K], containing the bytes of the png
mask of each instance.
"""
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=self._include_mask)
decoded_tensors = decoder.decode(example)
image = decoded_tensors['image']
image_size = tf.shape(image)[0:2]
boxes = box_ops.denormalize_boxes(
decoded_tensors['groundtruth_boxes'], image_size)
groundtruths = {
'source_id': tf.strings.to_number(
decoded_tensors['source_id'], out_type=tf.int64),
'height': decoded_tensors['height'],
'width': decoded_tensors['width'],
'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
'boxes': boxes,
'classes': decoded_tensors['groundtruth_classes'],
'is_crowds': decoded_tensors['groundtruth_is_crowd'],
'areas': decoded_tensors['groundtruth_area'],
}
if self._include_mask:
groundtruths.update({
'masks': decoded_tensors['groundtruth_instance_masks_png'],
})
return groundtruths
def _build_pipeline(self):
"""Builds data pipeline to generate groundtruth annotations."""
dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
dataset = dataset.interleave(
map_func=lambda filename: self._dataset_fn(filename).prefetch(1),
cycle_length=12,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.map(self._parse_single_example,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(1, drop_remainder=False)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def __call__(self):
for groundtruth_result in self._build_pipeline():
yield groundtruth_result
def scan_and_generator_annotation_file(file_pattern: str,
num_samples: int,
include_mask: bool,
annotation_file: str):
"""Scans and generate the COCO-style annotation JSON file given a dataset."""
groundtruth_generator = COCOGroundtruthGenerator(
file_pattern, num_samples, include_mask)
generate_annotation_file(groundtruth_generator, annotation_file)
def generate_annotation_file(groundtruth_generator,
annotation_file):
"""Generates COCO-style annotation JSON file given a groundtruth generator."""
groundtruths = {}
logging.info('Loading groundtruth annotations from dataset to memory...')
for groundtruth in groundtruth_generator():
for k, v in six.iteritems(groundtruth):
if k not in groundtruths:
groundtruths[k] = [v]
else:
groundtruths[k].append(v)
gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths)
logging.info('Saving groundtruth annotations to the JSON file...')
with tf.io.gfile.GFile(annotation_file, 'w') as f:
f.write(json.dumps(gt_dataset))
logging.info('Done saving the JSON file...')
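# Editor's example: a hypothetical sketch, not part of the original file.
# Any zero-argument callable that yields groundtruth dicts (such as
# COCOGroundtruthGenerator above) can drive `generate_annotation_file`;
# `output_json_path` is a placeholder.
def _example_generate_annotation_file(output_json_path):
  def generator():
    yield {
        'source_id': np.array([1]),
        'height': np.array([640]),
        'width': np.array([480]),
        'num_detections': np.array([1]),
        'boxes': np.array([[[10., 20., 30., 60.]]], np.float32),
        'classes': np.array([[5]]),
    }
  generate_annotation_file(generator, output_json_path)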
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Losses for maskrcn model."""
# Import libraries
import tensorflow as tf
class RpnScoreLoss(object):
"""Region Proposal Network score loss function."""
def __init__(self, rpn_batch_size_per_im):
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, score_outputs, labels):
"""Computes total RPN detection loss.
Computes total RPN detection loss including box and score from all levels.
Args:
score_outputs: an `OrderedDict` with keys representing levels and values
representing scores in [batch_size, height, width, num_anchors].
labels: the dictionary returned from the dataloader that includes
groundtruth targets.
Returns:
rpn_score_loss: a scalar tensor representing total score loss.
"""
with tf.name_scope('rpn_loss'):
levels = sorted(score_outputs.keys())
score_losses = []
for level in levels:
score_losses.append(
self._rpn_score_loss(
score_outputs[level],
labels[level],
normalizer=tf.cast(
tf.shape(score_outputs[level])[0] *
self._rpn_batch_size_per_im,
dtype=tf.float32)))
# Sums per level losses to total loss.
return tf.math.add_n(score_losses)
def _rpn_score_loss(self, score_outputs, score_targets, normalizer=1.0):
"""Computes score loss."""
# score_targets has three values:
# (1) score_targets[i]=1, the anchor is a positive sample.
# (2) score_targets[i]=0, negative.
# (3) score_targets[i]=-1, the anchor is ignored (don't care).
with tf.name_scope('rpn_score_loss'):
mask = tf.math.logical_or(tf.math.equal(score_targets, 1),
tf.math.equal(score_targets, 0))
score_targets = tf.math.maximum(score_targets,
tf.zeros_like(score_targets))
score_targets = tf.expand_dims(score_targets, axis=-1)
score_outputs = tf.expand_dims(score_outputs, axis=-1)
score_loss = self._binary_crossentropy(
score_targets, score_outputs, sample_weight=mask)
score_loss /= normalizer
return score_loss
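# Editor's example: a hypothetical sketch, not part of the original file.
# Score targets use {1: positive, 0: negative, -1: ignore}; ignored anchors
# are masked out of the cross-entropy above. Dict keys are pyramid levels.
def _example_rpn_score_loss():
  loss_fn = RpnScoreLoss(rpn_batch_size_per_im=8)
  score_outputs = {3: tf.zeros([1, 2, 2, 1])}  # Logits, one anchor per cell.
  labels = {3: tf.constant([[[[1.], [0.]], [[-1.], [0.]]]])}
  return loss_fn(score_outputs, labels)  # Scalar score loss.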
class RpnBoxLoss(object):
"""Region Proposal Network box regression loss function."""
def __init__(self, huber_loss_delta: float):
# The delta is typically around the mean value of the regression targets.
# For instance, the regression targets of a 512x512 input with 6 anchors
# on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
self._huber_loss = tf.keras.losses.Huber(
delta=huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, box_outputs, labels):
"""Computes total RPN detection loss.
Computes total RPN detection loss including box and score from all levels.
Args:
box_outputs: an `OrderedDict` with keys representing levels and values
representing box regression outputs in
[batch_size, height, width, num_anchors * 4].
labels: the dictionary returned from the dataloader that includes
groundtruth targets.
Returns:
rpn_box_loss: a scalar tensor representing total box regression loss.
"""
with tf.name_scope('rpn_loss'):
levels = sorted(box_outputs.keys())
box_losses = []
for level in levels:
box_losses.append(self._rpn_box_loss(box_outputs[level], labels[level]))
# Sum per level losses to total loss.
return tf.add_n(box_losses)
def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('rpn_box_loss'):
mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
# The loss is normalized by the sum of non-zero weights and additional
# normalizer provided by the function caller. Using + 0.01 here to avoid
# division by zero.
box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
return box_loss
class FastrcnnClassLoss(object):
"""Fast R-CNN classification loss function."""
def __init__(self):
self._categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, class_outputs, class_targets):
"""Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
This function implements the classification loss of the Fast-RCNN.
The classification loss is softmax on all RoIs.
Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
Args:
class_outputs: a float tensor representing the class prediction for each box
with a shape of [batch_size, num_boxes, num_classes].
class_targets: a float tensor representing the class label for each box
with a shape of [batch_size, num_boxes].
Returns:
a scalar tensor representing total class loss.
"""
with tf.name_scope('fast_rcnn_loss'):
batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
class_targets = tf.cast(class_targets, dtype=tf.int32)
class_targets_one_hot = tf.one_hot(class_targets, num_classes)
return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot,
normalizer=batch_size * num_boxes)
def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
normalizer=1.0):
"""Computes classification loss."""
with tf.name_scope('fast_rcnn_class_loss'):
class_loss = self._categorical_crossentropy(class_targets_one_hot,
class_outputs)
class_loss /= normalizer
return class_loss
class FastrcnnBoxLoss(object):
"""Fast R-CNN box regression loss function."""
def __init__(self, huber_loss_delta: float):
# The delta is typically around the mean value of the regression targets.
# For instance, the regression targets of a 512x512 input with 6 anchors
# on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
self._huber_loss = tf.keras.losses.Huber(
delta=huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, box_outputs, class_targets, box_targets):
"""Computes the box loss (Fast-RCNN branch) of Mask-RCNN.
This function implements the box regression loss of the Fast-RCNN. As the
`box_outputs` produces `num_classes` boxes for each RoI, the reference model
expands `box_targets` to match the shape of `box_outputs` and selects only
the target with which the RoI has the maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
Instead, this function selects the `box_outputs` by the `class_targets` so
that it doesn't expand `box_targets`.
The box loss is smooth L1-loss on only positive samples of RoIs.
Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
Args:
box_outputs: a float tensor representing the box prediction for each box
with a shape of [batch_size, num_boxes, num_classes * 4].
class_targets: a float tensor representing the class label for each box
with a shape of [batch_size, num_boxes].
box_targets: a float tensor representing the box label for each box
with a shape of [batch_size, num_boxes, 4].
Returns:
box_loss: a scalar tensor representing total box regression loss.
"""
with tf.name_scope('fast_rcnn_loss'):
class_targets = tf.cast(class_targets, dtype=tf.int32)
# Selects the box from `box_outputs` based on `class_targets`, with which
# the box has the maximum overlap.
(batch_size, num_rois,
num_class_specific_boxes) = box_outputs.get_shape().as_list()
num_classes = num_class_specific_boxes // 4
box_outputs = tf.reshape(box_outputs,
[batch_size, num_rois, num_classes, 4])
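# Builds flat indices into [batch_size * num_rois * num_classes]: the box
# for element (b, r) and its target class c = class_targets[b, r] lives at
# b * num_rois * num_classes + r * num_classes + c, which the two tiled
# offset terms below add to `class_targets`.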
box_indices = tf.reshape(
class_targets + tf.tile(
tf.expand_dims(
tf.range(batch_size) * num_rois * num_classes, 1),
[1, num_rois]) + tf.tile(
tf.expand_dims(tf.range(num_rois) * num_classes, 0),
[batch_size, 1]), [-1])
box_outputs = tf.matmul(
tf.one_hot(
box_indices,
batch_size * num_rois * num_classes,
dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets)
def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('fast_rcnn_box_loss'):
mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
[1, 1, 4])
mask = tf.cast(mask, dtype=tf.float32)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
# The loss is normalized by the number of ones in the mask and an
# additional normalizer provided by the caller; + 0.01 avoids division
# by zero.
box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
return box_loss
class MaskrcnnLoss(object):
"""Mask R-CNN instance segmentation mask loss function."""
def __init__(self):
self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, mask_outputs, mask_targets, select_class_targets):
"""Computes the mask loss of Mask-RCNN.
This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
produces `num_classes` masks for each RoI, the reference model expands
`mask_targets` to match the shape of `mask_outputs` and selects only the
target with which the RoI has the maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long
Instead, this implementation selects the `mask_outputs` by the `class_targets`
so that it doesn't expand `mask_targets`. Note that the selection logic is
done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
Args:
mask_outputs: a float tensor representing the prediction for each mask,
with a shape of
[batch_size, num_masks, mask_height, mask_width].
mask_targets: a float tensor representing the binary mask of ground truth
labels for each mask with a shape of
[batch_size, num_masks, mask_height, mask_width].
select_class_targets: a tensor with a shape of [batch_size, num_masks],
representing the foreground mask targets.
Returns:
mask_loss: a float tensor representing total mask loss.
"""
with tf.name_scope('mask_rcnn_loss'):
(batch_size, num_masks, mask_height,
mask_width) = mask_outputs.get_shape().as_list()
weights = tf.tile(
tf.reshape(tf.greater(select_class_targets, 0),
[batch_size, num_masks, 1, 1]),
[1, 1, mask_height, mask_width])
weights = tf.cast(weights, dtype=tf.float32)
mask_targets = tf.expand_dims(mask_targets, axis=-1)
mask_outputs = tf.expand_dims(mask_outputs, axis=-1)
mask_loss = self._binary_crossentropy(mask_targets, mask_outputs,
sample_weight=weights)
# The loss is normalized by the number of 1's in weights and
# + 0.01 is used to avoid division by zero.
return mask_loss / (tf.reduce_sum(weights) + 0.01)
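# Editor's example: a hypothetical sketch, not part of the original file.
# Shapes follow the docstring above: one image with two sampled 4x4 masks,
# where the second RoI is background and therefore carries no mask loss.
def _example_maskrcnn_loss():
  loss_fn = MaskrcnnLoss()
  mask_outputs = tf.zeros([1, 2, 4, 4])          # Mask logits.
  mask_targets = tf.ones([1, 2, 4, 4])           # Binary groundtruth masks.
  select_class_targets = tf.constant([[1, 0]])   # Foreground, background.
  return loss_fn(mask_outputs, mask_targets, select_class_targets)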
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Losses used for detection models."""
# Import libraries
import tensorflow as tf
def focal_loss(logits, targets, alpha, gamma):
"""Compute the focal loss between `logits` and the golden `target` values.
Focal loss = -alpha_t * (1 - pt)^gamma * log(pt), where pt is the
probability of the true class and alpha_t is `alpha` for positive examples
and `1 - alpha` for negative examples.
Args:
logits: A float32 tensor of size
[batch, d_1, ..., d_k, n_classes].
targets: A float32 tensor of size
[batch, d_1, ..., d_k, n_classes].
alpha: A float32 scalar multiplying alpha to the loss from positive examples
and (1-alpha) to the loss from negative examples.
gamma: A float32 scalar modulating loss from hard and easy examples.
Returns:
loss: A float32 Tensor of size
[batch, d_1, ..., d_k, n_classes] representing
normalized loss on the prediction map.
"""
with tf.name_scope('focal_loss'):
positive_label_mask = tf.equal(targets, 1.0)
cross_entropy = (
tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits))
probs = tf.sigmoid(logits)
probs_gt = tf.where(positive_label_mask, probs, 1.0 - probs)
# With small gamma, the implementation could produce NaN during back prop.
modulator = tf.pow(1.0 - probs_gt, gamma)
loss = modulator * cross_entropy
weighted_loss = tf.where(positive_label_mask, alpha * loss,
(1.0 - alpha) * loss)
return weighted_loss
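# Editor's example: a hypothetical sketch, not part of the original file.
# With gamma=0 and alpha=0.5 this reduces to (0.5x) plain sigmoid
# cross-entropy; larger gamma down-weights well-classified examples.
def _example_focal_loss():
  logits = tf.constant([[2.0, -2.0]])   # One confident hit, one miss.
  targets = tf.constant([[1.0, 1.0]])   # Both are positives.
  # The miss (low pt) dominates; the confident hit is heavily down-weighted.
  return focal_loss(logits, targets, alpha=0.25, gamma=2.0)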
class FocalLoss(tf.keras.losses.Loss):
"""Implements a Focal loss for classification problems.
Reference:
[Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
"""
def __init__(self,
alpha,
gamma,
num_classes,
reduction=tf.keras.losses.Reduction.AUTO,
name=None):
"""Initializes `FocalLoss`.
Arguments:
alpha: The `alpha` weight factor for binary class imbalance.
gamma: The `gamma` focusing parameter to re-weight loss.
num_classes: Number of foreground classes.
reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
loss. Default value is `AUTO`. `AUTO` indicates that the reduction
option will be determined by the usage context. For almost all cases
this defaults to `SUM_OVER_BATCH_SIZE`. When used with
`tf.distribute.Strategy`, outside of built-in training loops such as
`tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
will raise an error. Please see this custom training [tutorial](
https://www.tensorflow.org/tutorials/distribute/custom_training) for
more details.
name: Optional name for the op. Defaults to 'retinanet_class_loss'.
"""
self._num_classes = num_classes
self._alpha = alpha
self._gamma = gamma
super(FocalLoss, self).__init__(reduction=reduction, name=name)
def call(self, y_true, y_pred):
"""Invokes the `FocalLoss`.
Arguments:
y_true: Ordered Dict with level to [batch, height, width, num_anchors].
For example,
{3: tf.Tensor(shape=[32, 512, 512, 9], dtype=tf.int64),
4: tf.Tensor(shape=[32, 256, 256, 9], dtype=tf.int64)}
y_pred: Ordered Dict with level to [batch, height, width, num_anchors *
num_classes]. For example,
{3: tf.Tensor(shape=[32, 512, 512, 9 * 21], dtype=tf.float32),
4: tf.Tensor(shape=[32, 256, 256, 9 * 21], dtype=tf.float32)}
Returns:
Summed loss float `Tensor`.
"""
flattened_cls_outputs = []
flattened_labels = []
batch_size = None
for level in y_pred.keys():
cls_output = y_pred[level]
label = y_true[level]
if batch_size is None:
batch_size = cls_output.shape[0] or tf.shape(cls_output)[0]
flattened_cls_outputs.append(
tf.reshape(cls_output, [batch_size, -1, self._num_classes]))
flattened_labels.append(tf.reshape(label, [batch_size, -1]))
cls_outputs = tf.concat(flattened_cls_outputs, axis=1)
labels = tf.concat(flattened_labels, axis=1)
cls_targets_one_hot = tf.one_hot(labels, self._num_classes)
return focal_loss(
tf.cast(cls_outputs, dtype=tf.float32),
tf.cast(cls_targets_one_hot, dtype=tf.float32), self._alpha,
self._gamma)
def get_config(self):
config = {
'alpha': self._alpha,
'gamma': self._gamma,
'num_classes': self._num_classes,
}
base_config = super(FocalLoss, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class RetinanetBoxLoss(tf.keras.losses.Loss):
"""RetinaNet box Huber loss."""
def __init__(self,
delta,
reduction=tf.keras.losses.Reduction.AUTO,
name=None):
"""Initializes `RetinanetBoxLoss`.
Arguments:
delta: A float, the point where the Huber loss function changes from a
quadratic to linear.
reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
loss. Default value is `AUTO`. `AUTO` indicates that the reduction
option will be determined by the usage context. For almost all cases
this defaults to `SUM_OVER_BATCH_SIZE`. When used with
`tf.distribute.Strategy`, outside of built-in training loops such as
`tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
will raise an error. Please see this custom training [tutorial](
https://www.tensorflow.org/tutorials/distribute/custom_training) for
more details.
name: Optional name for the op. Defaults to 'retinanet_class_loss'.
"""
self._huber_loss = tf.keras.losses.Huber(
delta=delta, reduction=tf.keras.losses.Reduction.NONE)
self._delta = delta
super(RetinanetBoxLoss, self).__init__(reduction=reduction, name=name)
def call(self, y_true, y_pred):
"""Computes box detection loss.
Computes total detection loss including box and class loss from all levels.
Arguments:
y_true: Ordered Dict with level to [batch, height, width,
num_anchors * 4]. For example,
{3: tf.Tensor(shape=[32, 512, 512, 9 * 4], dtype=tf.float32),
4: tf.Tensor(shape=[32, 256, 256, 9 * 4], dtype=tf.float32)}
y_pred: Ordered Dict with level to [batch, height, width,
num_anchors * 4]. For example,
{3: tf.Tensor(shape=[32, 512, 512, 9 * 4], dtype=tf.float32),
4: tf.Tensor(shape=[32, 256, 256, 9 * 4], dtype=tf.float32)}
Returns:
a float tensor representing the total box regression loss.
"""
# Flattens and concatenates the box outputs and targets from all levels
# before computing the Huber loss.
flattened_box_outputs = []
flattened_labels = []
batch_size = None
for level in y_pred.keys():
box_output = y_pred[level]
label = y_true[level]
if batch_size is None:
batch_size = box_output.shape[0] or tf.shape(box_output)[0]
flattened_box_outputs.append(tf.reshape(box_output, [batch_size, -1, 4]))
flattened_labels.append(tf.reshape(label, [batch_size, -1, 4]))
box_outputs = tf.concat(flattened_box_outputs, axis=1)
labels = tf.concat(flattened_labels, axis=1)
loss = self._huber_loss(labels, box_outputs)
return loss
def get_config(self):
config = {
'delta': self._delta,
}
base_config = super(RetinanetBoxLoss, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
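# Editor's example: a hypothetical sketch, not part of the original file.
# Both losses consume per-level dicts; here a single level 3 with batch=2,
# an 8x8 feature map, 2 anchors per location and 3 classes. An explicit
# reduction is passed because AUTO raises outside built-in training loops.
def _example_retinanet_losses():
  cls_loss_fn = FocalLoss(alpha=0.25, gamma=1.5, num_classes=3,
                          reduction=tf.keras.losses.Reduction.SUM)
  box_loss_fn = RetinanetBoxLoss(delta=0.1,
                                 reduction=tf.keras.losses.Reduction.SUM)
  y_true_cls = {3: tf.zeros([2, 8, 8, 2], tf.int32)}
  y_pred_cls = {3: tf.zeros([2, 8, 8, 2 * 3])}
  y_true_box = {3: tf.zeros([2, 8, 8, 2 * 4])}
  y_pred_box = {3: tf.zeros([2, 8, 8, 2 * 4])}
  return cls_loss_fn(y_true_cls, y_pred_cls), box_loss_fn(y_true_box,
                                                          y_pred_box)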
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Backbones package definition."""
from official.vision.beta.modeling.backbones.efficientnet import EfficientNet
from official.vision.beta.modeling.backbones.resnet import ResNet
from official.vision.beta.modeling.backbones.resnet_3d import ResNet3D
from official.vision.beta.modeling.backbones.revnet import RevNet
from official.vision.beta.modeling.backbones.spinenet import SpineNet
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains definitions of EfficientNet Networks."""
import math
# Import libraries
from absl import logging
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.beta.modeling.layers import nn_blocks
layers = tf.keras.layers
# The fixed EfficientNet-B0 architecture discovered by NAS.
# Each element represents a specification of a building block:
# (block_fn, block_repeats, kernel_size, strides, expand_ratio, in_filters,
# out_filters, is_output)
EN_B0_BLOCK_SPECS = [
('mbconv', 1, 3, 1, 1, 32, 16, False),
('mbconv', 2, 3, 2, 6, 16, 24, True),
('mbconv', 2, 5, 2, 6, 24, 40, True),
('mbconv', 3, 3, 2, 6, 40, 80, False),
('mbconv', 3, 5, 1, 6, 80, 112, True),
('mbconv', 4, 5, 2, 6, 112, 192, False),
('mbconv', 1, 3, 1, 6, 192, 320, True),
]
SCALING_MAP = {
'b0': dict(width_scale=1.0, depth_scale=1.0),
'b1': dict(width_scale=1.0, depth_scale=1.1),
'b2': dict(width_scale=1.1, depth_scale=1.2),
'b3': dict(width_scale=1.2, depth_scale=1.4),
'b4': dict(width_scale=1.4, depth_scale=1.8),
'b5': dict(width_scale=1.6, depth_scale=2.2),
'b6': dict(width_scale=1.8, depth_scale=2.6),
'b7': dict(width_scale=2.0, depth_scale=3.1),
}
def round_filters(filters, multiplier, divisor=8, min_depth=None, skip=False):
"""Round number of filters based on depth multiplier."""
orig_f = filters
if skip or not multiplier:
return filters
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_filters < 0.9 * filters:
new_filters += divisor
logging.info('round_filter input=%s output=%s', orig_f, new_filters)
return int(new_filters)
def round_repeats(repeats, multiplier, skip=False):
"""Round number of filters based on depth multiplier."""
if skip or not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
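# Editor's note (worked example, not part of the original file): with the b4
# scaling (width 1.4, depth 1.8), round_filters(32, 1.4) widens 32 * 1.4 =
# 44.8 to 48 (the nearest multiple of divisor=8 that stays within 10% of the
# scaled value), and round_repeats(3, 1.8) deepens to ceil(5.4) = 6.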
def block_spec_decoder(specs, width_scale, depth_scale):
"""Decode specs for a block."""
decoded_specs = []
for s in specs:
s = s + (
width_scale,
depth_scale,
)
decoded_specs.append(BlockSpec(*s))
return decoded_specs
class BlockSpec(object):
"""A container class that specifies the block configuration for MnasNet."""
def __init__(self, block_fn, block_repeats, kernel_size, strides,
expand_ratio, in_filters, out_filters, is_output, width_scale,
depth_scale):
self.block_fn = block_fn
self.block_repeats = round_repeats(block_repeats, depth_scale)
self.kernel_size = kernel_size
self.strides = strides
self.expand_ratio = expand_ratio
self.in_filters = round_filters(in_filters, width_scale)
self.out_filters = round_filters(out_filters, width_scale)
self.is_output = is_output
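# Editor's example: a hypothetical sketch, not part of the original file.
# Decoding the fixed B0 specs with a scaling map entry yields the widened and
# deepened block list consumed by the model constructor below.
def _example_decode_b4_specs():
  scaling = SCALING_MAP['b4']
  return block_spec_decoder(EN_B0_BLOCK_SPECS, scaling['width_scale'],
                            scaling['depth_scale'])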
@tf.keras.utils.register_keras_serializable(package='Vision')
class EfficientNet(tf.keras.Model):
"""Class to build EfficientNet family model."""
def __init__(self,
model_id,
input_specs=layers.InputSpec(shape=[None, None, None, 3]),
se_ratio=0.0,
stochastic_depth_drop_rate=0.0,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""EfficientNet initialization function.
Args:
model_id: `str` model id of EfficientNet.
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
se_ratio: `float` squeeze and excitation ratio for inverted bottleneck
blocks.
stochastic_depth_drop_rate: `float` drop rate for drop connect layer.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Default to None.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
Default to None.
activation: `str` name of the activation function.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
self._model_id = model_id
self._input_specs = input_specs
self._se_ratio = se_ratio
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._use_sync_bn = use_sync_bn
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = layers.experimental.SyncBatchNormalization
else:
self._norm = layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = -1
else:
bn_axis = 1
# Build EfficientNet.
inputs = tf.keras.Input(shape=input_specs.shape[1:])
width_scale = SCALING_MAP[model_id]['width_scale']
depth_scale = SCALING_MAP[model_id]['depth_scale']
# Build stem.
x = layers.Conv2D(
filters=round_filters(32, width_scale),
kernel_size=3,
strides=2,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
inputs)
x = self._norm(
axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)(
x)
x = tf_utils.get_activation(activation)(x)
# Build intermediate blocks.
endpoints = {}
endpoint_level = 2
decoded_specs = block_spec_decoder(EN_B0_BLOCK_SPECS, width_scale,
depth_scale)
for i, specs in enumerate(decoded_specs):
x = self._block_group(
inputs=x, specs=specs, name='block_group_{}'.format(i))
if specs.is_output:
endpoints[endpoint_level] = x
endpoint_level += 1
# Build output specs for downstream tasks.
    self._output_specs = {l: endpoints[l].get_shape() for l in endpoints.keys()}
# Build the final conv for classification.
x = layers.Conv2D(
filters=round_filters(1280, width_scale),
kernel_size=1,
strides=1,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
x = self._norm(
axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)(
x)
endpoints[endpoint_level] = tf_utils.get_activation(activation)(x)
super(EfficientNet, self).__init__(
inputs=inputs, outputs=endpoints, **kwargs)
def _block_group(self, inputs, specs, name='block_group'):
"""Creates one group of blocks for the EfficientNet model.
Args:
      inputs: `Tensor` of size `[batch, height, width, channels]`.
      specs: specifications for one inverted bottleneck block group.
      name: `str` name for the block group.
Returns:
The output `Tensor` of the block layer.
"""
if specs.block_fn == 'mbconv':
block_fn = nn_blocks.InvertedBottleneckBlock
else:
raise ValueError('Block func {} not supported.'.format(specs.block_fn))
x = block_fn(
in_filters=specs.in_filters,
out_filters=specs.out_filters,
expand_ratio=specs.expand_ratio,
strides=specs.strides,
kernel_size=specs.kernel_size,
se_ratio=self._se_ratio,
stochastic_depth_drop_rate=self._stochastic_depth_drop_rate,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
inputs)
for _ in range(1, specs.block_repeats):
x = block_fn(
in_filters=specs.out_filters, # Set 'in_filters' to 'out_filters'.
out_filters=specs.out_filters,
expand_ratio=specs.expand_ratio,
strides=1, # Fix strides to 1.
kernel_size=specs.kernel_size,
se_ratio=self._se_ratio,
stochastic_depth_drop_rate=self._stochastic_depth_drop_rate,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
x)
return tf.identity(x, name=name)
def get_config(self):
config_dict = {
'model_id': self._model_id,
'se_ratio': self._se_ratio,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
return config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@property
def output_specs(self):
"""A dict of {level: TensorShape} pairs for the model output."""
return self._output_specs
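# Usage sketch (illustrative, not part of the module): build a B0 backbone and
# inspect its multi-scale endpoints.
#
#   backbone = EfficientNet(model_id='b0')
#   endpoints = backbone(tf.keras.Input(shape=(224, 224, 3)))
#   # endpoints maps levels to feature maps; e.g. endpoints[5] is the
#   # stride-32 feature with 320 channels for 'b0' (see the tests below).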
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for EfficientNet."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.modeling.backbones import efficientnet
class EfficientNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(32, 224)
def test_network_creation(self, input_size):
"""Test creation of EfficientNet family models."""
tf.keras.backend.set_image_data_format('channels_last')
network = efficientnet.EfficientNet(model_id='b0')
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
self.assertAllEqual([1, input_size / 2**2, input_size / 2**2, 24],
endpoints[2].shape.as_list())
self.assertAllEqual([1, input_size / 2**3, input_size / 2**3, 40],
endpoints[3].shape.as_list())
self.assertAllEqual([1, input_size / 2**4, input_size / 2**4, 112],
endpoints[4].shape.as_list())
self.assertAllEqual([1, input_size / 2**5, input_size / 2**5, 320],
endpoints[5].shape.as_list())
@parameterized.parameters('b0', 'b3', 'b6')
def test_network_scaling(self, model_id):
"""Test compound scaling."""
efficientnet_params = {
'b0': 4049564,
'b3': 10783528,
'b6': 40960136,
}
tf.keras.backend.set_image_data_format('channels_last')
input_size = 32
network = efficientnet.EfficientNet(model_id=model_id, se_ratio=0.25)
self.assertEqual(network.count_params(), efficientnet_params[model_id])
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
_ = network(inputs)
@parameterized.parameters(1, 3)
def test_input_specs(self, input_dim):
"""Test different input feature dimensions."""
tf.keras.backend.set_image_data_format('channels_last')
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim])
network = efficientnet.EfficientNet(model_id='b0', input_specs=input_specs)
inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1)
_ = network(inputs)
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
model_id='b0',
se_ratio=0.25,
stochastic_depth_drop_rate=None,
use_sync_bn=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
norm_momentum=0.99,
norm_epsilon=0.001,
)
network = efficientnet.EfficientNet(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = efficientnet.EfficientNet.from_config(network.get_config())
    # Validate that the config can be serialized to JSON.
_ = new_network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""factory method."""
# Import libraries
import tensorflow as tf
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling.backbones import spinenet
def build_backbone(input_specs: tf.keras.layers.InputSpec,
model_config,
l2_regularizer: tf.keras.regularizers.Regularizer = None):
"""Builds backbone from a config.
Args:
input_specs: tf.keras.layers.InputSpec.
    model_config: a model config with `backbone` and `norm_activation` fields.
l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
Returns:
tf.keras.Model instance of the backbone.
"""
backbone_type = model_config.backbone.type
backbone_cfg = model_config.backbone.get()
norm_activation_config = model_config.norm_activation
if backbone_type == 'resnet':
backbone = backbones.ResNet(
model_id=backbone_cfg.model_id,
input_specs=input_specs,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif backbone_type == 'efficientnet':
backbone = backbones.EfficientNet(
model_id=backbone_cfg.model_id,
input_specs=input_specs,
stochastic_depth_drop_rate=backbone_cfg.stochastic_depth_drop_rate,
se_ratio=backbone_cfg.se_ratio,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif backbone_type == 'spinenet':
model_id = backbone_cfg.model_id
if model_id not in spinenet.SCALING_MAP:
raise ValueError(
'SpineNet-{} is not a valid architecture.'.format(model_id))
scaling_params = spinenet.SCALING_MAP[model_id]
backbone = backbones.SpineNet(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
endpoints_num_filters=scaling_params['endpoints_num_filters'],
resample_alpha=scaling_params['resample_alpha'],
block_repeats=scaling_params['block_repeats'],
filter_size_scale=scaling_params['filter_size_scale'],
kernel_regularizer=l2_regularizer,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon)
elif backbone_type == 'revnet':
backbone = backbones.RevNet(
model_id=backbone_cfg.model_id,
input_specs=input_specs,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
else:
    raise ValueError('Backbone {!r} not implemented.'.format(backbone_type))
return backbone
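# Usage sketch, mirroring the factory tests further below (config classes
# assumed to come from official.vision.beta.configs):
#
#   model_config = retinanet_cfg.RetinaNet(
#       backbone=backbones_cfg.Backbone(
#           type='efficientnet',
#           efficientnet=backbones_cfg.EfficientNet(model_id='b0')))
#   backbone = build_backbone(
#       input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
#       model_config=model_config)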
def build_backbone_3d(input_specs: tf.keras.layers.InputSpec,
model_config,
l2_regularizer: tf.keras.regularizers.Regularizer = None):
"""Builds 3d backbone from a config.
Args:
input_specs: tf.keras.layers.InputSpec.
    model_config: a model config with `backbone` and `norm_activation` fields.
l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
Returns:
tf.keras.Model instance of the backbone.
"""
backbone_type = model_config.backbone.type
backbone_cfg = model_config.backbone.get()
norm_activation_config = model_config.norm_activation
# Flatten configs before passing to the backbone.
temporal_strides = []
temporal_kernel_sizes = []
use_self_gating = []
for block_spec in backbone_cfg.block_specs:
temporal_strides.append(block_spec.temporal_strides)
temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes)
use_self_gating.append(block_spec.use_self_gating)
if backbone_type == 'resnet_3d':
backbone = backbones.ResNet3D(
model_id=backbone_cfg.model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
use_self_gating=use_self_gating,
input_specs=input_specs,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
else:
    raise ValueError('Backbone {!r} not implemented.'.format(backbone_type))
return backbone
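# Note: the loop above flattens the per-block temporal settings from the
# config into parallel lists (one entry per block group) before handing them
# to ResNet3D; test_resnet_3d_creation further below exercises the same
# flattening directly.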
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for factory functions."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from official.vision.beta.configs import backbones as backbones_cfg
from official.vision.beta.configs import backbones_3d as backbones_3d_cfg
from official.vision.beta.configs import common as common_cfg
from official.vision.beta.configs import retinanet as retinanet_cfg
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling.backbones import factory
class FactoryTest(tf.test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(model_id=[18, 34, 50, 101, 152],))
def test_resnet_creation(self, model_id):
"""Test creation of ResNet models."""
network = backbones.ResNet(
model_id=model_id, norm_momentum=0.99, norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='resnet',
resnet=backbones_cfg.ResNet(model_id=model_id))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(
model_id=['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
se_ratio=[0.0, 0.25],
))
def test_efficientnet_creation(self, model_id, se_ratio):
"""Test creation of EfficientNet models."""
network = backbones.EfficientNet(
model_id=model_id,
se_ratio=se_ratio,
norm_momentum=0.99,
norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='efficientnet',
efficientnet=backbones_cfg.EfficientNet(
model_id=model_id, se_ratio=se_ratio))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(combinations.combine(model_id=['49'],))
def test_spinenet_creation(self, model_id):
"""Test creation of SpineNet models."""
input_size = 128
min_level = 3
max_level = 7
input_specs = tf.keras.layers.InputSpec(
shape=[None, input_size, input_size, 3])
network = backbones.SpineNet(
input_specs=input_specs,
min_level=min_level,
max_level=max_level,
norm_momentum=0.99,
norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='spinenet',
spinenet=backbones_cfg.SpineNet(model_id=model_id))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(
shape=[None, input_size, input_size, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(model_id=[38, 56, 104],))
def test_revnet_creation(self, model_id):
"""Test creation of RevNet models."""
network = backbones.RevNet(
model_id=model_id, norm_momentum=0.99, norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='revnet',
revnet=backbones_cfg.RevNet(model_id=model_id))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(combinations.combine(model_type=['resnet_3d'],))
def test_resnet_3d_creation(self, model_type):
"""Test creation of ResNet 3D models."""
backbone_cfg = backbones_3d_cfg.Backbone3D(type=model_type).get()
temporal_strides = []
temporal_kernel_sizes = []
for block_spec in backbone_cfg.block_specs:
temporal_strides.append(block_spec.temporal_strides)
temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes)
_ = backbones.ResNet3D(
model_id=backbone_cfg.model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
norm_momentum=0.99,
norm_epsilon=1e-5)
if __name__ == '__main__':
tf.test.main()