Unverified Commit 420a7253 authored by pkulzc's avatar pkulzc Committed by GitHub
Browse files

Refactor tests for Object Detection API. (#8688)

Internal changes

--

PiperOrigin-RevId: 316837667
parent d0ef3913
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for generate_embedding_data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import os
import tempfile
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
from object_detection import exporter
from object_detection.builders import model_builder
from object_detection.core import model
from object_detection.dataset_tools.context_rcnn import generate_embedding_data
from object_detection.protos import pipeline_pb2
from object_detection.utils import tf_version
from apache_beam import runners
# `import unittest` does not load the `unittest.mock` submodule, so reading
# the attribute `unittest.mock` only works when some other module has already
# imported it. Import the submodule explicitly on Python 3.
if six.PY2:
  import mock  # pylint: disable=g-import-not-at-top
else:
  from unittest import mock  # pylint: disable=g-import-not-at-top
class FakeModel(model.DetectionModel):
  """A Fake Detection model with expected output nodes from post-processing."""

  def preprocess(self, inputs):
    """Returns `inputs` through `tf.identity` and an empty shape list."""
    true_image_shapes = []  # Doesn't matter for the fake model.
    return tf.identity(inputs), true_image_shapes

  def predict(self, preprocessed_inputs, true_image_shapes):
    """Returns {'image': conv2d(preprocessed_inputs, 3 filters, kernel 1)}."""
    return {'image': tf.layers.conv2d(preprocessed_inputs, 3, 1)}

  def postprocess(self, prediction_dict, true_image_shapes):
    """Returns a fixed dictionary of constant detection tensors.

    The constants are created under a control dependency on every tensor in
    `prediction_dict`.

    Args:
      prediction_dict: Dictionary of tensors produced by `predict`.
      true_image_shapes: Unused.

    Returns:
      A dictionary with constant `detection_boxes`, `detection_scores`,
      `detection_multiclass_scores`, `detection_classes`, `num_detections`
      and `detection_features` tensors.
    """
    # Materialize the values: on Python 3 `dict.values()` is a view, not a
    # list.
    with tf.control_dependencies(list(prediction_dict.values())):
      num_features = 100
      feature_dims = 10
      classifier_feature = np.ones(
          (2, feature_dims, feature_dims, num_features),
          dtype=np.float32).tolist()
      postprocessed_tensors = {
          'detection_boxes': tf.constant([[[0.0, 0.1, 0.5, 0.6],
                                           [0.5, 0.5, 0.8, 0.8]]], tf.float32),
          'detection_scores': tf.constant([[0.95, 0.6]], tf.float32),
          'detection_multiclass_scores': tf.constant([[[0.1, 0.7, 0.2],
                                                       [0.3, 0.1, 0.6]]],
                                                     tf.float32),
          'detection_classes': tf.constant([[0, 1]], tf.float32),
          'num_detections': tf.constant([2], tf.float32),
          'detection_features':
              tf.constant([classifier_feature],
                          tf.float32)
      }
    return postprocessed_tensors

  def restore_map(self, checkpoint_path, fine_tune_checkpoint_type):
    """No-op stub; returns None."""
    pass

  def loss(self, prediction_dict, true_image_shapes):
    """No-op stub; returns None."""
    pass

  def regularization_losses(self):
    """No-op stub; returns None."""
    pass

  def updates(self):
    """No-op stub; returns None."""
    pass
@contextlib.contextmanager
def InMemoryTFRecord(entries):
  """Yields the path of a temporary TFRecord file containing `entries`.

  Despite the name, the records are written to a real temporary file on disk.
  The file is removed when the context exits, whether or not the body raised.

  Args:
    entries: Iterable of values, each written as one record, in order.

  Yields:
    The filename of the temporary TFRecord file.
  """
  temp = tempfile.NamedTemporaryFile(delete=False)
  filename = temp.name
  # Only the reserved name is needed. Close our own handle immediately so the
  # descriptor is not leaked and the file is not held open while the writer
  # re-opens it and while it is unlinked below.
  temp.close()
  try:
    with tf.python_io.TFRecordWriter(filename) as writer:
      for value in entries:
        writer.write(value)
    yield filename
  finally:
    os.unlink(filename)
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class GenerateEmbeddingData(tf.test.TestCase):
  """Tests `GenerateEmbeddingDataFn` and `construct_pipeline` (TF1.X only)."""

  def _save_checkpoint_from_mock_model(self, checkpoint_path):
    """A function to save checkpoint from a fake Detection Model.

    Args:
      checkpoint_path: Path to save checkpoint from Fake model.
    """
    g = tf.Graph()
    with g.as_default():
      mock_model = FakeModel(num_classes=5)
      preprocessed_inputs, true_image_shapes = mock_model.preprocess(
          tf.placeholder(tf.float32, shape=[None, None, None, 3]))
      predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
      mock_model.postprocess(predictions, true_image_shapes)
      tf.train.get_or_create_global_step()
      saver = tf.train.Saver()
      init = tf.global_variables_initializer()
      with self.test_session(graph=g) as sess:
        sess.run(init)
        saver.save(sess, checkpoint_path)

  def _export_saved_model(self):
    """Exports a SavedModel backed by `FakeModel` and returns its path.

    A checkpoint is first saved from a `FakeModel`; `model_builder.build` is
    then patched to return a `FakeModel` so that the exported detection graph
    can be frozen against that checkpoint.

    Returns:
      Path of the written SavedModel directory.
    """
    tmp_dir = self.get_temp_dir()
    checkpoint_path = os.path.join(tmp_dir, 'model.ckpt')
    self._save_checkpoint_from_mock_model(checkpoint_path)
    output_directory = os.path.join(tmp_dir, 'output')
    saved_model_path = os.path.join(output_directory, 'saved_model')
    tf.io.gfile.makedirs(output_directory)
    with mock.patch.object(
        model_builder, 'build', autospec=True) as mock_builder:
      mock_builder.return_value = FakeModel(num_classes=5)
      pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
      pipeline_config.eval_config.use_moving_averages = False
      detection_model = model_builder.build(pipeline_config.model,
                                            is_training=False)
      outputs, placeholder_tensor = exporter.build_detection_graph(
          input_type='tf_example',
          detection_model=detection_model,
          input_shape=None,
          output_collection_name='inference_op',
          graph_hook_fn=None)
      output_node_names = ','.join(outputs.keys())
      saver = tf.train.Saver()
      input_saver_def = saver.as_saver_def()
      frozen_graph_def = exporter.freeze_graph_with_def_protos(
          input_graph_def=tf.get_default_graph().as_graph_def(),
          input_saver_def=input_saver_def,
          input_checkpoint=checkpoint_path,
          output_node_names=output_node_names,
          restore_op_name='save/restore_all',
          filename_tensor_name='save/Const:0',
          output_graph='',
          clear_devices=True,
          initializer_nodes='')
      exporter.write_saved_model(
          saved_model_path=saved_model_path,
          frozen_graph_def=frozen_graph_def,
          inputs=placeholder_tensor,
          outputs=outputs)
    return saved_model_path

  def _create_tf_example(self):
    """Returns a serialized `tf.train.Example` with one annotated 4x4 image."""
    with self.test_session():
      encoded_image = tf.image.encode_jpeg(
          tf.constant(np.ones((4, 4, 3)).astype(np.uint8))).eval()

    def BytesFeature(value):
      return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def Int64Feature(value):
      return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def FloatFeature(value):
      return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'image/encoded': BytesFeature(encoded_image),
                'image/source_id': BytesFeature(b'image_id'),
                'image/height': Int64Feature(400),
                'image/width': Int64Feature(600),
                'image/class/label': Int64Feature(5),
                'image/class/text': BytesFeature(b'hyena'),
                'image/object/bbox/xmin': FloatFeature(0.1),
                'image/object/bbox/xmax': FloatFeature(0.6),
                'image/object/bbox/ymin': FloatFeature(0.0),
                'image/object/bbox/ymax': FloatFeature(0.5),
                'image/object/class/score': FloatFeature(0.95),
                'image/object/class/label': Int64Feature(5),
                'image/object/class/text': BytesFeature(b'hyena'),
                'image/date_captured': BytesFeature(b'2019-10-20 12:12:12')
            }))

    return example.SerializeToString()

  def assert_expected_example(self, example, topk=False, botk=False):
    """Asserts that `example` carries the expected embedding and annotations.

    Args:
      example: A parsed `tf.train.Example` produced by the code under test.
      topk: If True, two embeddings (218 floats in total) are expected.
      botk: If True, two embeddings (218 floats in total) are expected.
        When neither flag is set, one embedding of 109 floats is expected.
    """
    features = example.features.feature

    # Check embeddings
    if topk or botk:
      self.assertEqual(len(features['image/embedding'].float_list.value), 218)
      self.assertAllEqual(
          features['image/embedding_count'].int64_list.value, [2])
    else:
      self.assertEqual(len(features['image/embedding'].float_list.value), 109)
      self.assertAllEqual(
          features['image/embedding_count'].int64_list.value, [1])

    self.assertAllEqual(
        features['image/embedding_length'].int64_list.value, [109])

    # Check annotations
    self.assertAllClose(
        features['image/object/bbox/ymin'].float_list.value, [0.0])
    self.assertAllClose(
        features['image/object/bbox/xmin'].float_list.value, [0.1])
    self.assertAllClose(
        features['image/object/bbox/ymax'].float_list.value, [0.5])
    self.assertAllClose(
        features['image/object/bbox/xmax'].float_list.value, [0.6])
    self.assertAllClose(
        features['image/object/class/score'].float_list.value, [0.95])
    self.assertAllClose(
        features['image/object/class/label'].int64_list.value, [5])
    # `bytes_list` values are `bytes`; compare against bytes literals so the
    # checks hold on Python 3 as well as Python 2.
    self.assertAllEqual(
        features['image/object/class/text'].bytes_list.value, [b'hyena'])
    self.assertAllClose(
        features['image/class/label'].int64_list.value, [5])
    self.assertAllEqual(
        features['image/class/text'].bytes_list.value, [b'hyena'])

    # Check other essential attributes.
    self.assertAllEqual(features['image/height'].int64_list.value, [400])
    self.assertAllEqual(features['image/width'].int64_list.value, [600])
    self.assertAllEqual(
        features['image/source_id'].bytes_list.value, [b'image_id'])
    self.assertTrue(features['image/encoded'].bytes_list.value)

  def _run_inference_fn(self, top_k_embedding_count, bottom_k_embedding_count):
    """Runs `GenerateEmbeddingDataFn` on one example; returns its first output.

    Args:
      top_k_embedding_count: Forwarded to `GenerateEmbeddingDataFn`.
      bottom_k_embedding_count: Forwarded to `GenerateEmbeddingDataFn`.

    Returns:
      The first element of the value returned by `process`.
    """
    saved_model_path = self._export_saved_model()
    inference_fn = generate_embedding_data.GenerateEmbeddingDataFn(
        saved_model_path, top_k_embedding_count, bottom_k_embedding_count)
    inference_fn.start_bundle()
    generated_example = self._create_tf_example()
    # Sanity-check the input before it is handed to the function under test.
    parsed_input = tf.train.Example.FromString(generated_example)
    self.assertAllEqual(
        parsed_input.features.feature['image/object/class/label']
        .int64_list.value, [5])
    self.assertAllEqual(
        parsed_input.features.feature['image/object/class/text']
        .bytes_list.value, [b'hyena'])
    output = inference_fn.process(generated_example)
    return output[0]

  def test_generate_embedding_data_fn(self):
    output_example = self._run_inference_fn(
        top_k_embedding_count=1, bottom_k_embedding_count=0)
    self.assert_expected_example(output_example)

  def test_generate_embedding_data_with_top_k_boxes(self):
    output_example = self._run_inference_fn(
        top_k_embedding_count=2, bottom_k_embedding_count=0)
    self.assert_expected_example(output_example, topk=True)

  def test_generate_embedding_data_with_bottom_k_boxes(self):
    output_example = self._run_inference_fn(
        top_k_embedding_count=0, bottom_k_embedding_count=2)
    self.assert_expected_example(output_example, botk=True)

  def test_beam_pipeline(self):
    with InMemoryTFRecord([self._create_tf_example()]) as input_tfrecord:
      runner = runners.DirectRunner()
      temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
      output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
      saved_model_path = self._export_saved_model()
      top_k_embedding_count = 1
      bottom_k_embedding_count = 0
      num_shards = 1
      pipeline = generate_embedding_data.construct_pipeline(
          input_tfrecord, output_tfrecord, saved_model_path,
          top_k_embedding_count, bottom_k_embedding_count, num_shards)
      runner.run(pipeline)
      filenames = tf.io.gfile.glob(
          output_tfrecord + '-?????-of-?????')
      # Fail with a clear message instead of an IndexError below when the
      # pipeline wrote no output shard.
      self.assertTrue(filenames, 'Pipeline produced no output shards.')
      actual_output = []
      record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
      for record in record_iterator:
        actual_output.append(record)
      self.assertEqual(len(actual_output), 1)
      self.assert_expected_example(tf.train.Example.FromString(
          actual_output[0]))
if __name__ == '__main__':
tf.test.main()
......@@ -24,10 +24,18 @@ import six
import tensorflow.compat.v1 as tf
from object_detection.dataset_tools import seq_example_util
from object_detection.utils import tf_version
class SeqExampleUtilTest(tf.test.TestCase):
def materialize_tensors(self, list_of_tensors):
  """Evaluates every tensor in `list_of_tensors` and returns the values.

  Under TF2 each tensor's `.numpy()` value is returned; otherwise the whole
  list is run in this test case's cached session.

  Args:
    list_of_tensors: List of tensors to evaluate.

  Returns:
    The evaluated values, in the same order as `list_of_tensors`.
  """
  if not tf_version.is_tf2():
    with self.cached_session() as session:
      return session.run(list_of_tensors)
  return [t.numpy() for t in list_of_tensors]
def test_make_unlabeled_example(self):
num_frames = 5
image_height = 100
......@@ -41,8 +49,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
image_source_ids = [str(idx) for idx in range(num_frames)]
images_list = tf.unstack(images, axis=0)
encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
with tf.Session() as sess:
encoded_images = sess.run(encoded_images_list)
encoded_images = self.materialize_tensors(encoded_images_list)
seq_example = seq_example_util.make_sequence_example(
dataset_name=dataset_name,
video_id=video_id,
......@@ -109,8 +116,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
dtype=tf.int32), dtype=tf.uint8)
images_list = tf.unstack(images, axis=0)
encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
with tf.Session() as sess:
encoded_images = sess.run(encoded_images_list)
encoded_images = self.materialize_tensors(encoded_images_list)
timestamps = [100000, 110000]
is_annotated = [1, 0]
bboxes = [
......@@ -208,8 +214,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
dtype=tf.int32), dtype=tf.uint8)
images_list = tf.unstack(images, axis=0)
encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
with tf.Session() as sess:
encoded_images = sess.run(encoded_images_list)
encoded_images = self.materialize_tensors(encoded_images_list)
bboxes = [
np.array([[0., 0., 0.75, 0.75],
[0., 0., 1., 1.]], dtype=np.float32),
......
......@@ -52,6 +52,8 @@ EVAL_METRICS_CLASS_DICT = {
coco_evaluation.CocoKeypointEvaluator,
'coco_mask_metrics':
coco_evaluation.CocoMaskEvaluator,
'coco_panoptic_metrics':
coco_evaluation.CocoPanopticSegmentationEvaluator,
'oid_challenge_detection_metrics':
object_detection_evaluation.OpenImagesDetectionChallengeEvaluator,
'oid_challenge_segmentation_metrics':
......
......@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from absl.testing import parameterized
import numpy as np
......@@ -30,6 +31,7 @@ from object_detection.core import standard_fields as fields
from object_detection.metrics import coco_evaluation
from object_detection.protos import eval_pb2
from object_detection.utils import test_case
from object_detection.utils import tf_version
class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
......@@ -127,6 +129,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
{'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
{'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
)
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_for_coco_detections(self, batch_size=1,
max_gt_boxes=None,
scale_to_absolute=False):
......@@ -155,6 +158,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
{'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
{'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
)
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_for_coco_detections_and_masks(
self, batch_size=1, max_gt_boxes=None, scale_to_absolute=False):
eval_config = eval_pb2.EvalConfig()
......@@ -185,6 +189,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
{'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
{'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
)
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_for_coco_detections_and_resized_masks(
self, batch_size=1, max_gt_boxes=None, scale_to_absolute=False):
eval_config = eval_pb2.EvalConfig()
......@@ -210,6 +215,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
self.assertAlmostEqual(1.0, metrics['DetectionBoxes_Precision/mAP'])
self.assertAlmostEqual(1.0, metrics['DetectionMasks_Precision/mAP'])
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_raises_error_with_unsupported_metric(self):
eval_config = eval_pb2.EvalConfig()
eval_config.metrics_set.extend(['unsupported_metric'])
......@@ -334,6 +340,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
dtype=np.float32)
detection_keypoints = np.array([[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]],
dtype=np.float32)
def graph_fn():
detections = {
detection_fields.detection_boxes:
tf.constant(detection_boxes),
......@@ -374,23 +381,26 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
true_image_shapes=true_image_shapes,
original_image_spatial_shapes=original_image_spatial_shapes,
max_gt_boxes=tf.constant(1))
with self.test_session() as sess:
result = sess.run(result)
return (result[input_data_fields.groundtruth_boxes],
result[input_data_fields.groundtruth_keypoints],
result[detection_fields.detection_boxes],
result[detection_fields.detection_keypoints])
(gt_boxes, gt_keypoints, detection_boxes,
detection_keypoints) = self.execute_cpu(graph_fn, [])
self.assertAllEqual(
[[[0., 0., 200., 200.]], [[0.0, 0.0, 150., 150.]]],
result[input_data_fields.groundtruth_boxes])
gt_boxes)
self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
[[[0., 0.], [150., 150.], [300., 300.]]]],
result[input_data_fields.groundtruth_keypoints])
gt_keypoints)
# Predictions from the model are not scaled.
self.assertAllEqual(
[[[0., 0., 200., 200.]], [[0.0, 0.0, 75., 150.]]],
result[detection_fields.detection_boxes])
detection_boxes)
self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
[[[0., 0.], [75., 150.], [150., 300.]]]],
result[detection_fields.detection_keypoints])
detection_keypoints)
if __name__ == '__main__':
......
......@@ -134,6 +134,30 @@ flags.DEFINE_string('config_override', '',
'text proto to override pipeline_config_path.')
flags.DEFINE_boolean('write_inference_graph', False,
'If true, writes inference graph to disk.')
# Flags for extra output tensors and for models that take side inputs. The
# three `side_input_*` flags are parsed together by
# `exporter.parse_side_inputs` when `use_side_inputs` is set.
flags.DEFINE_string('additional_output_tensor_names', None,
                    'Additional Tensors to output, to be specified as a comma '
                    'separated list of tensor names.')
flags.DEFINE_boolean('use_side_inputs', False,
                     'If True, uses side inputs as well as image inputs.')
flags.DEFINE_string('side_input_shapes', None,
                    'If use_side_inputs is True, this explicitly sets '
                    'the shape of the side input tensors to a fixed size. The '
                    'dimensions are to be provided as a comma-separated list '
                    'of integers. A value of -1 can be used for unknown '
                    'dimensions. A `/` denotes a break, starting the shape of '
                    'the next side input tensor. This flag is required if '
                    'using side inputs.')
# The type names listed here must be ones `exporter.parse_side_inputs`
# recognizes; `int` (not `integer`) is the spelling its lookup table defines.
flags.DEFINE_string('side_input_types', None,
                    'If use_side_inputs is True, this explicitly sets '
                    'the type of the side input tensors. The '
                    'types are to be provided as a comma-separated list '
                    'of types, each of `string`, `int`, or `float`. '
                    'This flag is required if using side inputs.')
flags.DEFINE_string('side_input_names', None,
                    'If use_side_inputs is True, this explicitly sets '
                    'the names of the side input tensors required by the model '
                    'assuming the names will be a comma-separated list of '
                    'strings. This flag is required if using side inputs.')
tf.app.flags.mark_flag_as_required('pipeline_config_path')
tf.app.flags.mark_flag_as_required('trained_checkpoint_prefix')
tf.app.flags.mark_flag_as_required('output_directory')
......@@ -152,10 +176,30 @@ def main(_):
]
else:
input_shape = None
if FLAGS.use_side_inputs:
side_input_shapes, side_input_names, side_input_types = (
exporter.parse_side_inputs(
FLAGS.side_input_shapes,
FLAGS.side_input_names,
FLAGS.side_input_types))
else:
side_input_shapes = None
side_input_names = None
side_input_types = None
if FLAGS.additional_output_tensor_names:
additional_output_tensor_names = list(
FLAGS.additional_output_tensor_names.split(','))
else:
additional_output_tensor_names = None
exporter.export_inference_graph(
FLAGS.input_type, pipeline_config, FLAGS.trained_checkpoint_prefix,
FLAGS.output_directory, input_shape=input_shape,
write_inference_graph=FLAGS.write_inference_graph)
write_inference_graph=FLAGS.write_inference_graph,
additional_output_tensor_names=additional_output_tensor_names,
use_side_inputs=FLAGS.use_side_inputs,
side_input_shapes=side_input_shapes,
side_input_names=side_input_names,
side_input_types=side_input_types)
if __name__ == '__main__':
......
......@@ -24,16 +24,19 @@ import tensorflow.compat.v1 as tf
from tensorflow.core.framework import attr_value_pb2
from tensorflow.core.framework import types_pb2
from tensorflow.core.protobuf import saver_pb2
from tensorflow.tools.graph_transforms import TransformGraph
from object_detection import exporter
from object_detection.builders import graph_rewriter_builder
from object_detection.builders import model_builder
from object_detection.builders import post_processing_builder
from object_detection.core import box_list
from object_detection.utils import tf_version
_DEFAULT_NUM_CHANNELS = 3
_DEFAULT_NUM_COORD_BOX = 4
if tf_version.is_tf1():
from tensorflow.tools.graph_transforms import TransformGraph # pylint: disable=g-import-not-at-top
def get_const_center_size_encoded_anchors(anchors):
"""Exports center-size encoded anchors as a constant tensor.
......
......@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
......@@ -32,6 +33,7 @@ from object_detection.core import model
from object_detection.protos import graph_rewriter_pb2
from object_detection.protos import pipeline_pb2
from object_detection.protos import post_processing_pb2
from object_detection.utils import tf_version
# pylint: disable=g-import-not-at-top
......@@ -82,6 +84,7 @@ class FakeModel(model.DetectionModel):
pass
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ExportTfliteGraphTest(tf.test.TestCase):
def _save_checkpoint_from_mock_model(self,
......
......@@ -39,6 +39,54 @@ except ImportError:
freeze_graph_with_def_protos = freeze_graph.freeze_graph_with_def_protos
def parse_side_inputs(side_input_shapes_string, side_input_names_string,
                      side_input_types_string):
  """Parses side input flags.

  Whitespace around individual list items is ignored.

  Args:
    side_input_shapes_string: The shape of the side input tensors, provided as a
      comma-separated list of integers. A value of -1 is used for unknown
      dimensions. A `/` denotes a break, starting the shape of the next side
      input tensor.
    side_input_names_string: The names of the side input tensors, provided as a
      comma-separated list of strings.
    side_input_types_string: The type of the side input tensors, provided as a
      comma-separated list of types, each of `string`, `int` (or its alias
      `integer`), or `float`.

  Returns:
    side_input_shapes: A list of shapes; each shape is a list whose entries
      are ints, or None where -1 was given.
    side_input_names: A list of strings.
    side_input_types: A list of tensorflow dtypes.

  Raises:
    ValueError: If any of the three arguments is empty or None, if a shape
      dimension is not an integer, or if a type name is not recognized.
  """
  if not side_input_shapes_string:
    raise ValueError('When using side_inputs, side_input_shapes must be '
                     'specified in the input flags.')
  side_input_shapes = []
  for side_input_shape_list in side_input_shapes_string.split('/'):
    side_input_shape = []
    for dim in side_input_shape_list.split(','):
      dim = dim.strip()
      # -1 is the flag-level spelling of an unknown dimension.
      side_input_shape.append(None if dim == '-1' else int(dim))
    side_input_shapes.append(side_input_shape)

  if not side_input_names_string:
    raise ValueError('When using side_inputs, side_input_names must be '
                     'specified in the input flags.')
  side_input_names = [
      name.strip() for name in side_input_names_string.split(',')]

  if not side_input_types_string:
    raise ValueError('When using side_inputs, side_input_types must be '
                     'specified in the input flags.')
  typelookup = {
      'float': tf.float32,
      'int': tf.int32,
      # `integer` is accepted as an alias for `int`.
      'integer': tf.int32,
      'string': tf.string,
  }
  side_input_types = []
  for side_input_type in side_input_types_string.split(','):
    side_input_type = side_input_type.strip()
    if side_input_type not in typelookup:
      # Report bad input the same way as the other flag errors above rather
      # than leaking a bare KeyError from the lookup table.
      raise ValueError(
          'Unknown side input type `{}`; expected one of: {}.'.format(
              side_input_type, ', '.join(sorted(typelookup))))
    side_input_types.append(typelookup[side_input_type])
  return side_input_shapes, side_input_names, side_input_types
def rewrite_nn_resize_op(is_quantized=False):
"""Replaces a custom nearest-neighbor resize op with the Tensorflow version.
......@@ -140,6 +188,14 @@ def _image_tensor_input_placeholder(input_shape=None):
return input_tensor, input_tensor
def _side_input_tensor_placeholder(side_input_shape, side_input_name,
                                   side_input_type):
  """Creates the placeholder for a single side input.

  Args:
    side_input_shape: Shape given to the placeholder.
    side_input_name: Name given to the placeholder.
    side_input_type: Dtype given to the placeholder.

  Returns:
    A `(placeholder, tensor)` pair. Both elements are the same placeholder
    tensor.
  """
  placeholder = tf.placeholder(
      name=side_input_name, shape=side_input_shape, dtype=side_input_type)
  return placeholder, placeholder
def _tf_example_input_placeholder(input_shape=None):
"""Returns input that accepts a batch of strings with tf examples.
......@@ -200,7 +256,7 @@ input_placeholder_fn_map = {
'image_tensor': _image_tensor_input_placeholder,
'encoded_image_string_tensor':
_encoded_image_string_tensor_input_placeholder,
'tf_example': _tf_example_input_placeholder,
'tf_example': _tf_example_input_placeholder
}
......@@ -312,7 +368,7 @@ def write_saved_model(saved_model_path,
Args:
saved_model_path: Path to write SavedModel.
frozen_graph_def: tf.GraphDef holding frozen graph.
inputs: The input placeholder tensor.
inputs: A tensor dictionary containing the inputs to a DetectionModel.
outputs: A tensor dictionary containing the outputs of a DetectionModel.
"""
with tf.Graph().as_default():
......@@ -322,8 +378,13 @@ def write_saved_model(saved_model_path,
builder = tf.saved_model.builder.SavedModelBuilder(saved_model_path)
tensor_info_inputs = {
'inputs': tf.saved_model.utils.build_tensor_info(inputs)}
tensor_info_inputs = {}
if isinstance(inputs, dict):
for k, v in inputs.items():
tensor_info_inputs[k] = tf.saved_model.utils.build_tensor_info(v)
else:
tensor_info_inputs['inputs'] = tf.saved_model.utils.build_tensor_info(
inputs)
tensor_info_outputs = {}
for k, v in outputs.items():
tensor_info_outputs[k] = tf.saved_model.utils.build_tensor_info(v)
......@@ -364,11 +425,11 @@ def write_graph_and_checkpoint(inference_graph_def,
def _get_outputs_from_inputs(input_tensors, detection_model,
output_collection_name):
output_collection_name, **side_inputs):
inputs = tf.cast(input_tensors, dtype=tf.float32)
preprocessed_inputs, true_image_shapes = detection_model.preprocess(inputs)
output_tensors = detection_model.predict(
preprocessed_inputs, true_image_shapes)
preprocessed_inputs, true_image_shapes, **side_inputs)
postprocessed_tensors = detection_model.postprocess(
output_tensors, true_image_shapes)
return add_output_tensor_nodes(postprocessed_tensors,
......@@ -376,32 +437,45 @@ def _get_outputs_from_inputs(input_tensors, detection_model,
def build_detection_graph(input_type, detection_model, input_shape,
                          output_collection_name, graph_hook_fn,
                          use_side_inputs=False, side_input_shapes=None,
                          side_input_names=None, side_input_types=None):
  """Build the detection graph.

  Args:
    input_type: Key of `input_placeholder_fn_map` selecting the function that
      creates the input placeholder.
    detection_model: Model forwarded to `_get_outputs_from_inputs`.
    input_shape: Optional shape forwarded to the placeholder function as
      `input_shape`. May only be given for the `image_tensor`,
      `encoded_image_string_tensor`, `tf_example` and `tf_sequence_example`
      input types.
    output_collection_name: Forwarded to `_get_outputs_from_inputs`.
    graph_hook_fn: Optional callable invoked without arguments once the
      outputs and the global step have been added to the graph.
    use_side_inputs: If True, one placeholder is created per side input and
      passed to the model by name.
    side_input_shapes: List of shapes of the side input tensors,
      required if use_side_inputs is True.
    side_input_names: List of names of the side input tensors,
      required if use_side_inputs is True.
    side_input_types: List of types of the side input tensors,
      required if use_side_inputs is True.

  Returns:
    outputs: The value returned by `_get_outputs_from_inputs`.
    placeholder_tensors: A dictionary mapping 'inputs' to the main input
      placeholder and each side input name to its placeholder.

  Raises:
    ValueError: If `input_type` is unknown, if `input_shape` is given for an
      input type that does not support it, or if `use_side_inputs` is True
      and the side input lists are missing or differ in length.
  """
  if input_type not in input_placeholder_fn_map:
    raise ValueError('Unknown input type: {}'.format(input_type))
  placeholder_args = {}
  side_inputs = {}
  if input_shape is not None:
    if input_type not in ('image_tensor', 'encoded_image_string_tensor',
                          'tf_example', 'tf_sequence_example'):
      raise ValueError('Can only specify input shape for `image_tensor`, '
                       '`encoded_image_string_tensor`, `tf_example`, '
                       'or `tf_sequence_example` inputs.')
    placeholder_args['input_shape'] = input_shape
  placeholder_tensor, input_tensors = input_placeholder_fn_map[input_type](
      **placeholder_args)
  placeholder_tensors = {'inputs': placeholder_tensor}
  if use_side_inputs:
    if (side_input_shapes is None or side_input_names is None or
        side_input_types is None):
      raise ValueError('When use_side_inputs is True, side_input_shapes, '
                       'side_input_names and side_input_types must all be '
                       'specified.')
    if not (len(side_input_shapes) == len(side_input_names) ==
            len(side_input_types)):
      raise ValueError('side_input_shapes, side_input_names and '
                       'side_input_types must have the same length.')
    for side_input_shape, side_input_name, side_input_type in zip(
        side_input_shapes, side_input_names, side_input_types):
      side_input_placeholder, side_input = _side_input_tensor_placeholder(
          side_input_shape, side_input_name, side_input_type)
      side_inputs[side_input_name] = side_input
      placeholder_tensors[side_input_name] = side_input_placeholder
  outputs = _get_outputs_from_inputs(
      input_tensors=input_tensors,
      detection_model=detection_model,
      output_collection_name=output_collection_name,
      **side_inputs)

  # Add global step to the graph.
  slim.get_or_create_global_step()

  if graph_hook_fn:
    graph_hook_fn()

  return outputs, placeholder_tensors
def _export_inference_graph(input_type,
......@@ -414,7 +488,11 @@ def _export_inference_graph(input_type,
output_collection_name='inference_op',
graph_hook_fn=None,
write_inference_graph=False,
temp_checkpoint_prefix=''):
temp_checkpoint_prefix='',
use_side_inputs=False,
side_input_shapes=None,
side_input_names=None,
side_input_types=None):
"""Export helper."""
tf.gfile.MakeDirs(output_directory)
frozen_graph_path = os.path.join(output_directory,
......@@ -422,12 +500,16 @@ def _export_inference_graph(input_type,
saved_model_path = os.path.join(output_directory, 'saved_model')
model_path = os.path.join(output_directory, 'model.ckpt')
outputs, placeholder_tensor = build_detection_graph(
outputs, placeholder_tensor_dict = build_detection_graph(
input_type=input_type,
detection_model=detection_model,
input_shape=input_shape,
output_collection_name=output_collection_name,
graph_hook_fn=graph_hook_fn)
graph_hook_fn=graph_hook_fn,
use_side_inputs=use_side_inputs,
side_input_shapes=side_input_shapes,
side_input_names=side_input_names,
side_input_types=side_input_types)
profile_inference_graph(tf.get_default_graph())
saver_kwargs = {}
......@@ -464,7 +546,8 @@ def _export_inference_graph(input_type,
f.write(str(inference_graph_def))
if additional_output_tensor_names is not None:
output_node_names = ','.join(outputs.keys()+additional_output_tensor_names)
output_node_names = ','.join(list(outputs.keys())+(
additional_output_tensor_names))
else:
output_node_names = ','.join(outputs.keys())
......@@ -480,7 +563,7 @@ def _export_inference_graph(input_type,
initializer_nodes='')
write_saved_model(saved_model_path, frozen_graph_def,
placeholder_tensor, outputs)
placeholder_tensor_dict, outputs)
def export_inference_graph(input_type,
......@@ -490,7 +573,11 @@ def export_inference_graph(input_type,
input_shape=None,
output_collection_name='inference_op',
additional_output_tensor_names=None,
write_inference_graph=False):
write_inference_graph=False,
use_side_inputs=False,
side_input_shapes=None,
side_input_names=None,
side_input_types=None):
"""Exports inference graph for the model specified in the pipeline config.
Args:
......@@ -506,6 +593,13 @@ def export_inference_graph(input_type,
additional_output_tensor_names: list of additional output
tensors to include in the frozen graph.
write_inference_graph: If true, writes inference graph to disk.
use_side_inputs: If True, the model requires side_inputs.
side_input_shapes: List of shapes of the side input tensors,
required if use_side_inputs is True.
side_input_names: List of names of the side input tensors,
required if use_side_inputs is True.
side_input_types: List of types of the side input tensors,
required if use_side_inputs is True.
"""
detection_model = model_builder.build(pipeline_config.model,
is_training=False)
......@@ -524,7 +618,11 @@ def export_inference_graph(input_type,
input_shape,
output_collection_name,
graph_hook_fn=graph_rewriter_fn,
write_inference_graph=write_inference_graph)
write_inference_graph=write_inference_graph,
use_side_inputs=use_side_inputs,
side_input_shapes=side_input_shapes,
side_input_names=side_input_names,
side_input_types=side_input_types)
pipeline_config.eval_config.use_moving_averages = False
config_util.save_pipeline_config(pipeline_config, output_directory)
......
# Lint as: python2, python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test for exporter_lib_v2.py."""
from __future__ import division
import io
import os
import unittest
from absl.testing import parameterized
import numpy as np
from PIL import Image
import six
import tensorflow.compat.v2 as tf
from object_detection import exporter_lib_v2
from object_detection.builders import model_builder
from object_detection.core import model
from object_detection.core import standard_fields as fields
from object_detection.protos import pipeline_pb2
from object_detection.utils import dataset_util
from object_detection.utils import tf_version
if six.PY2:
import mock # pylint: disable=g-importing-member,g-import-not-at-top
else:
from unittest import mock # pylint: disable=g-importing-member,g-import-not-at-top
class FakeModel(model.DetectionModel):
  """Minimal detection model with constant outputs except for the scores.

  The only variable-bearing layer is a single-filter 1x1 convolution whose
  kernel is initialized to a constant. The sum of the convolution output is
  added to the otherwise constant detection scores.
  """

  def __init__(self, conv_weight_scalar=1.0):
    """Builds the fake model.

    Args:
      conv_weight_scalar: Constant used to initialize the kernel of the 1x1
        convolution.
    """
    super(FakeModel, self).__init__(num_classes=2)
    kernel_init = tf.keras.initializers.Constant(value=conv_weight_scalar)
    # The attribute name `_conv` is part of the object graph that
    # tf.train.Checkpoint tracks, so it must not be renamed.
    self._conv = tf.keras.layers.Conv2D(
        filters=1,
        kernel_size=1,
        strides=(1, 1),
        padding='valid',
        kernel_initializer=kernel_init)

  def preprocess(self, inputs):
    """Returns `inputs` unchanged together with an empty shapes list."""
    # The true image shapes are never read by the fake model.
    return tf.identity(inputs), []

  def predict(self, preprocessed_inputs, true_image_shapes):
    """Applies the 1x1 convolution and returns it under the 'image' key."""
    conv_output = self._conv(preprocessed_inputs)
    return {'image': conv_output}

  def postprocess(self, prediction_dict, true_image_shapes):
    """Returns fixed detections whose scores are offset by the prediction sum.

    Args:
      prediction_dict: Dictionary returned by `predict`.
      true_image_shapes: Unused.

    Returns:
      Dictionary with 'detection_boxes', 'detection_scores',
      'detection_classes' and 'num_detections' float32 tensors.
    """
    score_offset = tf.reduce_sum(prediction_dict['image'])
    dependencies = list(prediction_dict.values())
    with tf.control_dependencies(dependencies):
      boxes = tf.constant([[[0.0, 0.0, 0.5, 0.5],
                            [0.5, 0.5, 0.8, 0.8]],
                           [[0.5, 0.5, 1.0, 1.0],
                            [0.0, 0.0, 0.0, 0.0]]], tf.float32)
      scores = score_offset + tf.constant(
          [[0.7, 0.6], [0.9, 0.0]], tf.float32)
      classes = tf.constant([[0, 1],
                             [1, 0]], tf.float32)
      num_detections = tf.constant([2, 1], tf.float32)
    return {
        'detection_boxes': boxes,
        'detection_scores': scores,
        'detection_classes': classes,
        'num_detections': num_detections,
    }

  def restore_map(self, checkpoint_path, fine_tune_checkpoint_type):
    """No-op stub for the DetectionModel interface."""
    pass

  def loss(self, prediction_dict, true_image_shapes):
    """No-op stub for the DetectionModel interface."""
    pass

  def regularization_losses(self):
    """No-op stub for the DetectionModel interface."""
    pass

  def updates(self):
    """No-op stub for the DetectionModel interface."""
    pass
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class ExportInferenceGraphTest(tf.test.TestCase, parameterized.TestCase):
  """Tests for exporter_lib_v2.export_inference_graph using a FakeModel."""

  def _save_checkpoint_from_mock_model(
      self, checkpoint_dir, conv_weight_scalar=6.0):
    """Saves a FakeModel checkpoint (number 0) into `checkpoint_dir`.

    Args:
      checkpoint_dir: Directory the checkpoint is written to.
      conv_weight_scalar: Constant kernel value of the FakeModel convolution.
    """
    mock_model = FakeModel(conv_weight_scalar)
    fake_image = tf.zeros(shape=[1, 10, 10, 3], dtype=tf.float32)
    # Run one forward pass before saving so the convolution layer has been
    # called (and has built its kernel) by the time the checkpoint is written.
    preprocessed_inputs, true_image_shapes = mock_model.preprocess(fake_image)
    predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
    mock_model.postprocess(predictions, true_image_shapes)

    ckpt = tf.train.Checkpoint(model=mock_model)
    exported_checkpoint_manager = tf.train.CheckpointManager(
        ckpt, checkpoint_dir, max_to_keep=1)
    exported_checkpoint_manager.save(checkpoint_number=0)

  def _export_fake_model(self, checkpoint_dir, input_type):
    """Exports a FakeModel restored from `checkpoint_dir`.

    `model_builder.build` is patched so that the exporter receives a fresh
    FakeModel instead of building a model from the (empty) pipeline config.

    Args:
      checkpoint_dir: Directory holding the trained checkpoint.
      input_type: Input type passed to the exporter.

    Returns:
      Path of the directory the export was written to.
    """
    output_directory = os.path.join(checkpoint_dir, 'output')
    with mock.patch.object(
        model_builder, 'build', autospec=True) as mock_builder:
      mock_builder.return_value = FakeModel()
      exporter_lib_v2.export_inference_graph(
          input_type=input_type,
          pipeline_config=pipeline_pb2.TrainEvalPipelineConfig(),
          trained_checkpoint_dir=checkpoint_dir,
          output_directory=output_directory)
    return output_directory

  @parameterized.parameters(
      {'input_type': 'image_tensor'},
      {'input_type': 'encoded_image_string_tensor'},
      {'input_type': 'tf_example'},
  )
  def test_export_yields_correct_directory_structure(
      self, input_type='image_tensor'):
    tmp_dir = self.get_temp_dir()
    self._save_checkpoint_from_mock_model(tmp_dir)
    output_directory = self._export_fake_model(tmp_dir, input_type)

    expected_files = (
        ('saved_model', 'saved_model.pb'),
        ('saved_model', 'variables', 'variables.index'),
        ('saved_model', 'variables', 'variables.data-00000-of-00001'),
        ('checkpoint', 'ckpt-0.index'),
        ('checkpoint', 'ckpt-0.data-00000-of-00001'),
        ('pipeline.config',),
    )
    for path_parts in expected_files:
      path = os.path.join(output_directory, *path_parts)
      self.assertTrue(os.path.exists(path), msg='Missing ' + path)

  def get_dummy_input(self, input_type):
    """Get dummy input for the given input type.

    Args:
      input_type: One of 'image_tensor', 'float_image_tensor',
        'encoded_image_string_tensor' or 'tf_example'.

    Returns:
      An all-zero 20x20 RGB input in the representation matching `input_type`.

    Raises:
      ValueError: If `input_type` is not one of the supported values.
    """
    if input_type == 'image_tensor':
      return np.zeros(shape=(1, 20, 20, 3), dtype=np.uint8)
    elif input_type == 'float_image_tensor':
      return np.zeros(shape=(1, 20, 20, 3), dtype=np.float32)
    elif input_type == 'encoded_image_string_tensor':
      image = Image.new('RGB', (20, 20))
      byte_io = io.BytesIO()
      image.save(byte_io, 'PNG')
      return [byte_io.getvalue()]
    elif input_type == 'tf_example':
      image_tensor = tf.zeros((20, 20, 3), dtype=tf.uint8)
      encoded_jpeg = tf.image.encode_jpeg(image_tensor).numpy()
      example = tf.train.Example(
          features=tf.train.Features(
              feature={
                  'image/encoded':
                      dataset_util.bytes_feature(encoded_jpeg),
                  'image/format':
                      dataset_util.bytes_feature(six.b('jpeg')),
                  'image/source_id':
                      dataset_util.bytes_feature(six.b('image_id')),
              })).SerializeToString()
      return [example]
    # Fail loudly instead of handing `None` to the exported function.
    raise ValueError('Unsupported input_type: {!r}'.format(input_type))

  @parameterized.parameters(
      {'input_type': 'image_tensor'},
      {'input_type': 'encoded_image_string_tensor'},
      {'input_type': 'tf_example'},
      {'input_type': 'float_image_tensor'},
  )
  def test_export_saved_model_and_run_inference(
      self, input_type='image_tensor'):
    tmp_dir = self.get_temp_dir()
    self._save_checkpoint_from_mock_model(tmp_dir)
    output_directory = self._export_fake_model(tmp_dir, input_type)

    saved_model_path = os.path.join(output_directory, 'saved_model')
    detect_fn = tf.saved_model.load(saved_model_path)
    image = self.get_dummy_input(input_type)
    detections = detect_fn(image)

    detection_fields = fields.DetectionResultFields
    self.assertAllClose(detections[detection_fields.detection_boxes],
                        [[[0.0, 0.0, 0.5, 0.5],
                          [0.5, 0.5, 0.8, 0.8]],
                         [[0.5, 0.5, 1.0, 1.0],
                          [0.0, 0.0, 0.0, 0.0]]])
    # The dummy inputs are all zeros, so the FakeModel adds nothing to its
    # constant scores.
    self.assertAllClose(detections[detection_fields.detection_scores],
                        [[0.7, 0.6], [0.9, 0.0]])
    # The exporter shifts the FakeModel's class ids by a label offset of 1.
    self.assertAllClose(detections[detection_fields.detection_classes],
                        [[1, 2], [2, 1]])
    self.assertAllClose(detections[detection_fields.num_detections], [2, 1])

  def test_export_checkpoint_and_run_inference_with_image(self):
    tmp_dir = self.get_temp_dir()
    self._save_checkpoint_from_mock_model(tmp_dir, conv_weight_scalar=2.0)
    output_directory = self._export_fake_model(tmp_dir, 'image_tensor')

    # Restore the exported checkpoint into a freshly constructed model.
    mock_model = FakeModel()
    ckpt = tf.train.Checkpoint(model=mock_model)
    checkpoint_dir = os.path.join(output_directory, 'checkpoint')
    manager = tf.train.CheckpointManager(
        ckpt, checkpoint_dir, max_to_keep=7)
    ckpt.restore(manager.latest_checkpoint).expect_partial()

    fake_image = tf.ones(shape=[1, 5, 5, 3], dtype=tf.float32)
    preprocessed_inputs, true_image_shapes = mock_model.preprocess(fake_image)
    predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
    detections = mock_model.postprocess(predictions, true_image_shapes)

    # 150 = conv_weight_scalar * height * width * channels = 2 * 5 * 5 * 3.
    self.assertAllClose(detections['detection_scores'],
                        [[150 + 0.7, 150 + 0.6], [150 + 0.9, 150 + 0.0]])
if __name__ == '__main__':
  # `tf` is tensorflow.compat.v2 in this file; switch on V2 behavior before
  # the test runner starts.
  tf.enable_v2_behavior()
  tf.test.main()
# Lint as: python2, python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to export object detection inference graph."""
import os
import tensorflow.compat.v2 as tf
from object_detection.builders import model_builder
from object_detection.core import standard_fields as fields
from object_detection.data_decoders import tf_example_decoder
from object_detection.utils import config_util
def _decode_image(encoded_image_string_tensor):
  """Decodes an encoded image string into a 3-channel image tensor.

  Args:
    encoded_image_string_tensor: String tensor holding the encoded image.

  Returns:
    The decoded image tensor with its static shape set to (None, None, 3).
  """
  decoded = tf.image.decode_image(encoded_image_string_tensor, channels=3)
  # Pin the static shape to height x width x 3 channels.
  decoded.set_shape((None, None, 3))
  return decoded
def _decode_tf_example(tf_example_string_tensor):
  """Decodes a serialized tf.Example and returns its image tensor.

  Args:
    tf_example_string_tensor: String tensor holding a serialized tf.Example.

  Returns:
    The tensor stored under `fields.InputDataFields.image` in the decoded
    example.
  """
  decoder = tf_example_decoder.TfExampleDecoder()
  decoded_fields = decoder.decode(tf_example_string_tensor)
  return decoded_fields[fields.InputDataFields.image]
class DetectionInferenceModule(tf.Module):
  """Base module that wraps a detection model for exported inference."""

  def __init__(self, detection_model):
    """Initializes a module for detection.

    Args:
      detection_model: The detection model to use for inference.
    """
    self._model = detection_model

  def _run_inference_on_images(self, image):
    """Runs preprocess, predict and postprocess on an image batch.

    Args:
      image: Image tensor; the modules in this file pass batches of shape
        [1, None, None, 3]. It is cast to float32 before preprocessing.

    Returns:
      Tensor dictionary holding detections. Every value is cast to float32 and
      the detection classes are shifted by a label offset of 1.
    """
    label_id_offset = 1

    float_image = tf.cast(image, tf.float32)
    preprocessed, shapes = self._model.preprocess(float_image)
    prediction_dict = self._model.predict(preprocessed, shapes)
    detections = self._model.postprocess(prediction_dict, shapes)

    classes_key = fields.DetectionResultFields.detection_classes
    shifted_classes = (
        tf.cast(detections[classes_key], tf.float32) + label_id_offset)
    detections[classes_key] = shifted_classes

    # Update the dictionary in place; the key set does not change.
    for name in list(detections):
      detections[name] = tf.cast(detections[name], tf.float32)

    return detections
class DetectionFromImageModule(DetectionInferenceModule):
  """Detection module whose input is a uint8 image tensor."""

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[1, None, None, 3], dtype=tf.uint8)])
  def __call__(self, input_tensor):
    """Runs detection on a uint8 tensor of shape [1, None, None, 3]."""
    detections = self._run_inference_on_images(input_tensor)
    return detections
class DetectionFromFloatImageModule(DetectionInferenceModule):
  """Detection module whose input is a float32 image tensor."""

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[1, None, None, 3], dtype=tf.float32)])
  def __call__(self, input_tensor):
    """Runs detection on a float32 tensor of shape [1, None, None, 3]."""
    detections = self._run_inference_on_images(input_tensor)
    return detections
class DetectionFromEncodedImageModule(DetectionInferenceModule):
  """Detection module whose input is an encoded image string."""

  @tf.function(input_signature=[tf.TensorSpec(shape=[1], dtype=tf.string)])
  def __call__(self, input_tensor):
    """Decodes the encoded image string(s) and runs detection on the result.

    Args:
      input_tensor: String tensor of shape [1] holding an encoded image.

    Returns:
      Tensor dictionary holding detections.
    """
    # Decoding is placed on the CPU.
    with tf.device('cpu:0'):
      decoded_images = tf.map_fn(
          _decode_image,
          elems=input_tensor,
          dtype=tf.uint8,
          parallel_iterations=32,
          back_prop=False)
    return self._run_inference_on_images(decoded_images)
class DetectionFromTFExampleModule(DetectionInferenceModule):
  """Detection module whose input is a serialized tf.Example."""

  @tf.function(input_signature=[tf.TensorSpec(shape=[1], dtype=tf.string)])
  def __call__(self, input_tensor):
    """Decodes the serialized tf.Example(s) and runs detection on the image.

    Args:
      input_tensor: String tensor of shape [1] holding a serialized
        tf.Example.

    Returns:
      Tensor dictionary holding detections.
    """
    # Example parsing and image decoding are placed on the CPU.
    with tf.device('cpu:0'):
      decoded_images = tf.map_fn(
          _decode_tf_example,
          elems=input_tensor,
          dtype=tf.uint8,
          parallel_iterations=32,
          back_prop=False)
    return self._run_inference_on_images(decoded_images)
# Maps each supported `input_type` string to the module class that accepts
# that kind of input.
DETECTION_MODULE_MAP = {
    'image_tensor': DetectionFromImageModule,
    'encoded_image_string_tensor': DetectionFromEncodedImageModule,
    'tf_example': DetectionFromTFExampleModule,
    'float_image_tensor': DetectionFromFloatImageModule,
}
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_dir,
                           output_directory):
  """Exports inference graph for the model specified in the pipeline config.

  This function creates `output_directory` if it does not already exist,
  which will hold a copy of the pipeline config with filename `pipeline.config`,
  and two subdirectories named `checkpoint` and `saved_model`
  (containing the exported checkpoint and SavedModel respectively).

  Args:
    input_type: Type of input for the graph. Can be one of ['image_tensor',
      'encoded_image_string_tensor', 'tf_example', 'float_image_tensor'].
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_dir: Path to the directory holding the trained
      checkpoint; its latest checkpoint is restored.
    output_directory: Path to write outputs.

  Raises:
    ValueError: if input_type is invalid.
  """
  # Reject an unknown input type before building the model or touching the
  # checkpoint, so a typo does not cost a full model build.
  if input_type not in DETECTION_MODULE_MAP:
    raise ValueError(
        'Unrecognized `input_type`: {!r}. Expected one of {}.'.format(
            input_type, sorted(DETECTION_MODULE_MAP)))

  output_checkpoint_directory = os.path.join(output_directory, 'checkpoint')
  output_saved_model_directory = os.path.join(output_directory, 'saved_model')

  detection_model = model_builder.build(pipeline_config.model,
                                        is_training=False)

  ckpt = tf.train.Checkpoint(
      model=detection_model)
  manager = tf.train.CheckpointManager(
      ckpt, trained_checkpoint_dir, max_to_keep=1)
  status = ckpt.restore(manager.latest_checkpoint).expect_partial()

  detection_module = DETECTION_MODULE_MAP[input_type](detection_model)
  # Getting the concrete function traces the graph and forces variables to
  # be constructed --- only after this can we save the checkpoint and
  # saved model.
  concrete_function = detection_module.__call__.get_concrete_function()
  status.assert_existing_objects_matched()

  exported_checkpoint_manager = tf.train.CheckpointManager(
      ckpt, output_checkpoint_directory, max_to_keep=1)
  exported_checkpoint_manager.save(checkpoint_number=0)

  tf.saved_model.save(detection_module,
                      output_saved_model_directory,
                      signatures=concrete_function)

  config_util.save_pipeline_config(pipeline_config, output_directory)
# Lint as: python2, python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Tool to export an object detection model for inference.
Prepares an object detection tensorflow graph for inference using model
configuration and a trained checkpoint. Outputs associated checkpoint files,
a SavedModel, and a copy of the model config.
The inference graph contains one of four input nodes depending on the
user-specified option.
* `image_tensor`: Accepts a uint8 4-D tensor of shape [1, None, None, 3]
* `float_image_tensor`: Accepts a float32 4-D tensor of shape
[1, None, None, 3]
* `encoded_image_string_tensor`: Accepts a 1-D string tensor of shape [None]
containing encoded PNG or JPEG images. Image resolutions are expected to be
the same if more than 1 image is provided.
* `tf_example`: Accepts a 1-D string tensor of shape [None] containing
serialized TFExample protos. Image resolutions are expected to be the same
if more than 1 image is provided.
and the following output nodes returned by the model.postprocess(..):
* `num_detections`: Outputs float32 tensors of the form [batch]
that specifies the number of valid boxes per image in the batch.
* `detection_boxes`: Outputs float32 tensors of the form
[batch, num_boxes, 4] containing detected boxes.
* `detection_scores`: Outputs float32 tensors of the form
[batch, num_boxes] containing class scores for the detections.
* `detection_classes`: Outputs float32 tensors of the form
[batch, num_boxes] containing classes for the detections.
Example Usage:
--------------
python exporter_main_v2.py \
--input_type image_tensor \
--pipeline_config_path path/to/ssd_inception_v2.config \
--trained_checkpoint_dir path/to/checkpoint \
--output_directory path/to/exported_model_directory
The expected output would be in the directory
path/to/exported_model_directory (which is created if it does not exist)
holding two subdirectories (corresponding to checkpoint and SavedModel,
respectively) and a copy of the pipeline config.
Config overrides (see the `config_override` flag) are text protobufs
(also of type pipeline_pb2.TrainEvalPipelineConfig) which are used to override
certain fields in the provided pipeline_config_path. These are useful for
making small changes to the inference graph that differ from the training or
eval config.
Example Usage (in which we change the second stage post-processing score
threshold to be 0.5):
python exporter_main_v2.py \
--input_type image_tensor \
--pipeline_config_path path/to/ssd_inception_v2.config \
--trained_checkpoint_dir path/to/checkpoint \
--output_directory path/to/exported_model_directory \
--config_override " \
model{ \
faster_rcnn { \
second_stage_post_processing { \
batch_non_max_suppression { \
score_threshold: 0.5 \
} \
} \
} \
}"
"""
from absl import app
from absl import flags
import tensorflow.compat.v2 as tf
from google.protobuf import text_format
from object_detection import exporter_lib_v2
from object_detection.protos import pipeline_pb2
# `tf` is tensorflow.compat.v2 in this file; switch on V2 behavior at import.
tf.enable_v2_behavior()

FLAGS = flags.FLAGS

flags.DEFINE_string(
    name='input_type',
    default='image_tensor',
    help='Type of input node. Can be '
    'one of [`image_tensor`, `encoded_image_string_tensor`, '
    '`tf_example`, `float_image_tensor`]')
flags.DEFINE_string(
    name='pipeline_config_path',
    default=None,
    help='Path to a pipeline_pb2.TrainEvalPipelineConfig config '
    'file.')
flags.DEFINE_string(
    name='trained_checkpoint_dir',
    default=None,
    help='Path to trained checkpoint directory')
flags.DEFINE_string(
    name='output_directory',
    default=None,
    help='Path to write outputs.')
flags.DEFINE_string(
    name='config_override',
    default='',
    help='pipeline_pb2.TrainEvalPipelineConfig '
    'text proto to override pipeline_config_path.')

# These three flags have no usable default and must be given on the command
# line.
flags.mark_flags_as_required([
    'pipeline_config_path',
    'trained_checkpoint_dir',
    'output_directory',
])
def main(_):
  """Reads the pipeline config, applies overrides and exports the model.

  Args:
    _: Unused positional argument supplied by `app.run`.
  """
  with tf.io.gfile.GFile(FLAGS.pipeline_config_path, 'r') as config_file:
    config_text = config_file.read()

  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  text_format.Merge(config_text, pipeline_config)
  # The override text proto is merged after the config read from file.
  text_format.Merge(FLAGS.config_override, pipeline_config)

  exporter_lib_v2.export_inference_graph(
      input_type=FLAGS.input_type,
      pipeline_config=pipeline_config,
      trained_checkpoint_dir=FLAGS.trained_checkpoint_dir,
      output_directory=FLAGS.output_directory)


if __name__ == '__main__':
  app.run(main)
......@@ -19,6 +19,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
......@@ -33,12 +34,13 @@ from object_detection.core import model
from object_detection.protos import graph_rewriter_pb2
from object_detection.protos import pipeline_pb2
from object_detection.utils import ops
from object_detection.utils import tf_version
from object_detection.utils import variables_helper
if six.PY2:
import mock # pylint: disable=g-import-not-at-top
else:
from unittest import mock # pylint: disable=g-import-not-at-top
mock = unittest.mock # pylint: disable=g-import-not-at-top, g-importing-member
# pylint: disable=g-import-not-at-top
try:
......@@ -113,6 +115,7 @@ class FakeModel(model.DetectionModel):
pass
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ExportInferenceGraphTest(tf.test.TestCase):
def _save_checkpoint_from_mock_model(self,
......
# Context R-CNN
Context R-CNN is an object detection model that uses contextual features to
improve object detection. See https://arxiv.org/abs/1912.03538 for more details.
## Table of Contents
* [Preparing Context Data for Context R-CNN](#preparing-context-data-for-context-r-cnn)
+ [Generating TfRecords from a set of images and a COCO-CameraTraps style
JSON](#generating-tfrecords-from-a-set-of-images-and-a-coco-cameratraps-style-json)
+ [Generating weakly-supervised bounding box labels for image-labeled data](#generating-weakly-supervised-bounding-box-labels-for-image-labeled-data)
+ [Generating and saving contextual features for each image](#generating-and-saving-contextual-features-for-each-image)
+ [Building up contextual memory banks and storing them for each context
group](#building-up-contextual-memory-banks-and-storing-them-for-each-context-group)
- [Training a Context R-CNN Model](#training-a-context-r-cnn-model)
- [Exporting a Context R-CNN Model](#exporting-a-context-r-cnn-model)
## Preparing Context Data for Context R-CNN
In this section, we will walk through the process of generating TfRecords with
contextual features. We focus on building context from object-centric features
generated with a pre-trained Faster R-CNN model, but you can adapt the provided
code to use alternative feature extractors.
### Generating TfRecords from a set of images and a COCO-CameraTraps style JSON
If your data is already stored in TfRecords, you can skip this first step.
We assume a COCO-CameraTraps json format, as described on
[LILA.science](https://github.com/microsoft/CameraTraps/blob/master/data_management/README.md).
COCO-CameraTraps is a format that adds static-camera-specific fields, such as a
location ID and datetime, to the well-established COCO format. To generate
appropriate context later on, be sure you have specified each contextual group
with a different location ID, which in the static camera case would be the ID of
the camera, as well as the datetime each photo was taken. We assume that empty
images will be labeled 'empty' with class id 0.
To generate TfRecords from your database and local image folder, run
```
python object_detection/dataset_tools/context_rcnn/create_cococameratraps_tfexample_main.py \
--alsologtostderr \
--output_tfrecord_prefix="/path/to/output/tfrecord/location/prefix" \
--image_directory="/path/to/image/folder/" \
--input_annotations_file="path/to/annotations.json"
```
### Generating weakly-supervised bounding box labels for image-labeled data
If all your data already has bounding box labels, you can skip this step.
Many camera trap datasets do not have bounding box labels, or only have bounding
box labels for some of the data. We have provided code to add bounding boxes
from a pretrained model (such as the
[Microsoft AI for Earth MegaDetector](https://github.com/microsoft/CameraTraps/blob/master/megadetector.md))
and match the boxes to the image-level class label.
To export your pretrained detection model, run
```
python object_detection/export_inference_graph.py \
--alsologtostderr \
--input_type tf_example \
--pipeline_config_path path/to/faster_rcnn_model.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory
```
To add bounding boxes to your dataset using the above model, run
```
python object_detection/dataset_tools/context_rcnn/generate_detection_data.py \
--alsologtostderr \
--input_tfrecord path/to/input_tfrecord@X \
--output_tfrecord path/to/output_tfrecord@X \
--model_dir path/to/exported_model_directory/saved_model
```
If an image already has bounding box labels, those labels are left unchanged. If
an image is labeled 'empty' (class ID 0), we will not generate boxes for that
image.
### Generating and saving contextual features for each image
We next extract and store features for each image from a pretrained model. This
model can be the same model as above, or be a class-specific detection model
trained on data from your classes of interest.
To export your pretrained detection model, run
```
python object_detection/export_inference_graph.py \
--alsologtostderr \
--input_type tf_example \
--pipeline_config_path path/to/pipeline.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory \
--additional_output_tensor_names detection_features
```
To generate and save contextual features for your data, run
```
python object_detection/dataset_tools/context_rcnn/generate_embedding_data.py \
--alsologtostderr \
--embedding_input_tfrecord path/to/input_tfrecords* \
--embedding_output_tfrecord path/to/output_tfrecords \
--embedding_model_dir path/to/exported_model_directory/saved_model
```
### Building up contextual memory banks and storing them for each context group
To build the context features into memory banks, run
```
python object_detection/dataset_tools/context_rcnn/add_context_to_examples.py \
--input_tfrecord path/to/input_tfrecords* \
--output_tfrecord path/to/output_tfrecords \
--sequence_key image/location \
--time_horizon month
```
For all options, see add_context_to_examples.py. By default, this code builds
TfSequenceExamples, which are more data efficient (this allows you to store the
context features once for each context group, as opposed to once per image). If
you would like to export TfExamples instead, set flag `--output_type
tf_example`.
If you use TfSequenceExamples, you must be sure to set `input_type:
TF_SEQUENCE_EXAMPLE` within your Context R-CNN configs for both
train_input_reader and test_input_reader. See
`object_detection/test_data/context_rcnn_camera_trap.config`
for an example.
## Training a Context R-CNN Model
To train a Context R-CNN model, you must first set up your config file. See
`object_detection/test_data/context_rcnn_camera_trap.config` for an example.
The important difference between this config and a Faster R-CNN config is the
inclusion of a `context_config` within the model, which defines the necessary
Context R-CNN parameters.
```
context_config {
max_num_context_features: 2000
context_feature_length: 2057
}
```
Once your config file has been updated with your local paths, you can follow
along with documentation for running [locally](running_locally.md), or
[on the cloud](running_on_cloud.md).
## Exporting a Context R-CNN Model
Since Context R-CNN takes context features as well as images as input, we have
to explicitly define the other inputs ("side_inputs") to the model when
exporting, as below. This example is shown with default context feature shapes.
```
python export_inference_graph.py \
--input_type image_tensor \
--input_shape 1,-1,-1,3 \
--pipeline_config_path /path/to/context_rcnn_model/pipeline.config \
--trained_checkpoint_prefix /path/to/context_rcnn_model/model.ckpt \
--output_directory /path/to/output_directory \
--use_side_inputs True \
--side_input_shapes 1,2000,2057/1 \
--side_input_names context_features,valid_context_size \
--side_input_types float,int
```
# Tensorflow detection model zoo
We provide a collection of detection models pre-trained on the
[COCO dataset](http://cocodataset.org), the
[Kitti dataset](http://www.cvlibs.net/datasets/kitti/), the
[Open Images dataset](https://storage.googleapis.com/openimages/web/index.html),
the [AVA v2.1 dataset](https://research.google.com/ava/), the
[iNaturalist Species Detection Dataset](https://github.com/visipedia/inat_comp/blob/master/2017/README.md#bounding-boxes)
and the
[Snapshot Serengeti Dataset](http://lila.science/datasets/snapshot-serengeti).
These models can be useful for out-of-the-box inference if you are interested in
categories already in those datasets. They are also useful for initializing your
models when training on novel datasets.
......@@ -15,17 +17,17 @@ In the table below, we list each such pre-trained model including:
* a model name that corresponds to a config file that was used to train this
model in the `samples/configs` directory,
* a download link to a tar.gz file containing the pre-trained model,
* model speed --- we report running time in ms per 600x600 image (including
all pre and post-processing), but please be aware that these timings depend
highly on one's specific hardware configuration (these timings were
performed using an Nvidia GeForce GTX TITAN X card) and should be treated
more as relative timings in many cases. Also note that desktop GPU timing
does not always reflect mobile run time. For example Mobilenet V2 is faster
on mobile devices than Mobilenet V1, but is slightly slower on desktop GPU.
* detector performance on a subset of the COCO validation set, Open Images
test split, iNaturalist test split, or Snapshot Serengeti LILA.science test
split, as measured by the dataset-specific mAP measure. Here, higher is
better, and we only report bounding box mAP rounded to the nearest integer.
* Output types (`Boxes`, and `Masks` if applicable)
You can un-tar each tar.gz file via, e.g.,:
......@@ -53,57 +55,59 @@ Inside the un-tar'ed directory, you will find:
Some remarks on frozen inference graphs:
* If you try to evaluate the frozen graph, you may find performance numbers
for some of the models to be slightly lower than what we report in the below
tables. This is because we discard detections with scores below a threshold
(typically 0.3) when creating the frozen graph. This corresponds effectively
to picking a point on the precision recall curve of a detector (and
discarding the part past that point), which negatively impacts standard mAP
metrics.
* Our frozen inference graphs are generated using the
[v1.12.0](https://github.com/tensorflow/tensorflow/tree/v1.12.0) release
version of Tensorflow and we do not guarantee that these will work with
other versions; this being said, each frozen inference graph can be
regenerated using your current version of Tensorflow by re-running the
[exporter](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/exporting_models.md),
pointing it at the model directory as well as the corresponding config file
in
[samples/configs](https://github.com/tensorflow/models/tree/master/research/object_detection/samples/configs).
## COCO-trained models
| Model name | Speed (ms) | COCO mAP[^1] | Outputs |
| ------------ | :--------------: | :--------------: | :-------------: |
| [ssd_mobilenet_v1_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz) | 30 | 21 | Boxes |
| [ssd_mobilenet_v1_0.75_depth_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_300x300_coco14_sync_2018_07_03.tar.gz) | 26 | 18 | Boxes |
| [ssd_mobilenet_v1_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz) | 29 | 18 | Boxes |
| [ssd_mobilenet_v1_0.75_depth_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_quantized_300x300_coco14_sync_2018_07_18.tar.gz) | 29 | 16 | Boxes |
| [ssd_mobilenet_v1_ppn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_ppn_shared_box_predictor_300x300_coco14_sync_2018_07_03.tar.gz) | 26 | 20 | Boxes |
| [ssd_mobilenet_v1_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz) | 56 | 32 | Boxes |
| [ssd_resnet_50_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz) | 76 | 35 | Boxes |
| [ssd_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz) | 31 | 22 | Boxes |
| [ssd_mobilenet_v2_quantized_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz) | 29 | 22 | Boxes |
| [ssdlite_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobilenet_v2_coco_2018_05_09.tar.gz) | 27 | 22 | Boxes |
| [ssd_inception_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2018_01_28.tar.gz) | 42 | 24 | Boxes |
| [faster_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 58 | 28 | Boxes |
| [faster_rcnn_resnet50_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz) | 89 | 30 | Boxes |
| [faster_rcnn_resnet50_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_lowproposals_coco_2018_01_28.tar.gz) | 64 | | Boxes |
| [rfcn_resnet101_coco](http://download.tensorflow.org/models/object_detection/rfcn_resnet101_coco_2018_01_28.tar.gz) | 92 | 30 | Boxes |
| [faster_rcnn_resnet101_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_2018_01_28.tar.gz) | 106 | 32 | Boxes |
| [faster_rcnn_resnet101_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_lowproposals_coco_2018_01_28.tar.gz) | 82 | | Boxes |
| [faster_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 620 | 37 | Boxes |
| [faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco_2018_01_28.tar.gz) | 241 | | Boxes |
| [faster_rcnn_nas](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_coco_2018_01_28.tar.gz) | 1833 | 43 | Boxes |
| [faster_rcnn_nas_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_lowproposals_coco_2018_01_28.tar.gz) | 540 | | Boxes |
| [mask_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 771 | 36 | Masks |
| [mask_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 79 | 25 | Masks |
| [mask_rcnn_resnet101_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet101_atrous_coco_2018_01_28.tar.gz) | 470 | 33 | Masks |
| [mask_rcnn_resnet50_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet50_atrous_coco_2018_01_28.tar.gz) | 343 | 29 | Masks |
Note: The star (☆) at the end of a model name indicates that the model supports TPU training.
Note: If you download the tar.gz file of a quantized model and un-tar it, you will get a different set of files - a checkpoint, a config file and tflite frozen graphs (txt/binary).
Model name | Speed (ms) | COCO mAP[^1] | Outputs
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :----------: | :-----:
[ssd_mobilenet_v1_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz) | 30 | 21 | Boxes
[ssd_mobilenet_v1_0.75_depth_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_300x300_coco14_sync_2018_07_03.tar.gz) | 26 | 18 | Boxes
[ssd_mobilenet_v1_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz) | 29 | 18 | Boxes
[ssd_mobilenet_v1_0.75_depth_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_quantized_300x300_coco14_sync_2018_07_18.tar.gz) | 29 | 16 | Boxes
[ssd_mobilenet_v1_ppn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_ppn_shared_box_predictor_300x300_coco14_sync_2018_07_03.tar.gz) | 26 | 20 | Boxes
[ssd_mobilenet_v1_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz) | 56 | 32 | Boxes
[ssd_resnet_50_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz) | 76 | 35 | Boxes
[ssd_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz) | 31 | 22 | Boxes
[ssd_mobilenet_v2_quantized_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz) | 29 | 22 | Boxes
[ssdlite_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobilenet_v2_coco_2018_05_09.tar.gz) | 27 | 22 | Boxes
[ssd_inception_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2018_01_28.tar.gz) | 42 | 24 | Boxes
[faster_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 58 | 28 | Boxes
[faster_rcnn_resnet50_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz) | 89 | 30 | Boxes
[faster_rcnn_resnet50_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_lowproposals_coco_2018_01_28.tar.gz) | 64 | | Boxes
[rfcn_resnet101_coco](http://download.tensorflow.org/models/object_detection/rfcn_resnet101_coco_2018_01_28.tar.gz) | 92 | 30 | Boxes
[faster_rcnn_resnet101_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_2018_01_28.tar.gz) | 106 | 32 | Boxes
[faster_rcnn_resnet101_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_lowproposals_coco_2018_01_28.tar.gz) | 82 | | Boxes
[faster_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 620 | 37 | Boxes
[faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco_2018_01_28.tar.gz) | 241 | | Boxes
[faster_rcnn_nas](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_coco_2018_01_28.tar.gz) | 1833 | 43 | Boxes
[faster_rcnn_nas_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_lowproposals_coco_2018_01_28.tar.gz) | 540 | | Boxes
[mask_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 771 | 36 | Masks
[mask_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 79 | 25 | Masks
[mask_rcnn_resnet101_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet101_atrous_coco_2018_01_28.tar.gz) | 470 | 33 | Masks
[mask_rcnn_resnet50_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet50_atrous_coco_2018_01_28.tar.gz) | 343 | 29 | Masks
Note: The star (☆) at the end of a model name indicates that the model
supports TPU training.
Note: If you download the tar.gz file of a quantized model and un-tar it, you
will get a different set of files - a checkpoint, a config file and tflite
frozen graphs (txt/binary).
### Mobile models
......@@ -115,20 +119,22 @@ Model name
[ssd_mobilenet_v3_small_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v3_small_coco_2020_01_14.tar.gz) | 43 | 15.4 | Boxes
### Pixel4 Edge TPU models
Model name | Pixel 4 Edge TPU Latency (ms) | COCO mAP (fp32/uint8) | Outputs
----------------------------------------------------------------------------------------------------------------------------------- | :------------------: | :------: | :-----:
--------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------: | :-------------------: | :-----:
[ssd_mobiledet_edgetpu_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobiledet_edgetpu_320x320_coco_2020_05_19.tar.gz) | 6.9 | 25.9/25.6 | Boxes
[ssd_mobilenet_edgetpu_coco](https://storage.cloud.google.com/mobilenet_edgetpu/checkpoints/ssdlite_mobilenet_edgetpu_coco_quant.tar.gz) | 6.6 | -/24.3 | Boxes
### Pixel4 DSP models
Model name | Pixel 4 DSP Latency (ms) | COCO mAP (fp32/uint8) | Outputs
----------------------------------------------------------------------------------------------------------------------------------- | :------------------: | :------: | :-----:
------------------------------------------------------------------------------------------------------------------------------------- | :----------------------: | :-------------------: | :-----:
[ssd_mobiledet_dsp_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobiledet_dsp_320x320_coco_2020_05_19.tar.gz) | 12.3 | 28.9/28.8 | Boxes
## Kitti-trained models
Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
----------------------------------------------------------------------------------------------------------------------------------- | :--------: | :------------: | :-----:
[faster_rcnn_resnet101_kitti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_kitti_2018_01_28.tar.gz) | 79 | 87 | Boxes
## Open Images-trained models
......@@ -140,31 +146,42 @@ Model name
[facessd_mobilenet_v2_quantized_open_image_v4](http://download.tensorflow.org/models/object_detection/facessd_mobilenet_v2_quantized_320x320_open_image_v4.tar.gz) [^3] | 20 | 73 (faces) | Boxes
Model name | Speed (ms) | Open Images mAP@0.5[^4] | Outputs
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :---------------------: | :-----:
---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :---------------------: | :-----:
[faster_rcnn_inception_resnet_v2_atrous_oidv4](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_oid_v4_2018_12_12.tar.gz) | 425 | 54 | Boxes
[ssd_mobilenetv2_oidv4](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_oid_v4_2018_12_12.tar.gz) | 89 | 36 | Boxes
[ssd_resnet_101_fpn_oidv4](http://download.tensorflow.org/models/object_detection/ssd_resnet101_v1_fpn_shared_box_predictor_oid_512x512_sync_2019_01_20.tar.gz) | 237 | 38 | Boxes
## iNaturalist Species-trained models
Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
--------------------------------------------------------------------------------------------------------------------------------- | :--------: | :------------: | :-----:
[faster_rcnn_resnet101_fgvc](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_fgvc_2018_07_19.tar.gz) | 395 | 58 | Boxes
[faster_rcnn_resnet50_fgvc](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_fgvc_2018_07_19.tar.gz) | 366 | 55 | Boxes
## AVA v2.1 trained models
Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
----------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :------------: | :-----:
[faster_rcnn_resnet101_ava_v2.1](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_ava_v2.1_2018_04_30.tar.gz) | 93 | 11 | Boxes
[^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval). The COCO mAP numbers here are evaluated on COCO 14 minival set (note that our split is different from COCO 17 Val). A full list of image ids used in our split can be found [here](https://github.com/tensorflow/models/blob/master/research/object_detection/data/mscoco_minival_ids.txt).
[^2]: This is PASCAL mAP with a slightly different way of computing true positives: see [Open Images evaluation protocols](evaluation_protocols.md), oid_V2_detection_metrics.
[^3]: Non-face boxes are dropped during training and non-face groundtruth boxes are ignored when evaluating.
[^4]: This is the Open Images Challenge metric: see [Open Images evaluation protocols](evaluation_protocols.md), oid_challenge_detection_metrics.
## Snapshot Serengeti Camera Trap trained models
Model name | COCO mAP@0.5 | Outputs
--------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-----:
[faster_rcnn_resnet101_snapshot_serengeti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_snapshot_serengeti_2020_06_10.tar.gz) | 38 | Boxes
[context_rcnn_resnet101_snapshot_serengeti](http://download.tensorflow.org/models/object_detection/context_rcnn_resnet101_snapshot_serengeti_2020_06_10.tar.gz) | 56 | Boxes
[^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval).
The COCO mAP numbers here are evaluated on COCO 14 minival set (note that
our split is different from COCO 17 Val). A full list of image ids used in
our split can be found
[here](https://github.com/tensorflow/models/blob/master/research/object_detection/data/mscoco_minival_ids.txt).
[^2]: This is PASCAL mAP with a slightly different way of computing true
positives: see
[Open Images evaluation protocols](evaluation_protocols.md),
oid_V2_detection_metrics.
[^3]: Non-face boxes are dropped during training and non-face groundtruth boxes
are ignored when evaluating.
[^4]: This is the Open Images Challenge metric: see
[Open Images evaluation protocols](evaluation_protocols.md),
oid_challenge_detection_metrics.
......@@ -15,7 +15,7 @@
r"""Tests for detection_inference.py."""
import os
import unittest
import numpy as np
from PIL import Image
import six
......@@ -25,6 +25,7 @@ from google.protobuf import text_format
from object_detection.core import standard_fields
from object_detection.inference import detection_inference
from object_detection.utils import dataset_util
from object_detection.utils import tf_version
def get_mock_tfrecord_path():
......@@ -74,6 +75,7 @@ def create_mock_graph():
fl.write(graph_def.SerializeToString())
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class InferDetectionsTests(tf.test.TestCase):
def test_simple(self):
......
......@@ -64,7 +64,6 @@ def _multiclass_scores_or_one_hot_labels(multiclass_scores,
[tf.shape(groundtruth_boxes)[0], num_classes])
def false_fn():
return tf.one_hot(groundtruth_classes, num_classes)
return tf.cond(tf.size(multiclass_scores) > 0, true_fn, false_fn)
......@@ -1006,14 +1005,21 @@ def get_reduce_to_frame_fn(input_reader_config, is_training):
`reduce_to_frame_fn` for the dataset builder
"""
if input_reader_config.input_type != (
input_reader_pb2.InputType.TF_SEQUENCE_EXAMPLE):
return lambda d: d
input_reader_pb2.InputType.Value('TF_SEQUENCE_EXAMPLE')):
return lambda dataset, dataset_map_fn, batch_size, config: dataset
else:
def reduce_to_frame(dataset):
def reduce_to_frame(dataset, dataset_map_fn, batch_size,
input_reader_config):
"""Returns a function reducing sequence tensors to single frame tensors.
Args:
dataset: A tf dataset containing sequence tensors.
dataset_map_fn: A function that handles whether to
map_with_legacy_function for this dataset
batch_size: used if map_with_legacy_function is true to determine
num_parallel_calls
input_reader_config: used if map_with_legacy_function is true to
determine num_parallel_calls
Returns:
A tf dataset containing single frame tensors.
......@@ -1046,13 +1052,14 @@ def get_reduce_to_frame_fn(input_reader_config, is_training):
# Copy all context tensors.
out_tensor_dict[key] = tensor_dict[key]
return out_tensor_dict
dataset = dataset.map(get_single_frame, tf.data.experimental.AUTOTUNE)
dataset = dataset_map_fn(dataset, get_single_frame, batch_size,
input_reader_config)
else:
dataset = dataset.map(util_ops.tile_context_tensors,
tf.data.experimental.AUTOTUNE)
dataset = dataset_map_fn(dataset, util_ops.tile_context_tensors,
batch_size, input_reader_config)
dataset = dataset.unbatch()
# Decode frame here as SequenceExample tensors contain encoded images.
dataset = dataset.map(util_ops.decode_image,
tf.data.experimental.AUTOTUNE)
dataset = dataset_map_fn(dataset, util_ops.decode_image, batch_size,
input_reader_config)
return dataset
return reduce_to_frame
......@@ -20,10 +20,11 @@ from __future__ import print_function
import functools
import os
import unittest
from absl import logging
from absl.testing import parameterized
import numpy as np
import six
import tensorflow.compat.v1 as tf
from object_detection import inputs
......@@ -31,6 +32,13 @@ from object_detection.core import preprocessor
from object_detection.core import standard_fields as fields
from object_detection.utils import config_util
from object_detection.utils import test_case
from object_detection.utils import test_utils
from object_detection.utils import tf_version
if six.PY2:
import mock # pylint: disable=g-import-not-at-top
else:
from unittest import mock # pylint: disable=g-import-not-at-top, g-importing-member
FLAGS = tf.flags.FLAGS
......@@ -86,7 +94,8 @@ def _make_initializable_iterator(dataset):
return iterator
class InputsTest(test_case.TestCase, parameterized.TestCase):
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only tests under TF2.X.')
class InputFnTest(test_case.TestCase, parameterized.TestCase):
def test_faster_rcnn_resnet50_train_input(self):
"""Tests the training input function for FasterRcnnResnet50."""
......@@ -402,7 +411,7 @@ class InputsTest(test_case.TestCase, parameterized.TestCase):
def test_ssd_inceptionV2_eval_input_with_additional_channels(
self, eval_batch_size=1):
"""Tests the eval input function for SSDInceptionV2 with additional channels.
"""Tests the eval input function for SSDInceptionV2 with additional channel.
Args:
eval_batch_size: Batch size for eval set.
......@@ -638,6 +647,7 @@ class DataAugmentationFnTest(test_case.TestCase):
data_augmentation_fn = functools.partial(
inputs.augment_input_data,
data_augmentation_options=data_augmentation_options)
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
......@@ -645,17 +655,12 @@ class DataAugmentationFnTest(test_case.TestCase):
tf.constant(np.array([[.5, .5, 1., 1.]], np.float32))
}
augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
with self.test_session() as sess:
augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
self.assertAllEqual(
augmented_tensor_dict_out[fields.InputDataFields.image].shape,
[20, 20, 3]
)
self.assertAllClose(
augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes],
[[10, 10, 20, 20]]
)
return (augmented_tensor_dict[fields.InputDataFields.image],
augmented_tensor_dict[fields.InputDataFields.
groundtruth_boxes])
image, groundtruth_boxes = self.execute_cpu(graph_fn, [])
self.assertAllEqual(image.shape, [20, 20, 3])
self.assertAllClose(groundtruth_boxes, [[10, 10, 20, 20]])
def test_apply_image_and_box_augmentation_with_scores(self):
data_augmentation_options = [
......@@ -669,6 +674,7 @@ class DataAugmentationFnTest(test_case.TestCase):
data_augmentation_fn = functools.partial(
inputs.augment_input_data,
data_augmentation_options=data_augmentation_options)
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
......@@ -680,26 +686,16 @@ class DataAugmentationFnTest(test_case.TestCase):
tf.constant(np.array([0.8], np.float32)),
}
augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
with self.test_session() as sess:
augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
self.assertAllEqual(
augmented_tensor_dict_out[fields.InputDataFields.image].shape,
[20, 20, 3]
)
self.assertAllClose(
augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes],
[[10, 10, 20, 20]]
)
self.assertAllClose(
augmented_tensor_dict_out[fields.InputDataFields.groundtruth_classes],
[1.0]
)
self.assertAllClose(
augmented_tensor_dict_out[
fields.InputDataFields.groundtruth_weights],
[0.8]
)
return (augmented_tensor_dict[fields.InputDataFields.image],
augmented_tensor_dict[fields.InputDataFields.groundtruth_boxes],
augmented_tensor_dict[fields.InputDataFields.groundtruth_classes],
augmented_tensor_dict[fields.InputDataFields.groundtruth_weights])
(image, groundtruth_boxes,
groundtruth_classes, groundtruth_weights) = self.execute_cpu(graph_fn, [])
self.assertAllEqual(image.shape, [20, 20, 3])
self.assertAllClose(groundtruth_boxes, [[10, 10, 20, 20]])
self.assertAllClose(groundtruth_classes.shape, [1.0])
self.assertAllClose(groundtruth_weights, [0.8])
def test_include_masks_in_data_augmentation(self):
data_augmentation_options = [
......@@ -712,6 +708,7 @@ class DataAugmentationFnTest(test_case.TestCase):
data_augmentation_fn = functools.partial(
inputs.augment_input_data,
data_augmentation_options=data_augmentation_options)
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
......@@ -719,14 +716,12 @@ class DataAugmentationFnTest(test_case.TestCase):
tf.constant(np.zeros([2, 10, 10], np.uint8))
}
augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
with self.test_session() as sess:
augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
self.assertAllEqual(
augmented_tensor_dict_out[fields.InputDataFields.image].shape,
[20, 20, 3])
self.assertAllEqual(augmented_tensor_dict_out[
fields.InputDataFields.groundtruth_instance_masks].shape, [2, 20, 20])
return (augmented_tensor_dict[fields.InputDataFields.image],
augmented_tensor_dict[fields.InputDataFields.
groundtruth_instance_masks])
image, masks = self.execute_cpu(graph_fn, [])
self.assertAllEqual(image.shape, [20, 20, 3])
self.assertAllEqual(masks.shape, [2, 20, 20])
def test_include_keypoints_in_data_augmentation(self):
data_augmentation_options = [
......@@ -740,6 +735,7 @@ class DataAugmentationFnTest(test_case.TestCase):
data_augmentation_fn = functools.partial(
inputs.augment_input_data,
data_augmentation_options=data_augmentation_options)
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
......@@ -749,21 +745,14 @@ class DataAugmentationFnTest(test_case.TestCase):
tf.constant(np.array([[[0.5, 1.0], [0.5, 0.5]]], np.float32))
}
augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
with self.test_session() as sess:
augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
self.assertAllEqual(
augmented_tensor_dict_out[fields.InputDataFields.image].shape,
[20, 20, 3]
)
self.assertAllClose(
augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes],
[[10, 10, 20, 20]]
)
self.assertAllClose(
augmented_tensor_dict_out[fields.InputDataFields.groundtruth_keypoints],
[[[10, 20], [10, 10]]]
)
return (augmented_tensor_dict[fields.InputDataFields.image],
augmented_tensor_dict[fields.InputDataFields.groundtruth_boxes],
augmented_tensor_dict[fields.InputDataFields.
groundtruth_keypoints])
image, boxes, keypoints = self.execute_cpu(graph_fn, [])
self.assertAllEqual(image.shape, [20, 20, 3])
self.assertAllClose(boxes, [[10, 10, 20, 20]])
self.assertAllClose(keypoints, [[[10, 20], [10, 10]]])
def _fake_model_preprocessor_fn(image):
......@@ -787,13 +776,12 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
def test_combine_additional_channels_if_present(self):
image = np.random.rand(4, 4, 3).astype(np.float32)
additional_channels = np.random.rand(4, 4, 2).astype(np.float32)
def graph_fn(image, additional_channels):
tensor_dict = {
fields.InputDataFields.image:
tf.constant(image),
fields.InputDataFields.image_additional_channels:
tf.constant(additional_channels),
fields.InputDataFields.image: image,
fields.InputDataFields.image_additional_channels: additional_channels,
fields.InputDataFields.groundtruth_classes:
tf.constant(np.array([1, 1], np.int32))
tf.constant([1, 1], tf.int32)
}
input_transformation_fn = functools.partial(
......@@ -801,23 +789,22 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_model_preprocessor_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=1)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllEqual(transformed_inputs[fields.InputDataFields.image].dtype,
tf.float32)
self.assertAllEqual(transformed_inputs[fields.InputDataFields.image].shape,
[4, 4, 5])
self.assertAllClose(transformed_inputs[fields.InputDataFields.image],
np.concatenate((image, additional_channels), axis=2))
out_tensors = input_transformation_fn(tensor_dict=tensor_dict)
return out_tensors[fields.InputDataFields.image]
out_image = self.execute_cpu(graph_fn, [image, additional_channels])
self.assertAllEqual(out_image.dtype, tf.float32)
self.assertAllEqual(out_image.shape, [4, 4, 5])
self.assertAllClose(out_image, np.concatenate((image, additional_channels),
axis=2))
def test_use_multiclass_scores_when_present(self):
image = np.random.rand(4, 4, 3).astype(np.float32)
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(image),
fields.InputDataFields.image: tf.constant(np.random.rand(4, 4, 3).
astype(np.float32)),
fields.InputDataFields.groundtruth_boxes:
tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]], np.float32)),
tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]],
np.float32)),
fields.InputDataFields.multiclass_scores:
tf.constant(np.array([0.2, 0.3, 0.5, 0.1, 0.6, 0.3], np.float32)),
fields.InputDataFields.groundtruth_classes:
......@@ -829,23 +816,26 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_model_preprocessor_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=3, use_multiclass_scores=True)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
return transformed_inputs[fields.InputDataFields.groundtruth_classes]
groundtruth_classes = self.execute_cpu(graph_fn, [])
self.assertAllClose(
np.array([[0.2, 0.3, 0.5], [0.1, 0.6, 0.3]], np.float32),
transformed_inputs[fields.InputDataFields.groundtruth_classes])
groundtruth_classes)
@unittest.skipIf(tf_version.is_tf2(), ('Skipping due to different behaviour '
'in TF 2.X'))
def test_use_multiclass_scores_when_not_present(self):
image = np.random.rand(4, 4, 3).astype(np.float32)
def graph_fn():
zero_num_elements = tf.random.uniform([], minval=0, maxval=1,
dtype=tf.int32)
tensor_dict = {
fields.InputDataFields.image:
tf.constant(image),
tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
fields.InputDataFields.groundtruth_boxes:
tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]], np.float32)),
fields.InputDataFields.multiclass_scores:
tf.placeholder(tf.float32),
tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]],
np.float32)),
fields.InputDataFields.multiclass_scores: tf.zeros(zero_num_elements),
fields.InputDataFields.groundtruth_classes:
tf.constant(np.array([1, 2], np.int32))
}
......@@ -855,17 +845,13 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_model_preprocessor_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=3, use_multiclass_scores=True)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict),
feed_dict={
tensor_dict[fields.InputDataFields.multiclass_scores]:
np.array([], dtype=np.float32)
})
transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
return transformed_inputs[fields.InputDataFields.groundtruth_classes]
groundtruth_classes = self.execute_cpu(graph_fn, [])
self.assertAllClose(
np.array([[0, 1, 0], [0, 0, 1]], np.float32),
transformed_inputs[fields.InputDataFields.groundtruth_classes])
groundtruth_classes)
@parameterized.parameters(
{'labeled_classes': [1, 2]},
......@@ -916,6 +902,7 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
transformed_inputs[fields.InputDataFields.groundtruth_labeled_classes])
def test_returns_correct_class_label_encodings(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
......@@ -930,18 +917,17 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_model_preprocessor_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_classes],
[[0, 0, 1], [1, 0, 0]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_confidences],
[[0, 0, 1], [1, 0, 0]])
transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
return (transformed_inputs[fields.InputDataFields.groundtruth_classes],
transformed_inputs[fields.InputDataFields.
groundtruth_confidences])
(groundtruth_classes, groundtruth_confidences) = self.execute_cpu(graph_fn,
[])
self.assertAllClose(groundtruth_classes, [[0, 0, 1], [1, 0, 0]])
self.assertAllClose(groundtruth_confidences, [[0, 0, 1], [1, 0, 0]])
def test_returns_correct_labels_with_unrecognized_class(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
......@@ -973,46 +959,46 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_model_preprocessor_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_classes],
[[0, 0, 1], [1, 0, 0]])
self.assertAllEqual(
transformed_inputs[fields.InputDataFields.num_groundtruth_boxes], 2)
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_area], [.5, .3])
self.assertAllEqual(
transformed_inputs[fields.InputDataFields.groundtruth_confidences],
[[0, 0, 1], [1, 0, 0]])
self.assertAllClose(
transformed_inputs = input_transformation_fn(tensor_dict)
return (transformed_inputs[fields.InputDataFields.groundtruth_classes],
transformed_inputs[fields.InputDataFields.num_groundtruth_boxes],
transformed_inputs[fields.InputDataFields.groundtruth_area],
transformed_inputs[fields.InputDataFields.
groundtruth_confidences],
transformed_inputs[fields.InputDataFields.groundtruth_boxes],
[[0, 0, 1, 1], [.5, .5, 1, 1]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
[[[.1, .1]], [[.5, .5]]])
self.assertAllEqual(
transformed_inputs[
fields.InputDataFields.groundtruth_keypoint_visibilities],
[[True, True], [True, True]])
self.assertAllEqual(
transformed_inputs[
fields.InputDataFields.groundtruth_instance_masks].shape, [2, 4, 4])
self.assertAllEqual(
transformed_inputs[fields.InputDataFields.
groundtruth_keypoint_visibilities],
transformed_inputs[fields.InputDataFields.
groundtruth_instance_masks],
transformed_inputs[fields.InputDataFields.groundtruth_is_crowd],
[False, False])
self.assertAllEqual(
transformed_inputs[fields.InputDataFields.groundtruth_difficult],
[0, 1])
transformed_inputs[fields.InputDataFields.groundtruth_difficult])
(groundtruth_classes, num_groundtruth_boxes, groundtruth_area,
groundtruth_confidences, groundtruth_boxes, groundtruth_keypoints,
groundtruth_keypoint_visibilities, groundtruth_instance_masks,
groundtruth_is_crowd, groundtruth_difficult) = self.execute_cpu(graph_fn,
[])
self.assertAllClose(groundtruth_classes, [[0, 0, 1], [1, 0, 0]])
self.assertAllEqual(num_groundtruth_boxes, 2)
self.assertAllClose(groundtruth_area, [.5, .3])
self.assertAllEqual(groundtruth_confidences, [[0, 0, 1], [1, 0, 0]])
self.assertAllClose(groundtruth_boxes, [[0, 0, 1, 1], [.5, .5, 1, 1]])
self.assertAllClose(groundtruth_keypoints, [[[.1, .1]], [[.5, .5]]])
self.assertAllEqual(groundtruth_keypoint_visibilities,
[[True, True], [True, True]])
self.assertAllEqual(groundtruth_instance_masks.shape, [2, 4, 4])
self.assertAllEqual(groundtruth_is_crowd, [False, False])
self.assertAllEqual(groundtruth_difficult, [0, 1])
def test_returns_correct_merged_boxes(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
fields.InputDataFields.groundtruth_boxes:
tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]], np.float32)),
tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]],
np.float32)),
fields.InputDataFields.groundtruth_classes:
tf.constant(np.array([3, 1], np.int32))
}
......@@ -1024,24 +1010,29 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes,
merge_multiple_boxes=True)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
transformed_inputs = input_transformation_fn(tensor_dict)
return (transformed_inputs[fields.InputDataFields.groundtruth_boxes],
transformed_inputs[fields.InputDataFields.groundtruth_classes],
transformed_inputs[fields.InputDataFields.
groundtruth_confidences],
transformed_inputs[fields.InputDataFields.num_groundtruth_boxes])
(groundtruth_boxes, groundtruth_classes, groundtruth_confidences,
num_groundtruth_boxes) = self.execute_cpu(graph_fn, [])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_boxes],
groundtruth_boxes,
[[.5, .5, 1., 1.]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_classes],
groundtruth_classes,
[[1, 0, 1]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_confidences],
groundtruth_confidences,
[[1, 0, 1]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.num_groundtruth_boxes],
num_groundtruth_boxes,
1)
def test_returns_correct_groundtruth_confidences_when_input_present(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
......@@ -1058,18 +1049,21 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_model_preprocessor_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
transformed_inputs = input_transformation_fn(tensor_dict)
return (transformed_inputs[fields.InputDataFields.groundtruth_classes],
transformed_inputs[fields.InputDataFields.
groundtruth_confidences])
groundtruth_classes, groundtruth_confidences = self.execute_cpu(graph_fn,
[])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_classes],
groundtruth_classes,
[[0, 0, 1], [1, 0, 0]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_confidences],
groundtruth_confidences,
[[0, 0, 1], [-1, 0, 0]])
def test_returns_resized_masks(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
......@@ -1099,23 +1093,24 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
image_resizer_fn=fake_image_resizer_fn,
num_classes=num_classes,
retain_original_image=True)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllEqual(transformed_inputs[
fields.InputDataFields.original_image].dtype, tf.uint8)
self.assertAllEqual(transformed_inputs[
fields.InputDataFields.original_image_spatial_shape], [4, 4])
self.assertAllEqual(transformed_inputs[
fields.InputDataFields.original_image].shape, [8, 8, 3])
self.assertAllEqual(transformed_inputs[
fields.InputDataFields.groundtruth_instance_masks].shape, [2, 8, 8])
transformed_inputs = input_transformation_fn(tensor_dict)
return (transformed_inputs[fields.InputDataFields.original_image],
transformed_inputs[fields.InputDataFields.
original_image_spatial_shape],
transformed_inputs[fields.InputDataFields.
groundtruth_instance_masks])
(original_image, original_image_shape,
groundtruth_instance_masks) = self.execute_cpu(graph_fn, [])
self.assertEqual(original_image.dtype, np.uint8)
self.assertAllEqual(original_image_shape, [4, 4])
self.assertAllEqual(original_image.shape, [8, 8, 3])
self.assertAllEqual(groundtruth_instance_masks.shape, [2, 8, 8])
def test_applies_model_preprocess_fn_to_image_tensor(self):
np_image = np.random.randint(256, size=(4, 4, 3))
def graph_fn(image):
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np_image),
fields.InputDataFields.image: image,
fields.InputDataFields.groundtruth_classes:
tf.constant(np.array([3, 1], np.int32))
}
......@@ -1129,21 +1124,18 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=fake_model_preprocessor_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllClose(transformed_inputs[fields.InputDataFields.image],
np_image / 255.)
self.assertAllClose(transformed_inputs[fields.InputDataFields.
true_image_shape],
[4, 4, 3])
transformed_inputs = input_transformation_fn(tensor_dict)
return (transformed_inputs[fields.InputDataFields.image],
transformed_inputs[fields.InputDataFields.true_image_shape])
image, true_image_shape = self.execute_cpu(graph_fn, [np_image])
self.assertAllClose(image, np_image / 255.)
self.assertAllClose(true_image_shape, [4, 4, 3])
def test_applies_data_augmentation_fn_to_tensor_dict(self):
np_image = np.random.randint(256, size=(4, 4, 3))
def graph_fn(image):
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np_image),
fields.InputDataFields.image: image,
fields.InputDataFields.groundtruth_classes:
tf.constant(np.array([3, 1], np.int32))
}
......@@ -1158,21 +1150,20 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes,
data_augmentation_fn=add_one_data_augmentation_fn)
with self.test_session() as sess:
augmented_tensor_dict = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllEqual(augmented_tensor_dict[fields.InputDataFields.image],
np_image + 1)
transformed_inputs = input_transformation_fn(tensor_dict)
return (transformed_inputs[fields.InputDataFields.image],
transformed_inputs[fields.InputDataFields.groundtruth_classes])
image, groundtruth_classes = self.execute_cpu(graph_fn, [np_image])
self.assertAllEqual(image, np_image + 1)
self.assertAllEqual(
augmented_tensor_dict[fields.InputDataFields.groundtruth_classes],
groundtruth_classes,
[[0, 0, 0, 1], [0, 1, 0, 0]])
def test_applies_data_augmentation_fn_before_model_preprocess_fn(self):
np_image = np.random.randint(256, size=(4, 4, 3))
def graph_fn(image):
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np_image),
fields.InputDataFields.image: image,
fields.InputDataFields.groundtruth_classes:
tf.constant(np.array([3, 1], np.int32))
}
......@@ -1191,15 +1182,13 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes,
data_augmentation_fn=add_five_to_image_data_augmentation_fn)
with self.test_session() as sess:
augmented_tensor_dict = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
self.assertAllEqual(augmented_tensor_dict[fields.InputDataFields.image],
(np_image + 5) * 2)
transformed_inputs = input_transformation_fn(tensor_dict)
return transformed_inputs[fields.InputDataFields.image]
image = self.execute_cpu(graph_fn, [np_image])
self.assertAllEqual(image, (np_image + 5) * 2)
def test_resize_with_padding(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
......@@ -1218,18 +1207,19 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_resize50_preprocess_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes,)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
transformed_inputs = input_transformation_fn(tensor_dict)
return (transformed_inputs[fields.InputDataFields.groundtruth_boxes],
transformed_inputs[fields.InputDataFields.groundtruth_keypoints])
groundtruth_boxes, groundtruth_keypoints = self.execute_cpu(graph_fn, [])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_boxes],
groundtruth_boxes,
[[.5, .25, 1., .5], [.0, .0, .5, .25]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
groundtruth_keypoints,
[[[.1, .1]], [[.3, .2]]])
def test_groundtruth_keypoint_weights(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
......@@ -1253,19 +1243,23 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes,
keypoint_type_weight=keypoint_type_weight)
transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
return (transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
transformed_inputs[fields.InputDataFields.
groundtruth_keypoint_weights])
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
groundtruth_keypoints, groundtruth_keypoint_weights = self.execute_cpu(
graph_fn, [])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
groundtruth_keypoints,
[[[0.1, 0.1], [0.3, 0.2]],
[[0.5, 0.3], [0.7, 0.4]]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_keypoint_weights],
groundtruth_keypoint_weights,
[[1.0, 0.0], [1.0, 2.0]])
def test_groundtruth_keypoint_weights_default(self):
def graph_fn():
tensor_dict = {
fields.InputDataFields.image:
tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
......@@ -1285,16 +1279,18 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
model_preprocess_fn=_fake_resize50_preprocess_fn,
image_resizer_fn=_fake_image_resizer_fn,
num_classes=num_classes)
with self.test_session() as sess:
transformed_inputs = sess.run(
input_transformation_fn(tensor_dict=tensor_dict))
transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
return (transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
transformed_inputs[fields.InputDataFields.
groundtruth_keypoint_weights])
groundtruth_keypoints, groundtruth_keypoint_weights = self.execute_cpu(
graph_fn, [])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
groundtruth_keypoints,
[[[0.1, 0.1], [0.3, 0.2]],
[[0.5, 0.3], [0.7, 0.4]]])
self.assertAllClose(
transformed_inputs[fields.InputDataFields.groundtruth_keypoint_weights],
groundtruth_keypoint_weights,
[[1.0, 1.0], [1.0, 1.0]])
......@@ -1303,15 +1299,15 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
def test_pad_images_boxes_and_classes(self):
input_tensor_dict = {
fields.InputDataFields.image:
tf.placeholder(tf.float32, [None, None, 3]),
tf.random.uniform([3, 3, 3]),
fields.InputDataFields.groundtruth_boxes:
tf.placeholder(tf.float32, [None, 4]),
tf.random.uniform([2, 4]),
fields.InputDataFields.groundtruth_classes:
tf.placeholder(tf.int32, [None, 3]),
tf.random.uniform([2, 3], minval=0, maxval=2, dtype=tf.int32),
fields.InputDataFields.true_image_shape:
tf.placeholder(tf.int32, [3]),
tf.constant([3, 3, 3]),
fields.InputDataFields.original_image_spatial_shape:
tf.placeholder(tf.int32, [2])
tf.constant([3, 3])
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
......@@ -1336,69 +1332,35 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
.shape.as_list(), [3, 3])
def test_clip_boxes_and_classes(self):
def graph_fn():
input_tensor_dict = {
fields.InputDataFields.groundtruth_boxes:
tf.placeholder(tf.float32, [None, 4]),
tf.random.uniform([5, 4]),
fields.InputDataFields.groundtruth_classes:
tf.placeholder(tf.int32, [None, 3]),
tf.random.uniform([2, 3], maxval=10, dtype=tf.int32),
fields.InputDataFields.num_groundtruth_boxes:
tf.placeholder(tf.int32, [])
tf.constant(5)
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
max_num_boxes=3,
num_classes=3,
spatial_image_shape=[5, 6])
self.assertAllEqual(
padded_tensor_dict[fields.InputDataFields.groundtruth_boxes]
.shape.as_list(), [3, 4])
self.assertAllEqual(
padded_tensor_dict[fields.InputDataFields.groundtruth_classes]
.shape.as_list(), [3, 3])
with self.test_session() as sess:
out_tensor_dict = sess.run(
padded_tensor_dict,
feed_dict={
input_tensor_dict[fields.InputDataFields.groundtruth_boxes]:
np.random.rand(5, 4),
input_tensor_dict[fields.InputDataFields.groundtruth_classes]:
np.random.rand(2, 3),
input_tensor_dict[fields.InputDataFields.num_groundtruth_boxes]:
5,
})
self.assertAllEqual(
out_tensor_dict[fields.InputDataFields.groundtruth_boxes].shape, [3, 4])
self.assertAllEqual(
out_tensor_dict[fields.InputDataFields.groundtruth_classes].shape,
[3, 3])
self.assertEqual(
out_tensor_dict[fields.InputDataFields.num_groundtruth_boxes],
3)
def test_do_not_pad_dynamic_images(self):
input_tensor_dict = {
fields.InputDataFields.image:
tf.placeholder(tf.float32, [None, None, 3]),
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
max_num_boxes=3,
num_classes=3,
spatial_image_shape=[None, None])
self.assertAllEqual(
padded_tensor_dict[fields.InputDataFields.image].shape.as_list(),
[None, None, 3])
return (padded_tensor_dict[fields.InputDataFields.groundtruth_boxes],
padded_tensor_dict[fields.InputDataFields.groundtruth_classes],
padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes])
(groundtruth_boxes, groundtruth_classes,
num_groundtruth_boxes) = self.execute_cpu(graph_fn, [])
self.assertAllEqual(groundtruth_boxes.shape, [3, 4])
self.assertAllEqual(groundtruth_classes.shape, [3, 3])
self.assertEqual(num_groundtruth_boxes, 3)
def test_images_and_additional_channels(self):
input_tensor_dict = {
fields.InputDataFields.image:
tf.placeholder(tf.float32, [None, None, 5]),
test_utils.image_with_dynamic_shape(4, 3, 5),
fields.InputDataFields.image_additional_channels:
tf.placeholder(tf.float32, [None, None, 2]),
test_utils.image_with_dynamic_shape(4, 3, 2),
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
......@@ -1418,11 +1380,11 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
def test_images_and_additional_channels_errors(self):
input_tensor_dict = {
fields.InputDataFields.image:
tf.placeholder(tf.float32, [None, None, 3]),
test_utils.image_with_dynamic_shape(10, 10, 3),
fields.InputDataFields.image_additional_channels:
tf.placeholder(tf.float32, [None, None, 2]),
test_utils.image_with_dynamic_shape(10, 10, 2),
fields.InputDataFields.original_image:
tf.placeholder(tf.float32, [None, None, 3]),
test_utils.image_with_dynamic_shape(10, 10, 3),
}
with self.assertRaises(ValueError):
_ = inputs.pad_input_data_to_static_shapes(
......@@ -1434,7 +1396,7 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
def test_gray_images(self):
input_tensor_dict = {
fields.InputDataFields.image:
tf.placeholder(tf.float32, [None, None, 1]),
test_utils.image_with_dynamic_shape(4, 4, 1),
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
......@@ -1449,9 +1411,9 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
def test_gray_images_and_additional_channels(self):
input_tensor_dict = {
fields.InputDataFields.image:
tf.placeholder(tf.float32, [None, None, 3]),
test_utils.image_with_dynamic_shape(4, 4, 3),
fields.InputDataFields.image_additional_channels:
tf.placeholder(tf.float32, [None, None, 2]),
test_utils.image_with_dynamic_shape(4, 4, 2),
}
# pad_input_data_to_static_shape assumes that image is already concatenated
# with additional channels.
......@@ -1469,11 +1431,14 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
.shape.as_list(), [5, 6, 2])
def test_keypoints(self):
keypoints = test_utils.keypoints_with_dynamic_shape(10, 16, 4)
visibilities = tf.cast(tf.random.uniform(tf.shape(keypoints)[:-1], minval=0,
maxval=2, dtype=tf.int32), tf.bool)
input_tensor_dict = {
fields.InputDataFields.groundtruth_keypoints:
tf.placeholder(tf.float32, [None, 16, 4]),
test_utils.keypoints_with_dynamic_shape(10, 16, 4),
fields.InputDataFields.groundtruth_keypoint_visibilities:
tf.placeholder(tf.bool, [None, 16]),
visibilities
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
......@@ -1493,12 +1458,12 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
context_memory_size = 8
context_feature_length = 10
max_num_context_features = 20
def graph_fn():
input_tensor_dict = {
fields.InputDataFields.context_features:
tf.placeholder(tf.float32,
[context_memory_size, context_feature_length]),
tf.ones([context_memory_size, context_feature_length]),
fields.InputDataFields.context_feature_length:
tf.placeholder(tf.float32, [])
tf.constant(context_feature_length)
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
......@@ -1512,20 +1477,57 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
padded_tensor_dict[
fields.InputDataFields.context_features].shape.as_list(),
[max_num_context_features, context_feature_length])
return padded_tensor_dict[fields.InputDataFields.valid_context_size]
with self.test_session() as sess:
feed_dict = {
input_tensor_dict[fields.InputDataFields.context_features]:
np.ones([context_memory_size, context_feature_length],
dtype=np.float32),
input_tensor_dict[fields.InputDataFields.context_feature_length]:
context_feature_length
valid_context_size = self.execute_cpu(graph_fn, [])
self.assertEqual(valid_context_size, context_memory_size)
class NegativeSizeTest(test_case.TestCase):
"""Test for inputs and related functions."""
def test_negative_size_error(self):
"""Test that error is raised for negative size boxes."""
def graph_fn():
tensors = {
fields.InputDataFields.image: tf.zeros((128, 128, 3)),
fields.InputDataFields.groundtruth_classes:
tf.constant([1, 1], tf.int32),
fields.InputDataFields.groundtruth_boxes:
tf.constant([[0.5, 0.5, 0.4, 0.5]], tf.float32)
}
padded_tensor_dict_out = sess.run(padded_tensor_dict, feed_dict=feed_dict)
tensors = inputs.transform_input_data(
tensors, _fake_model_preprocessor_fn, _fake_image_resizer_fn,
num_classes=10)
return tensors[fields.InputDataFields.groundtruth_boxes]
with self.assertRaises(tf.errors.InvalidArgumentError):
self.execute_cpu(graph_fn, [])
def test_negative_size_no_assert(self):
"""Test that negative size boxes are filtered out without assert.
This test simulates the behaviour when we run on TPU and Assert ops are
not supported.
"""
self.assertEqual(
padded_tensor_dict_out[fields.InputDataFields.valid_context_size],
context_memory_size)
tensors = {
fields.InputDataFields.image: tf.zeros((128, 128, 3)),
fields.InputDataFields.groundtruth_classes:
tf.constant([1, 1], tf.int32),
fields.InputDataFields.groundtruth_boxes:
tf.constant([[0.5, 0.5, 0.4, 0.5], [0.5, 0.5, 0.6, 0.6]],
tf.float32)
}
with mock.patch.object(tf, 'Assert') as tf_assert:
tf_assert.return_value = tf.no_op()
tensors = inputs.transform_input_data(
tensors, _fake_model_preprocessor_fn, _fake_image_resizer_fn,
num_classes=10)
self.assertAllClose(tensors[fields.InputDataFields.groundtruth_boxes],
[[0.5, 0.5, 0.6, 0.6]])
if __name__ == '__main__':
......
......@@ -14,7 +14,7 @@
# ==============================================================================
"""Tests for object_detection.trainer."""
import unittest
import tensorflow.compat.v1 as tf
import tf_slim as slim
from google.protobuf import text_format
......@@ -24,6 +24,7 @@ from object_detection.core import model
from object_detection.core import standard_fields as fields
from object_detection.legacy import trainer
from object_detection.protos import train_pb2
from object_detection.utils import tf_version
NUMBER_OF_CLASSES = 2
......@@ -197,6 +198,7 @@ class FakeDetectionModel(model.DetectionModel):
pass
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class TrainerTest(tf.test.TestCase):
def test_configure_trainer_and_train_two_steps(self):
......
......@@ -14,14 +14,18 @@
# ==============================================================================
"""Tests for object_detection.core.bipartite_matcher."""
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.matchers import bipartite_matcher
from object_detection.utils import test_case
from object_detection.utils import tf_version
if tf_version.is_tf1():
from object_detection.matchers import bipartite_matcher # pylint: disable=g-import-not-at-top
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class GreedyBipartiteMatcherTest(test_case.TestCase):
def test_get_expected_matches_when_all_rows_are_valid(self):
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The CenterNet meta architecture as described in the "Objects as Points" paper [1].
[1]: https://arxiv.org/abs/1904.07850
"""
import abc
import collections
import functools
import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow.compat.v2 as tf2
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import keypoint_ops
from object_detection.core import model
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner as cn_assigner
from object_detection.utils import shape_utils
# Number of channels needed to predict size and offsets.
NUM_OFFSET_CHANNELS = 2
NUM_SIZE_CHANNELS = 2
# Error range for detecting peaks.
PEAK_EPSILON = 1e-6
# Constants shared between all keypoint tasks.
UNMATCHED_KEYPOINT_SCORE = 0.1
KEYPOINT_CANDIDATE_SEARCH_SCALE = 0.3
class CenterNetFeatureExtractor(tf.keras.Model):
  """Abstract base for feature extractors of the CenterNet meta architecture.

  This class stores the per-channel normalization statistics and implements
  `preprocess`. Subclasses must provide the `out_stride` and
  `num_feature_outputs` properties.

  NOTE(review): earlier documentation says child classes override an
  `_output_model` property returning one or more predicted tensors. That
  property is not defined in this class -- confirm against the subclasses.
  """
  __metaclass__ = abc.ABCMeta

  def __init__(self, name=None, channel_means=(0., 0., 0.),
               channel_stds=(1., 1., 1.), bgr_ordering=False):
    """Initializes a CenterNet feature extractor.

    Args:
      name: str, the name used for the underlying keras model.
      channel_means: A tuple of floats holding the mean subtracted from each
        channel. If None or empty, zeros are used.
      channel_stds: A tuple of floats holding the standard deviation each
        channel is divided by. If None or empty, ones are used.
      bgr_ordering: bool, if set `preprocess` reverses the three input
        channels, i.e. [red, green, blue] becomes [blue, green, red].
    """
    super(CenterNetFeatureExtractor, self).__init__(name=name)

    # Both arguments are inspected before any attribute is assigned. The
    # explicit length comparison (instead of truthiness) is deliberate.
    means_given = channel_means is not None and len(channel_means) > 0  # pylint:disable=g-explicit-length-test
    stds_given = channel_stds is not None and len(channel_stds) > 0  # pylint:disable=g-explicit-length-test

    self._channel_means = channel_means if means_given else [0., 0., 0.]
    self._channel_stds = channel_stds if stds_given else [1., 1., 1.]
    self._bgr_ordering = bgr_ordering

  def preprocess(self, inputs):
    """Normalizes a batch of unscaled images for the model.

    The channel order is reversed first when `bgr_ordering` was set at
    construction time. Then the `channel_means` are subtracted and the result
    is divided by the `channel_stds` given at construction time.

    Args:
      inputs: a [batch, height, width, channels] float32 tensor

    Returns:
      outputs: a [batch, height, width, channels] float32 tensor
    """
    if self._bgr_ordering:
      # Unstacking into exactly three names requires a 3-channel input.
      first, second, third = tf.unstack(inputs, axis=3)
      inputs = tf.stack([third, second, first], axis=3)

    # Shape [1, 1, 1, channels] broadcasts over batch, height and width.
    broadcast_shape = [1, 1, 1, -1]
    means = tf.reshape(tf.constant(self._channel_means), broadcast_shape)
    stds = tf.reshape(tf.constant(self._channel_stds), broadcast_shape)
    centered = inputs - means
    return centered / stds

  @property
  @abc.abstractmethod
  def out_stride(self):
    """The stride in the output image of the network."""

  @property
  @abc.abstractmethod
  def num_feature_outputs(self):
    """The number of feature outputs returned by the feature extractor."""
def make_prediction_net(num_out_channels, kernel_size=3, num_filters=256,
                        bias_fill=None):
  """Builds a small convolutional prediction head for CenterNet.

  The head is a `same`-padded convolution, a ReLU, and a final 1x1
  convolution that produces the requested number of channels.

  Args:
    num_out_channels: Number of output channels.
    kernel_size: The size of the conv kernel in the intermediate layer.
    num_filters: The number of filters in the intermediate conv layer.
    bias_fill: If not None, the constant used to initialize the bias of the
      final conv layer.

  Returns:
    net: A keras module which when called on an input tensor of size
      [batch_size, height, width, num_in_channels] returns an output
      of size [batch_size, height, width, num_out_channels]
  """
  # The output layer is deliberately constructed before the intermediate one.
  # NOTE(review): Keras auto-generated layer names presumably depend on
  # creation order, so keep this ordering -- confirm before changing.
  output_layer = tf.keras.layers.Conv2D(num_out_channels, kernel_size=1)
  if bias_fill is not None:
    output_layer.bias_initializer = tf.keras.initializers.constant(bias_fill)

  intermediate_layer = tf.keras.layers.Conv2D(
      num_filters, kernel_size=kernel_size, padding='same')
  activation = tf.keras.layers.ReLU()

  return tf.keras.Sequential([intermediate_layer, activation, output_layer])
def _to_float32(x):
  """Returns `x` cast to `tf.float32` using `tf.cast`."""
  return tf.cast(x, dtype=tf.float32)
def _get_shape(tensor, num_dims):
  """Returns the shape of `tensor` after checking its static rank.

  The rank is a static property, so it is validated directly in Python. The
  check previously went through `tf.Assert`, whose returned op was discarded,
  so in graph mode a mismatch was never reported here.

  Args:
    tensor: A tensor exposing `get_shape()`.
    num_dims: int, the rank `tensor` is required to have.

  Returns:
    The result of `shape_utils.combined_static_and_dynamic_shape(tensor)`
    (presumably a list with one entry per dimension -- see `shape_utils`).

  Raises:
    ValueError: If the static rank of `tensor` differs from `num_dims`. An
      unknown rank (`ndims` is None) also counts as a mismatch.
  """
  static_rank = tensor.get_shape().ndims
  if static_rank != num_dims:
    raise ValueError(
        'Expected a tensor of rank %d, but got rank %s.' %
        (num_dims, static_rank))
  return shape_utils.combined_static_and_dynamic_shape(tensor)
def _flatten_spatial_dimensions(batch_images):
  """Merges the height and width axes of a rank-4 tensor into one axis.

  Args:
    batch_images: A [batch_size, height, width, channels] tensor.

  Returns:
    A [batch_size, height * width, channels] tensor.
  """
  batch_size, height, width, channels = _get_shape(batch_images, 4)
  flattened_shape = [batch_size, height * width, channels]
  return tf.reshape(batch_images, flattened_shape)
def top_k_feature_map_locations(feature_map, max_pool_kernel_size=3, k=100,
                                per_channel=False):
  """Finds the k highest-scoring locations of a feature map.

  When `max_pool_kernel_size` is larger than 1, only local peaks are kept.
  A value counts as a peak if it is within `PEAK_EPSILON` of the maximum of
  its max-pool neighborhood, computed independently for each channel. All
  other values are set to zero (not removed) before the top k are selected.
  Because filtering is per channel, the same spatial location can be returned
  for several channels.

  Args:
    feature_map: [batch, height, width, channels] float32 feature map.
    max_pool_kernel_size: integer, the max pool kernel size used to detect
      peaks in a neighborhood (independently for each channel). For example,
      to make sure no two neighboring values (in the same channel) are
      returned, set max_pool_kernel_size=3. If None or 1, no max pooling is
      applied and every location is a candidate.
    k: The number of highest scoring locations to return.
    per_channel: If True, the top k scores and locations are returned for
      each channel separately. If False, the top k over the entire feature map
      (height x width x channels) are returned.

  Returns:
    Tuple of
    scores: A [batch, N] float32 tensor with scores from the feature map. If
      per_channel is False, N = k and scores are in descending order.
      Otherwise N = k * channels; the first k elements correspond to channel
      0, the second k to channel 1, etc., each group in descending order.
    y_indices: A [batch, N] int tensor with y indices of the selected
      locations.
    x_indices: A [batch, N] int tensor with x indices of the selected
      locations.
    channel_indices: A [batch, N] int tensor with channel indices of the
      selected locations.
  """
  if max_pool_kernel_size and max_pool_kernel_size != 1:
    pooled = tf.nn.max_pool(
        feature_map, ksize=max_pool_kernel_size, strides=1, padding='SAME')
    is_peak = tf.math.abs(feature_map - pooled) < PEAK_EPSILON
    # Multiplying by the 0/1 mask zeroes every non-peak value.
    peaks = feature_map * _to_float32(is_peak)
  else:
    peaks = feature_map

  batch_size, _, width, num_channels = _get_shape(feature_map, 4)

  if not per_channel:
    flat_peaks = tf.reshape(peaks, [batch_size, -1])
    scores, flat_indices = tf.math.top_k(flat_peaks, k=k)
  else:
    # Move channels ahead of the spatial axes so that top_k runs over the
    # flattened spatial positions of each (batch, channel) pair.
    channels_first = tf.transpose(peaks, perm=[0, 3, 1, 2])
    channels_first = tf.reshape(channels_first,
                                [batch_size, num_channels, -1])
    scores, spatial_indices = tf.math.top_k(channels_first, k=k)
    # Map each spatial index back to its position in the flattened
    # [batch, height * width * channels] feature map.
    channel_offsets = tf.range(num_channels)[tf.newaxis, :, tf.newaxis]
    flat_indices = num_channels * spatial_indices + channel_offsets
    scores = tf.reshape(scores, [batch_size, -1])
    flat_indices = tf.reshape(flat_indices, [batch_size, -1])

  # Recover row, column and channel from the flat indices.
  rows, cols, channels = row_col_channel_indices_from_flattened_indices(
      flat_indices, width, num_channels)
  return scores, rows, cols, channels
def prediction_tensors_to_boxes(detection_scores, y_indices, x_indices,
                                channel_indices, height_width_predictions,
                                offset_predictions):
  """Converts CenterNet class-center, offset and size predictions to boxes.

  Each box is centered at the peak location plus the offset predicted at that
  location, and extends half of the predicted height/width in each direction.

  Args:
    detection_scores: A [batch, num_boxes] float32 tensor with detection
      scores in range [0, 1].
    y_indices: A [batch, num_boxes] int32 tensor with y indices corresponding to
      object center locations (expressed in output coordinate frame).
    x_indices: A [batch, num_boxes] int32 tensor with x indices corresponding to
      object center locations (expressed in output coordinate frame).
    channel_indices: A [batch, num_boxes] int32 tensor with channel indices
      corresponding to object classes.
    height_width_predictions: A float tensor of shape [batch_size, height,
      width, 2] representing the height and width of a box centered at each
      pixel.
    offset_predictions: A float tensor of shape [batch_size, height, width, 2]
      representing the y and x offsets of a box centered at each pixel. This
      helps reduce the error from downsampling.

  Returns:
    detection_boxes: A tensor of shape [batch_size, num_boxes, 4] holding the
      raw bounding box coordinates of boxes, ordered as
      [ymin, xmin, ymax, xmax].
    detection_classes: An integer tensor of shape [batch_size, num_boxes]
      indicating the predicted class for each box. This is `channel_indices`
      returned unchanged.
    detection_scores: A float tensor of shape [batch_size, num_boxes] indicating
      the score for each box. This is the input returned unchanged.
    num_detections: An integer tensor of shape [batch_size,] indicating the
      number of boxes detected for each sample in the batch, i.e. the number
      of entries with a strictly positive score.
  """
  _, _, width, _ = _get_shape(height_width_predictions, 4)

  # The integer indices are needed to address the flattened spatial grid, so
  # compute the flat index before converting the indices to float.
  peak_spatial_indices = flattened_indices_from_row_col_indices(
      y_indices, x_indices, width)
  y_indices = _to_float32(y_indices)
  x_indices = _to_float32(x_indices)

  height_width_flat = _flatten_spatial_dimensions(height_width_predictions)
  offsets_flat = _flatten_spatial_dimensions(offset_predictions)

  # Pick the size and offset predictions at each peak location.
  height_width = tf.gather(height_width_flat, peak_spatial_indices,
                           batch_dims=1)
  offsets = tf.gather(offsets_flat, peak_spatial_indices, batch_dims=1)

  heights, widths = tf.unstack(height_width, axis=2)
  y_offsets, x_offsets = tf.unstack(offsets, axis=2)

  detection_classes = channel_indices

  # tf.cast replaces the deprecated tf.to_int32.
  num_detections = tf.reduce_sum(
      tf.cast(detection_scores > 0, tf.int32), axis=1)

  center_y = y_indices + y_offsets
  center_x = x_indices + x_offsets
  half_heights = heights / 2.0
  half_widths = widths / 2.0
  boxes = tf.stack([center_y - half_heights,
                    center_x - half_widths,
                    center_y + half_heights,
                    center_x + half_widths], axis=2)

  return boxes, detection_classes, detection_scores, num_detections
def prediction_tensors_to_keypoint_candidates(
    keypoint_heatmap_predictions,
    keypoint_heatmap_offsets,
    keypoint_score_threshold=0.1,
    max_pool_kernel_size=1,
    max_candidates=20):
  """Convert keypoint heatmap predictions and offsets to keypoint candidates.

  Args:
    keypoint_heatmap_predictions: A float tensor of shape [batch_size, height,
      width, num_keypoints] representing the per-keypoint heatmaps.
    keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
      width, 2] (or [batch_size, height, width, 2 * num_keypoints] if
      'per_keypoint_offset' is set True) representing the per-keypoint offsets.
    keypoint_score_threshold: float, the threshold for considering a keypoint
      a candidate.
    max_pool_kernel_size: integer, the max pool kernel size to use to pull off
      peak score locations in a neighborhood. For example, to make sure no two
      neighboring values for the same keypoint are returned, set
      max_pool_kernel_size=3. If None or 1, will not apply any local filtering.
    max_candidates: integer, maximum number of keypoint candidates per
      keypoint type.

  Returns:
    keypoint_candidates: A tensor of shape
      [batch_size, max_candidates, num_keypoints, 2] holding the
      location of keypoint candidates in [y, x] format (expressed in absolute
      coordinates in the output coordinate frame).
    keypoint_scores: A float tensor of shape
      [batch_size, max_candidates, num_keypoints] with the scores for each
      keypoint candidate. The scores come directly from the heatmap predictions.
    num_keypoint_candidates: An integer tensor of shape
      [batch_size, num_keypoints] with the number of candidates for each
      keypoint type, as it's possible to filter some candidates due to the score
      threshold.
  """
  batch_size, _, width, num_keypoints = _get_shape(
      keypoint_heatmap_predictions, 4)
  # Get x, y and channel indices corresponding to the top indices in the
  # keypoint heatmap predictions.
  # Note that the top k candidates are produced for **each keypoint type**.
  # Might be worth eventually trying top k in the feature map, independent of
  # the keypoint type.
  keypoint_scores, y_indices, x_indices, channel_indices = (
      top_k_feature_map_locations(keypoint_heatmap_predictions,
                                  max_pool_kernel_size=max_pool_kernel_size,
                                  k=max_candidates,
                                  per_channel=True))

  # Flat spatial indices must be computed from the integer indices, before
  # they are converted to float for the coordinate arithmetic below.
  peak_spatial_indices = flattened_indices_from_row_col_indices(
      y_indices, x_indices, width)
  y_indices = _to_float32(y_indices)
  x_indices = _to_float32(x_indices)

  offsets_flat = _flatten_spatial_dimensions(keypoint_heatmap_offsets)

  selected_offsets = tf.gather(offsets_flat, peak_spatial_indices, batch_dims=1)
  _, num_indices, num_channels = _get_shape(selected_offsets, 3)
  if num_channels > 2:
    # Per-keypoint offsets: view the last dimension as [num_keypoints, 2] and
    # pick the (y, x) pair that belongs to each candidate's keypoint type.
    reshaped_offsets = tf.reshape(selected_offsets,
                                  [batch_size, num_indices, -1, 2])
    offsets = tf.gather(reshaped_offsets, channel_indices, batch_dims=2)
  else:
    offsets = selected_offsets
  y_offsets, x_offsets = tf.unstack(offsets, axis=2)

  keypoint_candidates = tf.stack([y_indices + y_offsets,
                                  x_indices + x_offsets], axis=2)
  # The per-channel top k output is laid out channel-major, so split the
  # flat dimension into [num_keypoints, max_candidates] and then move the
  # candidate axis in front of the keypoint axis.
  keypoint_candidates = tf.reshape(
      keypoint_candidates,
      [batch_size, num_keypoints, max_candidates, 2])
  keypoint_candidates = tf.transpose(keypoint_candidates, [0, 2, 1, 3])
  keypoint_scores = tf.reshape(
      keypoint_scores,
      [batch_size, num_keypoints, max_candidates])
  keypoint_scores = tf.transpose(keypoint_scores, [0, 2, 1])
  # tf.cast replaces the deprecated tf.to_int32.
  num_candidates = tf.reduce_sum(
      tf.cast(keypoint_scores >= keypoint_score_threshold, tf.int32), axis=1)

  return keypoint_candidates, keypoint_scores, num_candidates
def regressed_keypoints_at_object_centers(regressed_keypoint_predictions,
                                          y_indices, x_indices):
  """Gathers regressed keypoints at object centers in absolute coordinates.

  The network regresses keypoints relative to each feature map location. This
  function looks up the regression at every requested object center and adds
  the center location back, so the result is expressed in absolute coordinates
  of the output frame.

  Args:
    regressed_keypoint_predictions: A float tensor of shape
      [batch_size, height, width, 2 * num_keypoints] holding regressed
      keypoints. The last dimension has keypoint coordinates ordered as follows:
      [y0, x0, y1, x1, ..., y{J-1}, x{J-1}] where J is the number of keypoints.
    y_indices: A [batch, num_instances] int tensor holding y indices for object
      centers. These indices correspond to locations in the output feature map.
    x_indices: A [batch, num_instances] int tensor holding x indices for object
      centers. These indices correspond to locations in the output feature map.

  Returns:
    A float tensor of shape [batch_size, num_instances, 2 * num_keypoints]
    where regressed keypoints are gathered at the provided locations, and
    converted to absolute coordinates in the output coordinate frame.
  """
  batch_size, _, width, _ = _get_shape(regressed_keypoint_predictions, 4)
  center_flat_indices = flattened_indices_from_row_col_indices(
      y_indices, x_indices, width)
  _, num_instances = _get_shape(center_flat_indices, 2)

  # Look up the relative regression vector stored at every object center and
  # view it as a list of (y, x) pairs: [batch, num_instances, J, 2].
  predictions_flat = _flatten_spatial_dimensions(
      regressed_keypoint_predictions)
  relative_keypoints = tf.reshape(
      tf.gather(predictions_flat, center_flat_indices, batch_dims=1),
      [batch_size, num_instances, -1, 2])

  # Object centers as (y, x) pairs of shape [batch, num_instances, 1, 2]; the
  # singleton axis broadcasts over the keypoints of each instance.
  centers = tf.stack(
      [_to_float32(y_indices), _to_float32(x_indices)], axis=-1)
  centers = tf.expand_dims(centers, axis=2)

  absolute_keypoints = centers + relative_keypoints
  return tf.reshape(absolute_keypoints, [batch_size, num_instances, -1])
def refine_keypoints(regressed_keypoints, keypoint_candidates, keypoint_scores,
                     num_keypoint_candidates, bboxes=None,
                     unmatched_keypoint_score=0.1, box_scale=1.2,
                     candidate_search_scale=0.3,
                     candidate_ranking_mode='min_distance'):
  """Refines regressed keypoints by snapping to the nearest candidate keypoints.

  The initial regressed keypoints represent a full set of keypoints regressed
  from the centers of the objects. The keypoint candidates are estimated
  independently from heatmaps, and are not associated with any object instances.
  This function refines the regressed keypoints by "snapping" to the
  nearest/highest score/highest score-distance ratio (depending on the
  candidate_ranking_mode) candidate of the same keypoint type (e.g. "nose").
  If no candidates are nearby, the regressed keypoint remains unchanged.

  In order to snap a regressed keypoint to a candidate keypoint, the following
  must be satisfied:
  - the candidate keypoint must be of the same type as the regressed keypoint
  - the candidate keypoint must not lie outside the predicted boxes (or the
    boxes which enclose the regressed keypoints for the instance if `bboxes` is
    not provided). Note that the box is scaled by `box_scale` in height and
    width, to provide some margin around the keypoints
  - the candidate keypoint score must not be lower than
    `unmatched_keypoint_score`
  - the distance to the closest candidate keypoint cannot exceed
    candidate_search_scale * max(height, width), where height and width refer to
    the bounding box for the instance.

  Note that the same candidate keypoint is allowed to snap to regressed
  keypoints in different instances.

  Args:
    regressed_keypoints: A float tensor of shape
      [batch_size, num_instances, num_keypoints, 2] with the initial regressed
      keypoints.
    keypoint_candidates: A tensor of shape
      [batch_size, max_candidates, num_keypoints, 2] holding the location of
      keypoint candidates in [y, x] format (expressed in absolute coordinates in
      the output coordinate frame).
    keypoint_scores: A float tensor of shape
      [batch_size, max_candidates, num_keypoints] indicating the scores for
      keypoint candidates.
    num_keypoint_candidates: An integer tensor of shape
      [batch_size, num_keypoints] indicating the number of valid candidates for
      each keypoint type, as there may be padding (dim 1) of
      `keypoint_candidates` and `keypoint_scores`.
    bboxes: A tensor of shape [batch_size, num_instances, 4] with predicted
      bounding boxes for each instance, expressed in the output coordinate
      frame. If not provided, boxes will be computed from regressed keypoints.
    unmatched_keypoint_score: float, the default score to use for regressed
      keypoints that are not successfully snapped to a nearby candidate.
    box_scale: float, the multiplier to expand the bounding boxes (either the
      provided boxes or those which tightly cover the regressed keypoints) for
      an instance. This scale is typically larger than 1.0 when not providing
      `bboxes`.
    candidate_search_scale: float, the scale parameter that multiplies the
      largest dimension of a bounding box. The resulting distance becomes a
      search radius for candidates in the vicinity of each regressed keypoint.
    candidate_ranking_mode: A string as one of ['min_distance',
      'score_distance_ratio'] indicating how to select the candidate. If an
      invalid value is provided, a ValueError will be raised.

  Returns:
    A tuple with:
    refined_keypoints: A float tensor of shape
      [batch_size, num_instances, num_keypoints, 2] with the final, refined
      keypoints.
    refined_scores: A float tensor of shape
      [batch_size, num_instances, num_keypoints] with scores associated with all
      instances and keypoints in `refined_keypoints`.

  Raises:
    ValueError: if provided candidate_ranking_mode is not one of
      ['min_distance', 'score_distance_ratio']
  """
  batch_size, num_instances, num_keypoints, _ = (
      shape_utils.combined_static_and_dynamic_shape(regressed_keypoints))
  max_candidates = keypoint_candidates.shape[1]

  # Replace all invalid (i.e. padded) keypoint candidates with NaN.
  # This marks them so that they can be excluded from the distance ranking.
  range_tiled = tf.tile(
      tf.reshape(tf.range(max_candidates), [1, max_candidates, 1]),
      [batch_size, 1, num_keypoints])
  num_candidates_tiled = tf.tile(tf.expand_dims(num_keypoint_candidates, 1),
                                 [1, max_candidates, 1])
  invalid_candidates = range_tiled >= num_candidates_tiled
  nan_mask = tf.where(
      invalid_candidates,
      np.nan * tf.ones_like(invalid_candidates, dtype=tf.float32),
      tf.ones_like(invalid_candidates, dtype=tf.float32))
  keypoint_candidates_with_nans = tf.math.multiply(
      keypoint_candidates, tf.expand_dims(nan_mask, -1))

  # Pairwise squared distances between regressed keypoints and candidate
  # keypoints (for a single keypoint type).
  # Shape [batch_size, num_instances, max_candidates, num_keypoints].
  regressed_keypoint_expanded = tf.expand_dims(regressed_keypoints,
                                               axis=2)
  keypoint_candidates_expanded = tf.expand_dims(
      keypoint_candidates_with_nans, axis=1)
  sqrd_distances = tf.math.reduce_sum(
      tf.math.squared_difference(regressed_keypoint_expanded,
                                 keypoint_candidates_expanded),
      axis=-1)
  distances = tf.math.sqrt(sqrd_distances)

  # Replace NaN distances with +inf. The ordering of NaN in
  # reduce_min/argmin/argmax is not well defined, and `NaN > search_radius`
  # evaluates to False, which would let an instance with no valid candidate
  # pass the distance test below. With +inf, padded candidates always rank
  # last and always fail the search-radius check.
  distances = tf.where(
      tf.math.is_nan(distances),
      np.inf * tf.ones_like(distances),
      distances)

  # Determine the candidates that have the minimum distance to the regressed
  # keypoints. Shape [batch_size, num_instances, num_keypoints].
  min_distances = tf.math.reduce_min(distances, axis=2)
  if candidate_ranking_mode == 'min_distance':
    nearby_candidate_inds = tf.math.argmin(distances, axis=2)
  elif candidate_ranking_mode == 'score_distance_ratio':
    # tiled_keypoint_scores:
    # Shape [batch_size, num_instances, max_candidates, num_keypoints].
    tiled_keypoint_scores = tf.tile(
        tf.expand_dims(keypoint_scores, axis=1),
        multiples=[1, num_instances, 1, 1])
    # The small constant avoids a division by zero for exact matches.
    ranking_scores = tiled_keypoint_scores / (distances + 1e-6)
    nearby_candidate_inds = tf.math.argmax(ranking_scores, axis=2)
  else:
    raise ValueError('Not recognized candidate_ranking_mode: %s' %
                     candidate_ranking_mode)

  # Gather the coordinates and scores corresponding to the closest candidates.
  # Shape of tensors are [batch_size, num_instances, num_keypoints, 2] and
  # [batch_size, num_instances, num_keypoints], respectively.
  nearby_candidate_coords, nearby_candidate_scores = (
      _gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
                                    nearby_candidate_inds))

  if bboxes is None:
    # Create bboxes from regressed keypoints.
    # Shape [batch_size * num_instances, 4].
    regressed_keypoints_flattened = tf.reshape(
        regressed_keypoints, [-1, num_keypoints, 2])
    bboxes_flattened = keypoint_ops.keypoints_to_enclosing_bounding_boxes(
        regressed_keypoints_flattened)
  else:
    bboxes_flattened = tf.reshape(bboxes, [-1, 4])

  # Scale the bounding boxes.
  # Shape [batch_size, num_instances, 4].
  boxlist = box_list.BoxList(bboxes_flattened)
  boxlist_scaled = box_list_ops.scale_height_width(
      boxlist, box_scale, box_scale)
  bboxes_scaled = boxlist_scaled.get()
  bboxes = tf.reshape(bboxes_scaled, [batch_size, num_instances, 4])

  # Get ymin, xmin, ymax, xmax bounding box coordinates, tiled per keypoint.
  # Shape [batch_size, num_instances, num_keypoints].
  bboxes_tiled = tf.tile(tf.expand_dims(bboxes, 2), [1, 1, num_keypoints, 1])
  ymin, xmin, ymax, xmax = tf.unstack(bboxes_tiled, axis=3)

  # Produce a mask that indicates whether the original regressed keypoint
  # should be used instead of a candidate keypoint.
  # Shape [batch_size, num_instances, num_keypoints].
  search_radius = (
      tf.math.maximum(ymax - ymin, xmax - xmin) * candidate_search_scale)
  mask = (tf.cast(nearby_candidate_coords[:, :, :, 0] < ymin, tf.int32) +
          tf.cast(nearby_candidate_coords[:, :, :, 0] > ymax, tf.int32) +
          tf.cast(nearby_candidate_coords[:, :, :, 1] < xmin, tf.int32) +
          tf.cast(nearby_candidate_coords[:, :, :, 1] > xmax, tf.int32) +
          # Filter out the chosen candidate with score lower than unmatched
          # keypoint score.
          tf.cast(nearby_candidate_scores <
                  unmatched_keypoint_score, tf.int32) +
          tf.cast(min_distances > search_radius, tf.int32))
  mask = mask > 0

  # Create refined keypoints where candidate keypoints replace original
  # regressed keypoints if they are in the vicinity of the regressed keypoints.
  # Shape [batch_size, num_instances, num_keypoints, 2].
  refined_keypoints = tf.where(
      tf.tile(tf.expand_dims(mask, -1), [1, 1, 1, 2]),
      regressed_keypoints,
      nearby_candidate_coords)

  # Update keypoints scores. In the case where we use the original regressed
  # keypoints, we use a default score of `unmatched_keypoint_score`.
  # Shape [batch_size, num_instances, num_keypoints].
  refined_scores = tf.where(
      mask,
      unmatched_keypoint_score * tf.ones_like(nearby_candidate_scores),
      nearby_candidate_scores)

  return refined_keypoints, refined_scores
def _pad_to_full_keypoint_dim(keypoint_coords, keypoint_scores, keypoint_inds,
                              num_total_keypoints):
  """Expands keypoint tensors to the model's full keypoint dimension.

  The keypoints of a single keypoint class are scattered into the slots given
  by `keypoint_inds` of tensors whose keypoint axis has `num_total_keypoints`
  entries.

  Args:
    keypoint_coords: a [batch_size, num_instances, num_keypoints, 2] float32
      tensor.
    keypoint_scores: a [batch_size, num_instances, num_keypoints] float32
      tensor.
    keypoint_inds: a list of integers that indicate the keypoint indices for
      this specific keypoint class. These indices are used to scatter into
      tensors that have a `num_total_keypoints` dimension.
    num_total_keypoints: The total number of keypoints that this model predicts.

  Returns:
    A tuple with
    keypoint_coords_padded: a
      [batch_size, num_instances, num_total_keypoints, 2] float32 tensor.
    keypoint_scores_padded: a [batch_size, num_instances, num_total_keypoints]
      float32 tensor.
  """
  batch_size, num_instances, _, _ = (
      shape_utils.combined_static_and_dynamic_shape(keypoint_coords))

  # tf.scatter_nd indexes the leading axis, so move the keypoint axis to the
  # front, scatter, and then restore the original axis order.
  coords_keypoint_major = tf.transpose(keypoint_coords, [2, 0, 1, 3])
  scores_keypoint_major = tf.transpose(keypoint_scores, [2, 0, 1])
  scatter_indices = tf.expand_dims(keypoint_inds, axis=-1)

  coords_full = tf.scatter_nd(
      indices=scatter_indices,
      updates=coords_keypoint_major,
      shape=[num_total_keypoints, batch_size, num_instances, 2])
  scores_full = tf.scatter_nd(
      indices=scatter_indices,
      updates=scores_keypoint_major,
      shape=[num_total_keypoints, batch_size, num_instances])

  return (tf.transpose(coords_full, [1, 2, 0, 3]),
          tf.transpose(scores_full, [1, 2, 0]))
def _pad_to_full_instance_dim(keypoint_coords, keypoint_scores, instance_inds,
                              max_instances):
  """Expands keypoint tensors to the model's full instance dimension.

  The keypoints of the given instances are scattered into the slots given by
  `instance_inds` of tensors whose instance axis has `max_instances` entries.

  Args:
    keypoint_coords: a [batch_size, num_instances, num_keypoints, 2] float32
      tensor.
    keypoint_scores: a [batch_size, num_instances, num_keypoints] float32
      tensor.
    instance_inds: a list of integers that indicate the instance indices for
      these keypoints. These indices are used to scatter into tensors
      that have a `max_instances` dimension.
    max_instances: The maximum number of instances detected by the model.

  Returns:
    A tuple with
    keypoint_coords_padded: a [batch_size, max_instances, num_keypoints, 2]
      float32 tensor.
    keypoint_scores_padded: a [batch_size, max_instances, num_keypoints]
      float32 tensor.
  """
  batch_size, _, num_keypoints, _ = (
      shape_utils.combined_static_and_dynamic_shape(keypoint_coords))

  # tf.scatter_nd indexes the leading axis, so swap the batch and instance
  # axes, scatter, and then swap them back.
  coords_instance_major = tf.transpose(keypoint_coords, [1, 0, 2, 3])
  scores_instance_major = tf.transpose(keypoint_scores, [1, 0, 2])
  scatter_indices = tf.expand_dims(instance_inds, axis=-1)

  coords_full = tf.scatter_nd(
      indices=scatter_indices,
      updates=coords_instance_major,
      shape=[max_instances, batch_size, num_keypoints, 2])
  scores_full = tf.scatter_nd(
      indices=scatter_indices,
      updates=scores_instance_major,
      shape=[max_instances, batch_size, num_keypoints])

  return (tf.transpose(coords_full, [1, 0, 2, 3]),
          tf.transpose(scores_full, [1, 0, 2]))
def _gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
                                  indices):
  """Selects candidate coordinates and scores per keypoint type.

  Args:
    keypoint_candidates: a float tensor of shape [batch_size, max_candidates,
      num_keypoints, 2] with candidate coordinates.
    keypoint_scores: a float tensor of shape [batch_size, max_candidates,
      num_keypoints] with keypoint scores.
    indices: an integer tensor of shape [batch_size, num_indices, num_keypoints]
      with indices.

  Returns:
    A tuple with
    gathered_keypoint_candidates: a float tensor of shape [batch_size,
      num_indices, num_keypoints, 2] with gathered coordinates.
    gathered_keypoint_scores: a float tensor of shape [batch_size,
      num_indices, num_keypoints] with gathered scores.
  """
  # The gather below treats the two leading axes as batch dimensions, so the
  # keypoint axis is moved next to the batch axis for all three tensors.
  indices_by_keypoint = tf.transpose(indices, [0, 2, 1])
  coords_by_keypoint = tf.gather(
      tf.transpose(keypoint_candidates, [0, 2, 1, 3]),
      indices_by_keypoint,
      batch_dims=2)
  scores_by_keypoint = tf.gather(
      tf.transpose(keypoint_scores, [0, 2, 1]),
      indices_by_keypoint,
      batch_dims=2)

  # Restore the [batch, index, keypoint, ...] axis order.
  return (tf.transpose(coords_by_keypoint, [0, 2, 1, 3]),
          tf.transpose(scores_by_keypoint, [0, 2, 1]))
def flattened_indices_from_row_col_indices(row_indices, col_indices, num_cols):
  """Returns row-major flat indices for the given row and column indices."""
  row_start = row_indices * num_cols
  return row_start + col_indices
def row_col_channel_indices_from_flattened_indices(indices, num_cols,
                                                   num_channels):
  """Splits flattened indices into row, column and channel indices.

  The flattened index is decoded with the channel as the fastest-varying
  component, followed by the column and then the row.

  Args:
    indices: An integer tensor of any shape holding the indices in the flattened
      space.
    num_cols: Number of columns in the image (width).
    num_channels: Number of channels in the image.

  Returns:
    row_indices: The row indices corresponding to each of the input indices.
      Same shape as indices.
    col_indices: The column indices corresponding to each of the input indices.
      Same shape as indices.
    channel_indices: The channel indices corresponding to each of the input
      indices.
  """
  # Dropping the channel component leaves the flat spatial (row, col) index.
  spatial_indices = indices // num_channels
  channel_indices = indices % num_channels
  row_indices = spatial_indices // num_cols
  col_indices = spatial_indices % num_cols
  return row_indices, col_indices, channel_indices
def get_valid_anchor_weights_in_flattened_image(true_image_shapes, height,
                                                width):
  """Builds per-pixel weights that mask out padded regions of a flat image.

  This is useful when only the valid (unpadded) area of each image should be
  penalized. The weights are laid out for a loss that is applied after the
  spatial dimensions have been flattened.

  Args:
    true_image_shapes: An integer tensor of shape [batch_size, 3] representing
      the true image shape (without padding) for each sample in the batch.
    height: height of the prediction from the network.
    width: width of the prediction from the network.

  Returns:
    valid_anchor_weights: a float tensor of shape [batch_size, height * width]
    with 1s in locations where the spatial coordinates fall within the height
    and width in true_image_shapes.
  """
  # Flat pixel indices [0, height * width), repeated for every batch element.
  flat_indices = tf.reshape(tf.range(height * width), [1, -1])
  batch_size = tf.shape(true_image_shapes)[0]
  tiled_indices = tf.ones((batch_size, 1), dtype=tf.int32) * flat_indices

  # A single channel is used, so only the row/column components matter.
  rows, cols, _ = row_col_channel_indices_from_flattened_indices(
      tiled_indices, width, 1)

  # Per-image bounds with a trailing axis that broadcasts over the pixels.
  true_heights = true_image_shapes[:, 0]
  true_widths = true_image_shapes[:, 1]
  width_limit = _to_float32(tf.expand_dims(true_widths, 1))
  height_limit = _to_float32(tf.expand_dims(true_heights, 1))

  cols = _to_float32(cols)
  rows = _to_float32(rows)

  inside_image = tf.math.logical_and(cols < width_limit, rows < height_limit)
  return _to_float32(inside_image)
def convert_strided_predictions_to_normalized_boxes(boxes, stride,
                                                    true_image_shapes):
  """Maps boxes from the model's output space to normalized coordinates.

  Each image's boxes are scaled by the stride, normalized by that image's true
  height and width, and clipped to the [0, 1] window. Boxes falling outside the
  valid image boundary are clipped to be on the boundary rather than removed.

  Args:
    boxes: A tensor of shape [batch_size, num_boxes, 4] holding the raw
      coordinates of boxes in the model's output space.
    stride: The stride in the output space.
    true_image_shapes: A tensor of shape [batch_size, 3] representing the true
      shape of the input not considering padding.

  Returns:
    boxes: A tensor of shape [batch_size, num_boxes, 4] representing the
      coordinates of the normalized boxes.
  """

  def _normalize_boxlist(boxlist, true_height, true_width):
    """Scales, normalizes and clips the boxes of a single image."""
    boxlist = box_list_ops.scale(boxlist, stride, stride)
    boxlist = box_list_ops.to_normalized_coordinates(
        boxlist, true_height, true_width)
    return box_list_ops.clip_to_window(
        boxlist, [0., 0., 1., 1.], filter_nonoverlapping=False)

  # Work image by image, since every image has its own true shape.
  per_image_boxlists = [
      box_list.BoxList(image_boxes)
      for image_boxes in tf.unstack(boxes, axis=0)
  ]
  true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
  per_image_heights = tf.unstack(true_heights, axis=0)
  per_image_widths = tf.unstack(true_widths, axis=0)

  normalized_boxlists = [
      _normalize_boxlist(boxlist, true_height, true_width)
      for boxlist, true_height, true_width in zip(
          per_image_boxlists, per_image_heights, per_image_widths)
  ]
  return tf.stack([boxlist.get() for boxlist in normalized_boxlists], axis=0)
def convert_strided_predictions_to_normalized_keypoints(
    keypoint_coords, keypoint_scores, stride, true_image_shapes,
    clip_out_of_frame_keypoints=False):
  """Maps keypoints from the model's output space to normalized coordinates.

  If clip_out_of_frame_keypoints=False, keypoint coordinates falling outside
  the valid image boundary are normalized but not clipped; If
  clip_out_of_frame_keypoints=True, keypoint coordinates falling outside the
  valid image boundary are clipped to the closest image boundary and the scores
  will be set to 0.0.

  Args:
    keypoint_coords: A tensor of shape
      [batch_size, num_instances, num_keypoints, 2] holding the raw coordinates
      of keypoints in the model's output space.
    keypoint_scores: A tensor of shape
      [batch_size, num_instances, num_keypoints] holding the keypoint scores.
    stride: The stride in the output space.
    true_image_shapes: A tensor of shape [batch_size, 3] representing the true
      shape of the input not considering padding.
    clip_out_of_frame_keypoints: A boolean indicating whether keypoints outside
      the image boundary should be clipped. If True, keypoint coords will be
      clipped to image boundary. If False, keypoints are normalized but not
      filtered based on their location.

  Returns:
    keypoint_coords_normalized: A tensor of shape
      [batch_size, num_instances, num_keypoints, 2] representing the coordinates
      of the normalized keypoints.
    keypoint_scores: A tensor of shape
      [batch_size, num_instances, num_keypoints] representing the updated
      keypoint scores.
  """
  # Only the batch size is needed, to shape the per-image scale factors.
  batch_size, _, _, _ = (
      shape_utils.combined_static_and_dynamic_shape(keypoint_coords))

  # Per-image (y, x) factors that undo the stride and divide by the true
  # image size in a single multiplication.
  true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
  y_factor = float(stride) / tf.cast(true_heights, tf.float32)
  x_factor = float(stride) / tf.cast(true_widths, tf.float32)
  yx_factors = tf.reshape(
      tf.stack([y_factor, x_factor], axis=1), [batch_size, 1, 1, 2])
  keypoint_coords_normalized = keypoint_coords * yx_factors

  if not clip_out_of_frame_keypoints:
    return keypoint_coords_normalized, keypoint_scores

  # Record which keypoints lie inside the true image region. This has to be
  # evaluated on the unclipped coordinates.
  y_normalized = keypoint_coords_normalized[:, :, :, 0]
  x_normalized = keypoint_coords_normalized[:, :, :, 1]
  y_in_frame = tf.logical_and(y_normalized >= 0.0, y_normalized <= 1.0)
  x_in_frame = tf.logical_and(x_normalized >= 0.0, x_normalized <= 1.0)
  in_frame = tf.logical_and(y_in_frame, x_in_frame)

  # One unit window per image.
  unit_windows = tf.tile(
      tf.constant([[0.0, 0.0, 1.0, 1.0]], dtype=tf.float32),
      multiples=[batch_size, 1])

  def _clip_single_image(inputs):
    """Clips the keypoints of one image to its window."""
    image_keypoints, image_window = inputs
    return keypoint_ops.clip_to_window(image_keypoints, image_window)

  keypoint_coords_normalized = tf.map_fn(
      _clip_single_image, (keypoint_coords_normalized, unit_windows),
      dtype=tf.float32, back_prop=False)
  # Out-of-frame keypoints keep their clipped location but lose their score.
  keypoint_scores = tf.where(in_frame, keypoint_scores,
                             tf.zeros_like(keypoint_scores))
  return keypoint_coords_normalized, keypoint_scores
def convert_strided_predictions_to_instance_masks(
    boxes, classes, masks, stride, mask_height, mask_width,
    true_image_shapes, score_threshold=0.5):
  """Turns full-image class masks into fixed-size per-instance masks.

  For each predicted detection box:
  * Crop and resize the predicted mask based on the detected bounding box
    coordinates and class prediction. Uses bilinear resampling.
  * Binarize the mask using the provided score threshold.

  Args:
    boxes: A tensor of shape [batch, max_detections, 4] holding the predicted
      boxes, in normalized coordinates (relative to the true image dimensions).
    classes: An integer tensor of shape [batch, max_detections] containing the
      detected class for each box (0-indexed).
    masks: A [batch, output_height, output_width, num_classes] float32
      tensor with class probabilities.
    stride: The stride in the output space.
    mask_height: The desired resized height for instance masks.
    mask_width: The desired resized width for instance masks.
    true_image_shapes: A tensor of shape [batch, 3] representing the true
      shape of the inputs not considering padding.
    score_threshold: The threshold at which to convert predicted mask
      into foreground pixels.

  Returns:
    A [batch_size, max_detections, mask_height, mask_width] uint8 tensor with
    predicted foreground mask for each instance. The masks take values in
    {0, 1}.
  """
  _, output_height, output_width, _ = (
      shape_utils.combined_static_and_dynamic_shape(masks))
  # Size of the (possibly padded) network input that the masks correspond to.
  input_height = stride * output_height
  input_width = stride * output_width

  def crop_and_threshold_masks(args):
    """Crops, resizes and binarizes the masks of a single image."""
    image_boxes, image_classes, image_masks, true_height, true_width = args

    # The boxes are normalized relative to the true image shape, while the
    # masks may still include padding. Rescale the boxes so that they are
    # normalized relative to the input image shape instead.
    y_scale = true_height / input_height
    x_scale = true_width / input_width
    rescaled_boxlist = box_list_ops.scale(
        box_list.BoxList(image_boxes), y_scale, x_scale)
    rescaled_boxes = rescaled_boxlist.get()

    # Rearrange the masks from [output_height, output_width, num_classes] to
    # [num_classes, output_height, output_width, 1], so that every class map
    # acts as a separate single-channel image selected by `box_indices`.
    class_major_masks = tf.expand_dims(
        tf.transpose(image_masks, perm=[2, 0, 1]), axis=3)
    cropped = tf2.image.crop_and_resize(
        class_major_masks,
        boxes=rescaled_boxes,
        box_indices=image_classes,
        crop_size=[mask_height, mask_width],
        method='bilinear')
    cropped = tf.squeeze(cropped, axis=3)
    foreground = tf.math.greater_equal(cropped, score_threshold)
    return tf.cast(foreground, tf.uint8)

  true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
  masks_for_image = shape_utils.static_or_dynamic_map_fn(
      crop_and_threshold_masks,
      elems=[boxes, classes, masks, true_heights, true_widths],
      dtype=tf.uint8,
      back_prop=False)
  return tf.stack(masks_for_image, axis=0)
class ObjectDetectionParams(
    collections.namedtuple('ObjectDetectionParams', [
        'localization_loss',
        'scale_loss_weight',
        'offset_loss_weight',
        'task_loss_weight',
    ])):
  """Immutable bundle of settings for the object detection task.

  Groups the loss function and the hyper-parameters needed for the object
  detection task. Instances cannot be modified after construction. See
  `__new__` for the meaning of each field.
  """

  # No per-instance __dict__: the tuple fields are the only state.
  __slots__ = ()

  def __new__(cls,
              localization_loss,
              scale_loss_weight,
              offset_loss_weight,
              task_loss_weight=1.0):
    """Creates an ObjectDetectionParams, filling in default values.

    Args:
      localization_loss: a object_detection.core.losses.Loss object to compute
        the loss for the center offset and height/width predictions in
        CenterNet.
      scale_loss_weight: float, The weight for localizing box size. Note that
        the scale loss is dependent on the input image size, since we penalize
        the raw height and width. This constant may need to be adjusted
        depending on the input size.
      offset_loss_weight: float, The weight for localizing center offsets.
      task_loss_weight: float, the weight of the object detection loss.

    Returns:
      An initialized ObjectDetectionParams namedtuple.
    """
    return super(ObjectDetectionParams, cls).__new__(
        cls,
        localization_loss=localization_loss,
        scale_loss_weight=scale_loss_weight,
        offset_loss_weight=offset_loss_weight,
        task_loss_weight=task_loss_weight)
class KeypointEstimationParams(
    collections.namedtuple(
        'KeypointEstimationParams',
        ('task_name', 'class_id', 'keypoint_indices', 'classification_loss',
         'localization_loss', 'keypoint_labels', 'keypoint_std_dev',
         'keypoint_heatmap_loss_weight', 'keypoint_offset_loss_weight',
         'keypoint_regression_loss_weight',
         'keypoint_candidate_score_threshold', 'heatmap_bias_init',
         'num_candidates_per_keypoint', 'task_loss_weight',
         'peak_max_pool_kernel_size', 'unmatched_keypoint_score', 'box_scale',
         'candidate_search_scale', 'candidate_ranking_mode',
         'offset_peak_radius', 'per_keypoint_offset'))):
  """Immutable bundle of settings for one keypoint estimation task.

  Groups the loss functions and hyper-parameters needed by a keypoint
  estimation task. Being a namedtuple, an instance cannot be modified once it
  has been built. Each field is described in the `__new__` docstring.
  """

  __slots__ = ()

  def __new__(cls,
              task_name,
              class_id,
              keypoint_indices,
              classification_loss,
              localization_loss,
              keypoint_labels=None,
              keypoint_std_dev=None,
              keypoint_heatmap_loss_weight=1.0,
              keypoint_offset_loss_weight=1.0,
              keypoint_regression_loss_weight=1.0,
              keypoint_candidate_score_threshold=0.1,
              heatmap_bias_init=-2.19,
              num_candidates_per_keypoint=100,
              task_loss_weight=1.0,
              peak_max_pool_kernel_size=3,
              unmatched_keypoint_score=0.1,
              box_scale=1.2,
              candidate_search_scale=0.3,
              candidate_ranking_mode='min_distance',
              offset_peak_radius=0,
              per_keypoint_offset=False):
    """Builds a KeypointEstimationParams, filling in default values.

    Args:
      task_name: string, the name of the task this namedtuple describes. It
        should uniquely identify the task.
      class_id: int, the ID of the class that owns the keypoints handled by
        this task. For human pose estimation this would be the ID of the
        "human" class. IDs are 0-based: class 0 is the first non-background
        object class.
      keypoint_indices: A list of integers giving the indices of the keypoints
        handled by this task. They select the relevant subset of keypoints
        from gt_keypoints.
      classification_loss: an object_detection.core.losses.Loss object used to
        compute the loss for the class predictions in CenterNet.
      localization_loss: an object_detection.core.losses.Loss object used to
        compute the loss for the center offset and height/width predictions in
        CenterNet.
      keypoint_labels: A list of strings with the label text of each keypoint,
        e.g. "nose", "left_shoulder". Its length should equal that of
        keypoint_indices.
      keypoint_std_dev: A list of floats with the standard deviation of the
        Gaussian kernel used to generate each keypoint's heatmap, which allows
        a different kernel size per keypoint class.
      keypoint_heatmap_loss_weight: float, the weight of the keypoint heatmap
        loss.
      keypoint_offset_loss_weight: float, the weight of the keypoint offset
        loss.
      keypoint_regression_loss_weight: float, the weight of the keypoint
        regression loss. The raw height and width are penalized, so this loss
        depends on the input image size and the weight may need tuning for a
        given input size.
      keypoint_candidate_score_threshold: float, the heatmap score a keypoint
        must reach to become a valid candidate.
      heatmap_bias_init: float, the initial value of the bias in the
        convolutional kernel of the class prediction head. If set to None, the
        bias is initialized with zeros.
      num_candidates_per_keypoint: The maximum number of candidates to
        retrieve for each keypoint.
      task_loss_weight: float, the weight of the whole keypoint estimation
        loss.
      peak_max_pool_kernel_size: Max pool kernel size used to pull off peak
        score locations in a neighborhood (independently for each keypoint
        type).
      unmatched_keypoint_score: The default score for regressed keypoints that
        are not successfully snapped to a nearby candidate.
      box_scale: The multiplier used to expand the bounding boxes (either the
        provided boxes or those which tightly cover the regressed keypoints).
      candidate_search_scale: The scale that multiplies the largest dimension
        of a bounding box. The resulting distance is the search radius for
        candidates in the vicinity of each regressed keypoint.
      candidate_ranking_mode: One of ['min_distance', 'score_distance_ratio'],
        selecting how the keypoint candidate is chosen.
      offset_peak_radius: The radius (in output pixels) around a groundtruth
        heatmap peak within which offset targets are assigned. With 0 the
        offset target is assigned only at the heatmap peak (same behavior as
        the original paper).
      per_keypoint_offset: A bool selecting whether offsets are assigned for
        each keypoint channel separately. If False, the output offset target
        has the shape [batch_size, out_height, out_width, 2] (same behavior as
        the original paper). If True, it has the shape
        [batch_size, out_height, out_width, 2 * num_keypoints] (recommended
        when offset_peak_radius is not zero).

    Returns:
      An initialized KeypointEstimationParams namedtuple.
    """
    base = super(KeypointEstimationParams, cls)
    return base.__new__(
        cls,
        task_name=task_name,
        class_id=class_id,
        keypoint_indices=keypoint_indices,
        classification_loss=classification_loss,
        localization_loss=localization_loss,
        keypoint_labels=keypoint_labels,
        keypoint_std_dev=keypoint_std_dev,
        keypoint_heatmap_loss_weight=keypoint_heatmap_loss_weight,
        keypoint_offset_loss_weight=keypoint_offset_loss_weight,
        keypoint_regression_loss_weight=keypoint_regression_loss_weight,
        keypoint_candidate_score_threshold=keypoint_candidate_score_threshold,
        heatmap_bias_init=heatmap_bias_init,
        num_candidates_per_keypoint=num_candidates_per_keypoint,
        task_loss_weight=task_loss_weight,
        peak_max_pool_kernel_size=peak_max_pool_kernel_size,
        unmatched_keypoint_score=unmatched_keypoint_score,
        box_scale=box_scale,
        candidate_search_scale=candidate_search_scale,
        candidate_ranking_mode=candidate_ranking_mode,
        offset_peak_radius=offset_peak_radius,
        per_keypoint_offset=per_keypoint_offset)
class ObjectCenterParams(
    collections.namedtuple('ObjectCenterParams', [
        'classification_loss', 'object_center_loss_weight', 'heatmap_bias_init',
        'min_box_overlap_iou', 'max_box_predictions', 'use_only_known_classes'
    ])):
  """Namedtuple to store object center prediction related parameters.

  The last field is stored as `use_only_known_classes`, while the constructor
  accepts it under the keyword `use_labeled_classes`. Both names are kept for
  backward compatibility; the read-only `use_labeled_classes` property exposes
  the stored value under the constructor's name as well.
  """

  __slots__ = ()

  def __new__(cls,
              classification_loss,
              object_center_loss_weight,
              heatmap_bias_init=-2.19,
              min_box_overlap_iou=0.7,
              max_box_predictions=100,
              use_labeled_classes=False):
    """Constructor with default values for ObjectCenterParams.

    Args:
      classification_loss: an object_detection.core.losses.Loss object to
        compute the loss for the class predictions in CenterNet.
      object_center_loss_weight: float, The weight for the object center loss.
      heatmap_bias_init: float, the initial value of bias in the convolutional
        kernel of the object center prediction head. If set to None, the bias is
        initialized with zeros.
      min_box_overlap_iou: float, the minimum IOU overlap that predicted boxes
        need have with groundtruth boxes to not be penalized. This is used for
        computing the class specific center heatmaps.
      max_box_predictions: int, the maximum number of boxes to predict.
      use_labeled_classes: boolean, compute the loss only for labeled classes.
        Stored in the `use_only_known_classes` field.

    Returns:
      An initialized ObjectCenterParams namedtuple.
    """
    # Passed positionally on purpose: the keyword accepted here differs from
    # the name of the namedtuple field that stores the value.
    return super(ObjectCenterParams,
                 cls).__new__(cls, classification_loss,
                              object_center_loss_weight, heatmap_bias_init,
                              min_box_overlap_iou, max_box_predictions,
                              use_labeled_classes)

  @property
  def use_labeled_classes(self):
    """Alias of `use_only_known_classes` under the constructor keyword name."""
    return self.use_only_known_classes
class MaskParams(
    collections.namedtuple(
        'MaskParams',
        ('classification_loss', 'task_loss_weight', 'mask_height',
         'mask_width', 'score_threshold', 'heatmap_bias_init'))):
  """Immutable bundle of settings for the mask prediction task."""

  __slots__ = ()

  def __new__(cls,
              classification_loss,
              task_loss_weight=1.0,
              mask_height=256,
              mask_width=256,
              score_threshold=0.5,
              heatmap_bias_init=-2.19):
    """Builds a MaskParams, filling in default values.

    Args:
      classification_loss: an object_detection.core.losses.Loss object used to
        compute the loss for the semantic segmentation predictions in
        CenterNet.
      task_loss_weight: float, the weight of the segmentation task loss.
      mask_height: The height of the resized instance segmentation mask.
      mask_width: The width of the resized instance segmentation mask.
      score_threshold: The threshold at which predicted mask probabilities
        (after passing through sigmoid) are converted into foreground pixels.
      heatmap_bias_init: float, the initial value of the bias in the
        convolutional kernel of the semantic segmentation prediction head. If
        set to None, the bias is initialized with zeros.

    Returns:
      An initialized MaskParams namedtuple.
    """
    base = super(MaskParams, cls)
    return base.__new__(
        cls,
        classification_loss=classification_loss,
        task_loss_weight=task_loss_weight,
        mask_height=mask_height,
        mask_width=mask_width,
        score_threshold=score_threshold,
        heatmap_bias_init=heatmap_bias_init)
# String constants from which the CenterNetMetaArch class builds the keys of
# its prediction head, loss and target assigner dictionaries.
# Key of the box target assigner.
DETECTION_TASK = 'detection_task'
# Key shared by the object center prediction heads and their target assigner.
OBJECT_CENTER = 'object_center'
# Keys of the box size and box center offset heads and of their losses.
BOX_SCALE = 'box/scale'
BOX_OFFSET = 'box/offset'
# Head names for keypoint tasks. They are not used as keys directly:
# get_keypoint_name() prefixes them with the task name.
KEYPOINT_REGRESSION = 'keypoint/regression'
KEYPOINT_HEATMAP = 'keypoint/heatmap'
KEYPOINT_OFFSET = 'keypoint/offset'
# Key of the mask target assigner.
SEGMENTATION_TASK = 'segmentation_task'
# Key of the segmentation heads and of the mask loss.
SEGMENTATION_HEATMAP = 'segmentation/heatmap'
# NOTE(review): presumably prepended to the keys of the final loss dictionary;
# its use is not visible near these definitions -- confirm.
LOSS_KEY_PREFIX = 'Loss'
def get_keypoint_name(task_name, head_name):
  """Returns the dictionary key of a keypoint head: '<task_name>/<head_name>'.

  Args:
    task_name: The name of the keypoint estimation task.
    head_name: The name of the head, e.g. KEYPOINT_HEATMAP.

  Returns:
    A string joining the two names with a forward slash.
  """
  name_parts = (task_name, head_name)
  return '%s/%s' % name_parts
def get_num_instances_from_weights(groundtruth_weights_list):
  """Counts the instances/boxes in a batch from their groundtruth weights.

  Args:
    groundtruth_weights_list: A list (one entry per image in the batch) of
      float tensors with shape [max_num_instances]. A non-zero value marks an
      actual instance; 0.0 marks padding added to reach max_num_instances.

  Returns:
    A scalar integer tensor indicating how many instances/boxes are in the
    images of the batch. The value is typically used to normalize a loss, so
    it is clamped to a minimum of 1 to avoid weird behavior.
  """
  per_image_counts = [
      tf.math.count_nonzero(image_weights)
      for image_weights in groundtruth_weights_list
  ]
  total_count = tf.reduce_sum(per_image_counts)
  # Never report zero: callers divide by this value.
  return tf.maximum(total_count, 1)
class CenterNetMetaArch(model.DetectionModel):
"""The CenterNet meta architecture [1].
[1]: https://arxiv.org/abs/1904.07850
"""
def __init__(self,
             is_training,
             add_summaries,
             num_classes,
             feature_extractor,
             image_resizer_fn,
             object_center_params,
             object_detection_params=None,
             keypoint_params_dict=None,
             mask_params=None):
  """Initializes a CenterNet model.

  Stores the configuration, builds the prediction heads and the target
  assigners, and finally calls the base-class constructor.

  Args:
    is_training: Set to True if this model is being built for training.
    add_summaries: Whether to add tf summaries in the model. This value is
      not stored or otherwise used by this constructor.
    num_classes: int, The number of classes that the model should predict.
    feature_extractor: A CenterNetFeatureExtractor to use to extract features
      from an image.
    image_resizer_fn: a callable for image resizing. This callable always
      takes a rank-3 image tensor (corresponding to a single image) and
      returns a rank-3 image tensor, possibly with new spatial dimensions and
      a 1-D tensor of shape [3] indicating shape of true image within the
      resized image tensor as the resized image tensor could be padded. See
      builders/image_resizer_builder.py.
    object_center_params: An ObjectCenterParams namedtuple. This object holds
      the hyper-parameters for object center prediction. This is required by
      either object detection or keypoint estimation tasks.
    object_detection_params: An ObjectDetectionParams namedtuple. This object
      holds the hyper-parameters necessary for object detection. Please see
      the class definition for more details.
    keypoint_params_dict: A dictionary that maps from task name to the
      corresponding KeypointEstimationParams namedtuple. This object holds the
      hyper-parameters necessary for multiple keypoint estimations. Please
      see the class definition for more details.
    mask_params: A MaskParams namedtuple. This object
      holds the hyper-parameters for segmentation. Please see the class
      definition for more details.

  Raises:
    AssertionError: if both object_detection_params and keypoint_params_dict
      are missing or empty.
  """
  # At least one of object detection or keypoint estimation must be
  # configured; providing only mask_params does not satisfy this check.
  assert object_detection_params or keypoint_params_dict
  self._is_training = is_training
  # The Objects as Points paper attaches loss functions to multiple
  # (`num_feature_outputs`) feature maps in the backbone. E.g.
  # for the hourglass backbone, `num_feature_outputs` is 2.
  self._feature_extractor = feature_extractor
  self._num_feature_outputs = feature_extractor.num_feature_outputs
  self._stride = self._feature_extractor.out_stride
  self._image_resizer_fn = image_resizer_fn
  self._center_params = object_center_params
  self._od_params = object_detection_params
  self._kp_params_dict = keypoint_params_dict
  self._mask_params = mask_params
  # Construct the prediction head nets. This reads the task parameters
  # stored just above, so it must run after those assignments.
  self._prediction_head_dict = self._construct_prediction_heads(
      num_classes,
      self._num_feature_outputs,
      class_prediction_bias_init=self._center_params.heatmap_bias_init)
  # Initialize the target assigners.
  self._target_assigner_dict = self._initialize_target_assigners(
      stride=self._stride,
      min_box_overlap_iou=self._center_params.min_box_overlap_iou)
  # Will be used in VOD single_frame_meta_arch for tensor reshape.
  self._batched_prediction_tensor_names = []
  # NOTE(review): the base-class constructor runs only after every attribute
  # above has been set; presumably intentional -- confirm before reordering.
  super(CenterNetMetaArch, self).__init__(num_classes)
@property
def batched_prediction_tensor_names(self):
  """Returns the stored batched prediction tensor names.

  Raises:
    RuntimeError: if the stored list of names is still empty.
  """
  tensor_names = self._batched_prediction_tensor_names
  if tensor_names:
    return tensor_names
  raise RuntimeError('Must call predict() method to get batched prediction '
                     'tensor names.')
def _construct_prediction_heads(self, num_classes, num_feature_outputs,
                                class_prediction_bias_init):
  """Constructs the prediction heads based on the specific parameters.

  One list of prediction nets (one net per backbone feature output) is built
  for the object center head and for every head of each configured task.

  Args:
    num_classes: An integer indicating how many classes in total to predict.
    num_feature_outputs: An integer indicating how many feature outputs to use
      for calculating the loss. The Objects as Points paper attaches loss
      functions to multiple (`num_feature_outputs`) feature maps in the
      backbone. E.g. for the hourglass backbone, `num_feature_outputs` is 2.
    class_prediction_bias_init: float, the initial value of bias in the
      convolutional kernel of the object center prediction head. If set to
      None, the bias is initialized with zeros. The keypoint heatmap and
      segmentation heads take their bias from `heatmap_bias_init` of their own
      parameter namedtuples instead.

  Returns:
    A dictionary mapping each head key to a list of keras modules generated by
    calling the make_prediction_net function.
  """

  def _heads_per_feature_output(num_out_channels, **net_kwargs):
    """Builds one independent prediction net per backbone feature output."""
    return [
        make_prediction_net(num_out_channels, **net_kwargs)
        for _ in range(num_feature_outputs)
    ]

  prediction_heads = {}
  prediction_heads[OBJECT_CENTER] = _heads_per_feature_output(
      num_classes, bias_fill=class_prediction_bias_init)
  if self._od_params is not None:
    prediction_heads[BOX_SCALE] = _heads_per_feature_output(NUM_SIZE_CHANNELS)
    prediction_heads[BOX_OFFSET] = _heads_per_feature_output(
        NUM_OFFSET_CHANNELS)
  if self._kp_params_dict is not None:
    for task_name, kp_params in self._kp_params_dict.items():
      num_keypoints = len(kp_params.keypoint_indices)
      prediction_heads[get_keypoint_name(task_name, KEYPOINT_HEATMAP)] = (
          _heads_per_feature_output(
              num_keypoints, bias_fill=kp_params.heatmap_bias_init))
      prediction_heads[get_keypoint_name(task_name, KEYPOINT_REGRESSION)] = (
          _heads_per_feature_output(NUM_OFFSET_CHANNELS * num_keypoints))
      # A per-keypoint offset head predicts a separate offset for each
      # keypoint; otherwise a single offset is shared by all keypoints.
      if kp_params.per_keypoint_offset:
        num_kp_offset_channels = NUM_OFFSET_CHANNELS * num_keypoints
      else:
        num_kp_offset_channels = NUM_OFFSET_CHANNELS
      prediction_heads[get_keypoint_name(task_name, KEYPOINT_OFFSET)] = (
          _heads_per_feature_output(num_kp_offset_channels))
  if self._mask_params is not None:
    # Use the bias configured for the segmentation task rather than the
    # object center one, so MaskParams.heatmap_bias_init takes effect.
    prediction_heads[SEGMENTATION_HEATMAP] = _heads_per_feature_output(
        num_classes, bias_fill=self._mask_params.heatmap_bias_init)
  return prediction_heads
def _initialize_target_assigners(self, stride, min_box_overlap_iou):
  """Creates the target assigner of every configured task.

  Args:
    stride: An integer indicating the stride of the image.
    min_box_overlap_iou: float, the minimum IOU overlap that predicted boxes
      need to have with groundtruth boxes to not be penalized. This is used
      for computing the class specific center heatmaps.

  Returns:
    A dictionary of initialized target assigners. It always holds the object
    center assigner and additionally one assigner for the detection task,
    for each keypoint task and for the segmentation task when the
    corresponding parameters are set.
  """
  center_assigner = cn_assigner.CenterNetCenterHeatmapTargetAssigner(
      stride, min_box_overlap_iou)
  assigners_by_task = {OBJECT_CENTER: center_assigner}

  if self._od_params is not None:
    assigners_by_task[DETECTION_TASK] = (
        cn_assigner.CenterNetBoxTargetAssigner(stride))

  if self._kp_params_dict is not None:
    for kp_task_name, task_params in self._kp_params_dict.items():
      keypoint_assigner = cn_assigner.CenterNetKeypointTargetAssigner(
          stride=stride,
          class_id=task_params.class_id,
          keypoint_indices=task_params.keypoint_indices,
          keypoint_std_dev=task_params.keypoint_std_dev,
          peak_radius=task_params.offset_peak_radius,
          per_keypoint_offset=task_params.per_keypoint_offset)
      assigners_by_task[kp_task_name] = keypoint_assigner

  if self._mask_params is not None:
    assigners_by_task[SEGMENTATION_TASK] = (
        cn_assigner.CenterNetMaskTargetAssigner(stride))

  return assigners_by_task
def _compute_object_center_loss(self, input_height, input_width,
                                object_center_predictions, per_pixel_weights):
  """Computes the object center heatmap loss, normalized per instance.

  Args:
    input_height: An integer scalar tensor representing input image height.
    input_width: An integer scalar tensor representing input image width.
    object_center_predictions: A list of float tensors of shape [batch_size,
      out_height, out_width, num_classes] representing the object center
      feature maps.
    per_pixel_weights: A float tensor of shape [batch_size,
      out_height * out_width, 1] with 1s in locations where the spatial
      coordinates fall within the height and width in true_image_shapes.

  Returns:
    A float scalar tensor representing the object center loss per instance.
  """
  boxes_per_image = self.groundtruth_lists(fields.BoxListFields.boxes)
  classes_per_image = self.groundtruth_lists(fields.BoxListFields.classes)
  weights_per_image = self.groundtruth_lists(fields.BoxListFields.weights)

  if self._center_params.use_only_known_classes:
    labeled_classes = tf.stack(
        self.groundtruth_lists(
            fields.InputDataFields.groundtruth_labeled_classes),
        axis=0)
    labeled_shape = tf.shape(labeled_classes)
    # Insert a singleton middle axis so that the labeled-class tensor
    # broadcasts against the per-pixel weights.
    labeled_classes = tf.reshape(
        labeled_classes, [labeled_shape[0], 1, labeled_shape[-1]])
    per_pixel_weights = per_pixel_weights * labeled_classes

  # Turn the groundtruth boxes into center heatmap targets.
  center_assigner = self._target_assigner_dict[OBJECT_CENTER]
  center_targets = center_assigner.assign_center_targets_from_boxes(
      height=input_height,
      width=input_width,
      gt_boxes_list=boxes_per_image,
      gt_classes_list=classes_per_image,
      gt_weights_list=weights_per_image)
  flat_center_targets = _flatten_spatial_dimensions(center_targets)
  num_boxes = _to_float32(get_num_instances_from_weights(weights_per_image))

  center_loss_fn = self._center_params.classification_loss
  accumulated_loss = 0.0
  # Accumulate the loss of every feature output head.
  for head_output in object_center_predictions:
    flat_head_output = _flatten_spatial_dimensions(head_output)
    accumulated_loss += center_loss_fn(
        flat_head_output, flat_center_targets, weights=per_pixel_weights)

  summed_loss = tf.reduce_sum(accumulated_loss)
  normalizer = float(len(object_center_predictions)) * num_boxes
  return summed_loss / normalizer
def _compute_object_detection_losses(self, input_height, input_width,
                                     prediction_dict, per_pixel_weights):
  """Computes the weighted losses of the object detection task.

  Delegates to `_compute_box_scale_and_offset_loss` and multiplies each
  returned loss by its weight from the object detection parameters.

  Args:
    input_height: An integer scalar tensor representing input image height.
    input_width: An integer scalar tensor representing input image width.
    prediction_dict: A dictionary holding predicted tensors output by the
      "predict" function. See "predict" for a more detailed description.
    per_pixel_weights: A float tensor of shape [batch_size,
      out_height * out_width, 1] with 1s in locations where the spatial
      coordinates fall within the height and width in true_image_shapes. Not
      used by this method.

  Returns:
    A dictionary of scalar float tensors representing the weighted losses for
    the object detection task:
       BOX_SCALE: the weighted scale (height/width) loss.
       BOX_OFFSET: the weighted object offset loss.
  """
  raw_scale_loss, raw_offset_loss = self._compute_box_scale_and_offset_loss(
      scale_predictions=prediction_dict[BOX_SCALE],
      offset_predictions=prediction_dict[BOX_OFFSET],
      input_height=input_height,
      input_width=input_width)
  od_params = self._od_params
  return {
      BOX_SCALE: od_params.scale_loss_weight * raw_scale_loss,
      BOX_OFFSET: od_params.offset_loss_weight * raw_offset_loss,
  }
def _compute_box_scale_and_offset_loss(self, input_height, input_width,
                                       scale_predictions, offset_predictions):
  """Computes the box size and box offset losses of the detection task.

  Args:
    input_height: An integer scalar tensor representing input image height.
    input_width: An integer scalar tensor representing input image width.
    scale_predictions: A list of float tensors of shape [batch_size,
      out_height, out_width, 2] representing the prediction heads of the model
      for object scale (i.e height and width).
    offset_predictions: A list of float tensors of shape [batch_size,
      out_height, out_width, 2] representing the prediction heads of the model
      for object offset.

  Returns:
    A tuple of two losses:
      scale_loss: A float scalar tensor representing the object height/width
        loss normalized by total number of boxes.
      offset_loss: A float scalar tensor representing the object offset loss
        normalized by total number of boxes.
  """
  # TODO(vighneshb) Explore a size invariant version of scale loss.
  boxes_per_image = self.groundtruth_lists(fields.BoxListFields.boxes)
  weights_per_image = self.groundtruth_lists(fields.BoxListFields.weights)
  num_boxes = _to_float32(get_num_instances_from_weights(weights_per_image))
  num_heads = float(len(scale_predictions))

  box_assigner = self._target_assigner_dict[DETECTION_TASK]
  indices, size_targets, offset_targets, target_weights = (
      box_assigner.assign_size_and_offset_targets(
          height=input_height,
          width=input_width,
          gt_boxes_list=boxes_per_image,
          gt_weights_list=weights_per_image))
  target_weights = tf.expand_dims(target_weights, -1)

  box_loss_fn = self._od_params.localization_loss
  summed_scale_loss = 0
  summed_offset_loss = 0
  for scale_head, offset_head in zip(scale_predictions, offset_predictions):
    # Box size (height/width) loss at the target locations.
    gathered_sizes = cn_assigner.get_batch_predictions_from_indices(
        scale_head, indices)
    summed_scale_loss += box_loss_fn(
        gathered_sizes, size_targets, weights=target_weights)
    # Box center offset loss at the same locations.
    gathered_offsets = cn_assigner.get_batch_predictions_from_indices(
        offset_head, indices)
    summed_offset_loss += box_loss_fn(
        gathered_offsets, offset_targets, weights=target_weights)

  scale_loss = tf.reduce_sum(summed_scale_loss) / (num_heads * num_boxes)
  offset_loss = tf.reduce_sum(summed_offset_loss) / (num_heads * num_boxes)
  return scale_loss, offset_loss
def _compute_keypoint_estimation_losses(self, task_name, input_height,
                                        input_width, prediction_dict,
                                        per_pixel_weights):
  """Computes the weighted losses of one keypoint estimation task.

  Args:
    task_name: A string naming the keypoint task; it selects the task
      parameters and the prediction/loss keys.
    input_height: An integer scalar tensor representing input image height.
    input_width: An integer scalar tensor representing input image width.
    prediction_dict: A dictionary holding the predicted tensors; the heatmap,
      offset and regression predictions of the task are read from it.
    per_pixel_weights: Per-pixel weights forwarded to the heatmap loss only.

  Returns:
    A dictionary holding the keypoint heatmap, offset and regression losses,
    each multiplied by its weight from the task parameters and stored under
    the key produced by `get_keypoint_name` for that head.
  """
  task_params = self._kp_params_dict[task_name]
  heatmap_key = get_keypoint_name(task_name, KEYPOINT_HEATMAP)
  offset_key = get_keypoint_name(task_name, KEYPOINT_OFFSET)
  regression_key = get_keypoint_name(task_name, KEYPOINT_REGRESSION)

  raw_heatmap_loss = self._compute_kp_heatmap_loss(
      input_height=input_height,
      input_width=input_width,
      task_name=task_name,
      heatmap_predictions=prediction_dict[heatmap_key],
      classification_loss_fn=task_params.classification_loss,
      per_pixel_weights=per_pixel_weights)
  raw_offset_loss = self._compute_kp_offset_loss(
      input_height=input_height,
      input_width=input_width,
      task_name=task_name,
      offset_predictions=prediction_dict[offset_key],
      localization_loss_fn=task_params.localization_loss)
  raw_regression_loss = self._compute_kp_regression_loss(
      input_height=input_height,
      input_width=input_width,
      task_name=task_name,
      regression_predictions=prediction_dict[regression_key],
      localization_loss_fn=task_params.localization_loss)

  return {
      heatmap_key:
          task_params.keypoint_heatmap_loss_weight * raw_heatmap_loss,
      offset_key:
          task_params.keypoint_offset_loss_weight * raw_offset_loss,
      regression_key:
          task_params.keypoint_regression_loss_weight * raw_regression_loss,
  }
def _compute_kp_heatmap_loss(self, input_height, input_width, task_name,
                             heatmap_predictions, classification_loss_fn,
                             per_pixel_weights):
  """Computes the keypoint heatmap loss of a keypoint estimation task.

  Args:
    input_height: An integer scalar tensor representing input image height.
    input_width: An integer scalar tensor representing input image width.
    task_name: A string representing the name of the keypoint task.
    heatmap_predictions: A list of float tensors of shape [batch_size,
      out_height, out_width, num_keypoints] representing the prediction heads
      of the model for keypoint heatmap.
    classification_loss_fn: An object_detection.core.losses.Loss object to
      compute the loss for the class predictions in CenterNet.
    per_pixel_weights: A float tensor of shape [batch_size,
      out_height * out_width, 1] with 1s in locations where the spatial
      coordinates fall within the height and width in true_image_shapes.

  Returns:
    loss: A float scalar tensor representing the object keypoint heatmap loss
      normalized by number of instances.
  """
  keypoints_per_image = self.groundtruth_lists(
      fields.BoxListFields.keypoints)
  classes_per_image = self.groundtruth_lists(fields.BoxListFields.classes)
  weights_per_image = self.groundtruth_lists(fields.BoxListFields.weights)
  boxes_per_image = self.groundtruth_lists(fields.BoxListFields.boxes)

  keypoint_assigner = self._target_assigner_dict[task_name]
  heatmap_targets, instances_per_kp_type, valid_mask = (
      keypoint_assigner.assign_keypoint_heatmap_targets(
          height=input_height,
          width=input_width,
          gt_keypoints_list=keypoints_per_image,
          gt_weights_list=weights_per_image,
          gt_classes_list=classes_per_image,
          gt_boxes_list=boxes_per_image))
  flat_valid_mask = _flatten_spatial_dimensions(
      tf.expand_dims(valid_mask, axis=-1))
  flat_heatmap_targets = _flatten_spatial_dimensions(heatmap_targets)

  # The total number of keypoints (summed over keypoint types) normalizes the
  # loss. It is clamped to at least 1 so that a batch without any keypoint
  # does not produce a weird loss value.
  total_keypoints = tf.cast(
      tf.reduce_sum(instances_per_kp_type), dtype=tf.float32)
  num_instances = tf.maximum(total_keypoints, 1.0)

  accumulated_loss = 0.0
  # Accumulate the loss of every feature output head.
  for head_output in heatmap_predictions:
    flat_head_output = _flatten_spatial_dimensions(head_output)
    raw_loss = classification_loss_fn(
        flat_head_output,
        flat_heatmap_targets,
        weights=tf.ones_like(per_pixel_weights))
    # The loss function gets unit weights; the real weights are applied here
    # to have full control over them.
    accumulated_loss += raw_loss * per_pixel_weights * flat_valid_mask

  summed_loss = tf.reduce_sum(accumulated_loss)
  normalizer = float(len(heatmap_predictions)) * num_instances
  return summed_loss / normalizer
def _compute_kp_offset_loss(self, input_height, input_width, task_name,
                            offset_predictions, localization_loss_fn):
  """Computes the keypoint offset loss of a keypoint estimation task.

  Args:
    input_height: An integer scalar tensor representing input image height.
    input_width: An integer scalar tensor representing input image width.
    task_name: A string representing the name of the keypoint task.
    offset_predictions: A list of float tensors of shape [batch_size,
      out_height, out_width, 2] representing the prediction heads of the model
      for keypoint offset. A head with more than 2 channels is reshaped to
      [batch_size, out_height, out_width, -1, 2] before being indexed.
    localization_loss_fn: An object_detection.core.losses.Loss object to
      compute the loss for the keypoint offset predictions in CenterNet.

  Returns:
    loss: A float scalar tensor representing the keypoint offset loss
      normalized by number of total keypoints.
  """
  keypoints_per_image = self.groundtruth_lists(
      fields.BoxListFields.keypoints)
  classes_per_image = self.groundtruth_lists(fields.BoxListFields.classes)
  weights_per_image = self.groundtruth_lists(fields.BoxListFields.weights)

  keypoint_assigner = self._target_assigner_dict[task_name]
  indices, offset_targets, target_weights = (
      keypoint_assigner.assign_keypoints_offset_targets(
          height=input_height,
          width=input_width,
          gt_keypoints_list=keypoints_per_image,
          gt_weights_list=weights_per_image,
          gt_classes_list=classes_per_image))

  accumulated_loss = 0.0
  for head_output in offset_predictions:
    batch_size, out_height, out_width, channels = _get_shape(head_output, 4)
    if channels > 2:
      head_output = tf.reshape(
          head_output, shape=[batch_size, out_height, out_width, -1, 2])
    gathered_offsets = cn_assigner.get_batch_predictions_from_indices(
        head_output, indices)
    # The dimensions passed are not as per the doc string but the loss
    # still computes the correct value.
    raw_loss = localization_loss_fn(
        gathered_offsets,
        offset_targets,
        weights=tf.expand_dims(tf.ones_like(target_weights), -1))
    # The loss function gets unit weights; the real weights are applied here
    # to have full control over them.
    accumulated_loss += target_weights * tf.reduce_sum(raw_loss, axis=1)

  summed_loss = tf.reduce_sum(accumulated_loss)
  normalizer = (
      float(len(offset_predictions)) *
      tf.maximum(tf.reduce_sum(target_weights), 1.0))
  return summed_loss / normalizer
def _compute_kp_regression_loss(self, input_height, input_width, task_name,
                                regression_predictions, localization_loss_fn):
  """Computes the keypoint regression loss of a keypoint estimation task.

  Args:
    input_height: An integer scalar tensor representing input image height.
    input_width: An integer scalar tensor representing input image width.
    task_name: A string representing the name of the keypoint task.
    regression_predictions: A list of float tensors of shape [batch_size,
      out_height, out_width, 2 * num_keypoints] representing the prediction
      heads of the model for keypoint regression offset.
    localization_loss_fn: An object_detection.core.losses.Loss object to
      compute the loss for the keypoint regression offset predictions in
      CenterNet.

  Returns:
    loss: A float scalar tensor representing the keypoint regression offset
      loss normalized by number of total keypoints.
  """
  boxes_per_image = self.groundtruth_lists(fields.BoxListFields.boxes)
  keypoints_per_image = self.groundtruth_lists(
      fields.BoxListFields.keypoints)
  classes_per_image = self.groundtruth_lists(fields.BoxListFields.classes)
  weights_per_image = self.groundtruth_lists(fields.BoxListFields.weights)

  # Targets for the keypoint regression offsets.
  keypoint_assigner = self._target_assigner_dict[task_name]
  indices, regression_targets, target_weights = (
      keypoint_assigner.assign_joint_regression_targets(
          height=input_height,
          width=input_width,
          gt_keypoints_list=keypoints_per_image,
          gt_classes_list=classes_per_image,
          gt_weights_list=weights_per_image,
          gt_boxes_list=boxes_per_image))

  accumulated_loss = 0.0
  for head_output in regression_predictions:
    batch_size, out_height, out_width, _ = _get_shape(head_output, 4)
    # Split the channel axis into pairs of values before indexing.
    paired_output = tf.reshape(
        head_output, shape=[batch_size, out_height, out_width, -1, 2])
    gathered_regression = cn_assigner.get_batch_predictions_from_indices(
        paired_output, indices)
    raw_loss = localization_loss_fn(
        gathered_regression,
        regression_targets,
        weights=tf.expand_dims(tf.ones_like(target_weights), -1))
    # The loss function gets unit weights; the real weights are applied here
    # to have full control over them.
    accumulated_loss += target_weights * tf.reduce_sum(raw_loss, axis=1)

  summed_loss = tf.reduce_sum(accumulated_loss)
  normalizer = (
      float(len(regression_predictions)) *
      tf.maximum(tf.reduce_sum(target_weights), 1.0))
  return summed_loss / normalizer
def _compute_segmentation_losses(self, prediction_dict, per_pixel_weights):
  """Computes the losses of the segmentation task.

  Args:
    prediction_dict: The dictionary returned from the predict() method.
    per_pixel_weights: A float tensor of shape [batch_size,
      out_height * out_width, 1] with 1s in locations where the spatial
      coordinates fall within the height and width in true_image_shapes.

  Returns:
    A dictionary holding the mask loss under the SEGMENTATION_HEATMAP key.
  """
  heatmap_predictions = prediction_dict[SEGMENTATION_HEATMAP]
  return {
      SEGMENTATION_HEATMAP:
          self._compute_mask_loss(heatmap_predictions, per_pixel_weights),
  }
def _compute_mask_loss(self, segmentation_predictions,
                       per_pixel_weights):
  """Computes the segmentation mask loss averaged over all output heads.

  Args:
    segmentation_predictions: A list of float32 tensors of shape [batch_size,
      out_height, out_width, num_classes].
    per_pixel_weights: A float tensor of shape [batch_size,
      out_height * out_width, 1] with 1s in locations where the spatial
      coordinates fall within the height and width in true_image_shapes.

  Returns:
    A float scalar tensor representing the mask loss: the summed per-head
    classification loss divided by (number of heads * sum of
    per_pixel_weights).
  """
  masks_groundtruth = self.groundtruth_lists(fields.BoxListFields.masks)
  classes_groundtruth = self.groundtruth_lists(fields.BoxListFields.classes)

  # Turn the groundtruth masks and classes into heatmap targets and flatten
  # them the same way the predictions are flattened below.
  segmentation_assigner = self._target_assigner_dict[SEGMENTATION_TASK]
  flat_targets = _flatten_spatial_dimensions(
      segmentation_assigner.assign_segmentation_targets(
          gt_masks_list=masks_groundtruth,
          gt_classes_list=classes_groundtruth))

  classification_loss_fn = self._mask_params.classification_loss
  weight_total = tf.reduce_sum(per_pixel_weights)

  # Accumulate the weighted loss over every feature output head.
  accumulated_loss = 0.0
  for head_prediction in segmentation_predictions:
    accumulated_loss += classification_loss_fn(
        _flatten_spatial_dimensions(head_prediction), flat_targets,
        weights=per_pixel_weights)

  # TODO(ronnyvotel): Consider other ways to normalize loss.
  num_heads = float(len(segmentation_predictions))
  return tf.reduce_sum(accumulated_loss) / (num_heads * weight_total)
def preprocess(self, inputs):
  """Resizes the inputs and applies the feature extractor's preprocessing.

  Args:
    inputs: The input images, handed to
      shape_utils.resize_images_and_return_shapes together with this model's
      image resizer function.

  Returns:
    A tuple (preprocessed_inputs, true_image_shapes): the resized inputs after
    the feature extractor's preprocess(), and the true image shapes reported
    by the resize step.
  """
  resized_inputs, true_image_shapes = (
      shape_utils.resize_images_and_return_shapes(
          inputs, self._image_resizer_fn))
  preprocessed_inputs = self._feature_extractor.preprocess(resized_inputs)
  return preprocessed_inputs, true_image_shapes
def predict(self, preprocessed_inputs, _):
  """Predicts CenterNet prediction tensors given an input batch.

  Feature extractors are free to produce predictions from multiple feature
  maps and therefore we return a dictionary mapping strings to lists.
  E.g. the hourglass backbone produces two feature maps.

  As a side effect, the names of all entries of the returned dictionary are
  recorded in `self._batched_prediction_tensor_names`.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float32 tensor
      representing a batch of images.
    _: Unused by this implementation.

  Returns:
    prediction_dict: a dictionary holding predicted tensors with
      'preprocessed_inputs' - The input image after being resized and
        preprocessed by the feature extractor.
      'object_center' - A list of size num_feature_outputs containing
        float tensors of size [batch_size, output_height, output_width,
        num_classes] representing the predicted object center heatmap logits.
      'box/scale' - [optional] A list of size num_feature_outputs holding
        float tensors of size [batch_size, output_height, output_width, 2]
        representing the predicted box height and width at each output
        location. This field exists only when object detection task is
        specified.
      'box/offset' - [optional] A list of size num_feature_outputs holding
        float tensors of size [batch_size, output_height, output_width, 2]
        representing the predicted y and x offsets at each output location.
      '$TASK_NAME/keypoint_heatmap' - [optional] A list of size
        num_feature_outputs holding float tensors of size [batch_size,
        output_height, output_width, num_keypoints] representing the predicted
        keypoint heatmap logits.
      '$TASK_NAME/keypoint_offset' - [optional] A list of size
        num_feature_outputs holding float tensors of size [batch_size,
        output_height, output_width, 2] representing the predicted keypoint
        offsets at each output location.
      '$TASK_NAME/keypoint_regression' - [optional] A list of size
        num_feature_outputs holding float tensors of size [batch_size,
        output_height, output_width, 2 * num_keypoints] representing the
        predicted keypoint regression at each output location.
      'segmentation/heatmap' - [optional] A list of size num_feature_outputs
        holding float tensors of size [batch_size, output_height,
        output_width, num_classes] representing the mask logits.

    Note the $TASK_NAME is provided by the KeypointEstimation namedtuple
    used to differentiate between different keypoint tasks.
  """
  features_list = self._feature_extractor(preprocessed_inputs)

  # Each head name maps to one head per feature map; pair them up in order.
  predictions = {
      head_name: [head(feature) for feature, head in zip(features_list, heads)]
      for head_name, heads in self._prediction_head_dict.items()
  }
  predictions['preprocessed_inputs'] = preprocessed_inputs

  # Store a list snapshot rather than `predictions.keys()`: a dict view stays
  # tied to the returned dictionary, so any key a caller adds or removes later
  # would silently change the recorded tensor names.
  self._batched_prediction_tensor_names = list(predictions)
  return predictions
def loss(self, prediction_dict, true_image_shapes, scope=None):
  """Computes scalar loss tensors with respect to provided groundtruth.

  The object center loss is always computed. Object detection, keypoint
  estimation and segmentation losses are added only when the corresponding
  parameters are configured, and each is scaled by its task loss weight.

  Args:
    prediction_dict: a dictionary holding predicted tensors returned by
      "predict" function.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
      the form [height, width, channels] indicating the shapes of true images
      in the resized images, as resized images can be padded with zeros.
    scope: Optional scope name. Not used in this method body.

  Returns:
    A dictionary mapping the keys ['Loss/object_center', 'Loss/box/scale',
      'Loss/box/offset', 'Loss/$TASK_NAME/keypoint/heatmap',
      'Loss/$TASK_NAME/keypoint/offset',
      'Loss/$TASK_NAME/keypoint/regression', 'Loss/segmentation/heatmap'] to
      scalar tensors corresponding to the losses for different tasks. Note the
      $TASK_NAME is provided by the KeypointEstimation namedtuple used to
      differentiate between different keypoint tasks.
  """
  _, input_height, input_width, _ = _get_shape(
      prediction_dict['preprocessed_inputs'], 4)
  output_height = input_height // self._stride
  output_width = input_width // self._stride

  # TODO(vighneshb) Explore whether using floor here is safe.
  output_true_image_shapes = tf.ceil(
      tf.to_float(true_image_shapes) / self._stride)
  # Weights flagging output locations that lie inside the true image, with a
  # trailing dimension appended.
  valid_anchor_weights = tf.expand_dims(
      get_valid_anchor_weights_in_flattened_image(
          output_true_image_shapes, output_height, output_width), 2)

  center_loss = self._compute_object_center_loss(
      object_center_predictions=prediction_dict[OBJECT_CENTER],
      input_height=input_height,
      input_width=input_width,
      per_pixel_weights=valid_anchor_weights)
  losses = {
      OBJECT_CENTER:
          self._center_params.object_center_loss_weight * center_loss
  }

  def _merge_weighted(task_losses, task_weight):
    """Scales every entry of task_losses in place and merges it into losses."""
    for loss_name in task_losses:
      task_losses[loss_name] = task_losses[loss_name] * task_weight
    losses.update(task_losses)

  if self._od_params is not None:
    _merge_weighted(
        self._compute_object_detection_losses(
            input_height=input_height,
            input_width=input_width,
            prediction_dict=prediction_dict,
            per_pixel_weights=valid_anchor_weights),
        self._od_params.task_loss_weight)

  if self._kp_params_dict is not None:
    for task_name, params in self._kp_params_dict.items():
      _merge_weighted(
          self._compute_keypoint_estimation_losses(
              task_name=task_name,
              input_height=input_height,
              input_width=input_width,
              prediction_dict=prediction_dict,
              per_pixel_weights=valid_anchor_weights),
          params.task_loss_weight)

  if self._mask_params is not None:
    _merge_weighted(
        self._compute_segmentation_losses(
            prediction_dict=prediction_dict,
            per_pixel_weights=valid_anchor_weights),
        self._mask_params.task_loss_weight)

  # Prepend the LOSS_KEY_PREFIX to the keys in the dictionary such that the
  # losses will be grouped together in Tensorboard.
  return {
      '%s/%s' % (LOSS_KEY_PREFIX, key): val for key, val in losses.items()
  }
def postprocess(self, prediction_dict, true_image_shapes, **params):
  """Produces boxes given a prediction dict returned by predict().

  Although predict returns a list of tensors, only the last tensor in
  each list is used for making box predictions.

  Args:
    prediction_dict: a dictionary holding predicted tensors from "predict"
      function.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
      the form [height, width, channels] indicating the shapes of true images
      in the resized images, as resized images can be padded with zeros.
    **params: Currently ignored.

  Returns:
    detections: a dictionary containing the following fields
      detection_boxes - A tensor of shape [batch, max_detections, 4]
        holding the predicted boxes.
      detection_scores: A tensor of shape [batch, max_detections] holding
        the predicted score for each box.
      detection_classes: An integer tensor of shape [batch, max_detections]
        containing the detected class for each box.
      num_detections: An integer tensor of shape [batch] containing the
        number of detected boxes for each sample in the batch.
      detection_keypoints: (Optional) A float tensor of shape [batch,
        max_detections, num_keypoints, 2] with normalized keypoints. Any
        invalid keypoints have their coordinates and scores set to 0.0.
      detection_keypoint_scores: (Optional) A float tensor of shape [batch,
        max_detection, num_keypoints] with scores for each keypoint.
      detection_masks: (Optional) An int tensor of shape [batch,
        max_detections, mask_height, mask_width] with binarized masks for each
        detection.
  """
  center_heatmap = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])

  # Find the y, x and channel indices of the top-scoring locations in the
  # class center predictions.
  peak_scores, peak_y, peak_x, peak_channels = top_k_feature_map_locations(
      center_heatmap, max_pool_kernel_size=3,
      k=self._center_params.max_box_predictions)

  strided_boxes, det_classes, det_scores, det_count = (
      prediction_tensors_to_boxes(
          peak_scores, peak_y, peak_x, peak_channels,
          prediction_dict[BOX_SCALE][-1], prediction_dict[BOX_OFFSET][-1]))
  normalized_boxes = convert_strided_predictions_to_normalized_boxes(
      strided_boxes, self._stride, true_image_shapes)

  result_fields = fields.DetectionResultFields
  detections = {
      result_fields.detection_boxes: normalized_boxes,
      result_fields.detection_scores: det_scores,
      result_fields.detection_classes: det_classes,
      result_fields.num_detections: det_count,
  }

  if self._kp_params_dict:
    # Keypoints come back in the strided output frame and are normalized
    # afterwards.
    strided_keypoints, strided_keypoint_scores = self._postprocess_keypoints(
        prediction_dict, det_classes, peak_y, peak_x, strided_boxes,
        det_count)
    keypoints, keypoint_scores = (
        convert_strided_predictions_to_normalized_keypoints(
            strided_keypoints, strided_keypoint_scores, self._stride,
            true_image_shapes, clip_out_of_frame_keypoints=True))
    detections[result_fields.detection_keypoints] = keypoints
    detections[result_fields.detection_keypoint_scores] = keypoint_scores

  if self._mask_params:
    mask_probabilities = tf.nn.sigmoid(
        prediction_dict[SEGMENTATION_HEATMAP][-1])
    detections[result_fields.detection_masks] = (
        convert_strided_predictions_to_instance_masks(
            normalized_boxes, det_classes, mask_probabilities, self._stride,
            self._mask_params.mask_height, self._mask_params.mask_width,
            true_image_shapes, self._mask_params.score_threshold))

  return detections
def _postprocess_keypoints(self, prediction_dict, classes, y_indices,
                           x_indices, boxes, num_detections):
  """Performs postprocessing on keypoint predictions.

  For every example in the batch and every configured keypoint task, the
  detections of the task's class are selected, their keypoints are
  postprocessed, and the results are padded to the full keypoint dimension.
  The per-task results are then concatenated and padded to the full instance
  dimension, and finally concatenated across the batch.

  Args:
    prediction_dict: a dictionary holding predicted tensors, returned from the
      predict() method. This dictionary should contain keypoint prediction
      feature maps for each keypoint task.
    classes: A [batch_size, max_detections] int tensor with class indices for
      all detected objects.
    y_indices: A [batch_size, max_detections] int tensor with y indices for
      all object centers.
    x_indices: A [batch_size, max_detections] int tensor with x indices for
      all object centers.
    boxes: A [batch_size, max_detections, 4] float32 tensor with bounding
      boxes in (un-normalized) output space.
    num_detections: A [batch_size] int tensor with the number of valid
      detections for each image.

  Returns:
    A tuple of
    keypoints: a [batch_size, max_detection, num_total_keypoints, 2] float32
      tensor with keypoints in the output (strided) coordinate frame.
    keypoint_scores: a [batch_size, max_detections, num_total_keypoints]
      float32 tensor with keypoint scores.
  """
  # Total keypoint count summed over all keypoint tasks.
  total_num_keypoints = sum(len(kp_dict.keypoint_indices) for kp_dict
                            in self._kp_params_dict.values())
  # NOTE(review): batch_size feeds a Python range() below, so it presumably
  # has to be statically known -- confirm against _get_shape.
  batch_size, max_detections, _ = _get_shape(boxes, 3)
  kpt_coords_for_example_list = []
  kpt_scores_for_example_list = []
  for ex_ind in range(batch_size):
    # Per-task results for this single example.
    kpt_coords_for_class_list = []
    kpt_scores_for_class_list = []
    instance_inds_for_class_list = []
    for task_name, kp_params in self._kp_params_dict.items():
      # Only the last tensor of each prediction list is used.
      keypoint_heatmap = prediction_dict[
          get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]
      keypoint_offsets = prediction_dict[
          get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
      keypoint_regression = prediction_dict[
          get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
      # Indices of the detections in this example that belong to the task's
      # class.
      instance_inds = self._get_instance_indices(
          classes, num_detections, ex_ind, kp_params.class_id)
      def true_fn(
          keypoint_heatmap, keypoint_offsets, keypoint_regression,
          classes, y_indices, x_indices, boxes, instance_inds,
          ex_ind, kp_params):
        """Computes padded keypoints when instance_inds is not empty."""
        # Postprocess keypoints and scores for class and single image. Shapes
        # are [1, num_instances_i, num_keypoints_i, 2] and
        # [1, num_instances_i, num_keypoints_i], respectively. Note that
        # num_instances_i and num_keypoints_i refers to the number of
        # instances and keypoints for class i, respectively.
        kpt_coords_for_class, kpt_scores_for_class = (
            self._postprocess_keypoints_for_class_and_image(
                keypoint_heatmap, keypoint_offsets, keypoint_regression,
                classes, y_indices, x_indices, boxes, instance_inds,
                ex_ind, kp_params))
        # Expand keypoint dimension (with padding) so that coordinates and
        # scores have shape [1, num_instances_i, num_total_keypoints, 2] and
        # [1, num_instances_i, num_total_keypoints], respectively.
        kpts_coords_for_class_padded, kpt_scores_for_class_padded = (
            _pad_to_full_keypoint_dim(
                kpt_coords_for_class, kpt_scores_for_class,
                kp_params.keypoint_indices, total_num_keypoints))
        return kpts_coords_for_class_padded, kpt_scores_for_class_padded
      def false_fn():
        """Returns zero-instance placeholders when instance_inds is empty."""
        return (tf.zeros([1, 0, total_num_keypoints, 2], dtype=tf.float32),
                tf.zeros([1, 0, total_num_keypoints], dtype=tf.float32))
      # Bind the current loop values as positional arguments so that tf.cond
      # receives a zero-argument callable.
      true_fn = functools.partial(
          true_fn, keypoint_heatmap, keypoint_offsets, keypoint_regression,
          classes, y_indices, x_indices, boxes, instance_inds, ex_ind,
          kp_params)
      results = tf.cond(tf.size(instance_inds) > 0, true_fn, false_fn)
      kpt_coords_for_class_list.append(results[0])
      kpt_scores_for_class_list.append(results[1])
      instance_inds_for_class_list.append(instance_inds)
    # Concatenate all keypoints across all classes (single example).
    kpt_coords_for_example = tf.concat(kpt_coords_for_class_list, axis=1)
    kpt_scores_for_example = tf.concat(kpt_scores_for_class_list, axis=1)
    instance_inds_for_example = tf.concat(instance_inds_for_class_list,
                                          axis=0)
    # NOTE(review): this is a Python-level `if` on a tensor value, unlike the
    # tf.cond used above. It presumably relies on eager execution or
    # autograph conversion -- confirm it is never traced as a plain TF1 graph.
    if tf.size(instance_inds_for_example) > 0:
      # Scatter into tensor where instances align with original detection
      # instances. New shape of keypoint coordinates and scores are
      # [1, max_detections, num_total_keypoints, 2] and
      # [1, max_detections, num_total_keypoints], respectively.
      # NOTE(review): this branch sizes the instance dimension with
      # self._center_params.max_box_predictions while the else branch uses
      # max_detections read from `boxes` -- presumably equal; verify.
      kpt_coords_for_example_all_det, kpt_scores_for_example_all_det = (
          _pad_to_full_instance_dim(
              kpt_coords_for_example, kpt_scores_for_example,
              instance_inds_for_example,
              self._center_params.max_box_predictions))
    else:
      # No detection matched any keypoint class: emit all-zero keypoints and
      # scores for this example.
      kpt_coords_for_example_all_det = tf.zeros(
          [1, max_detections, total_num_keypoints, 2], dtype=tf.float32)
      kpt_scores_for_example_all_det = tf.zeros(
          [1, max_detections, total_num_keypoints], dtype=tf.float32)
    kpt_coords_for_example_list.append(kpt_coords_for_example_all_det)
    kpt_scores_for_example_list.append(kpt_scores_for_example_all_det)
  # Concatenate all keypoints and scores from all examples in the batch.
  # Shapes are [batch_size, max_detections, num_total_keypoints, 2] and
  # [batch_size, max_detections, num_total_keypoints], respectively.
  keypoints = tf.concat(kpt_coords_for_example_list, axis=0)
  keypoint_scores = tf.concat(kpt_scores_for_example_list, axis=0)
  return keypoints, keypoint_scores
def _get_instance_indices(self, classes, num_detections, batch_index,
                          class_id):
  """Gets the instance indices that match the target class ID.

  A detection is selected when its position is below the number of valid
  detections for the example and its class equals `class_id`.

  Args:
    classes: A [batch_size, max_detections] int tensor with class indices for
      all detected objects.
    num_detections: A [batch_size] int tensor with the number of valid
      detections for each image.
    batch_index: An integer specifying the index for an example in the batch.
    class_id: Class id

  Returns:
    instance_inds: A [num_instances] int tensor where each element indicates
      the instance location within the `classes` tensor. This is useful to
      associate the refined keypoints with the original detections (i.e.
      boxes)
  """
  # Keep a leading dimension of 1 so the shape helper still sees two
  # dimensions.
  classes = classes[batch_index:batch_index+1, ...]
  _, max_detections = shape_utils.combined_static_and_dynamic_shape(
      classes)
  # Get the detection indices corresponding to the target class.
  is_valid_detection = tf.range(max_detections) < num_detections[batch_index]
  # Use the explicit op instead of `==`: when tensor equality overloading is
  # disabled, `==` on a tensor is an identity check that yields a Python bool,
  # which would silently select no instances.
  is_target_class = tf.math.equal(classes[0], class_id)
  valid_detections_with_kpt_class = tf.math.logical_and(
      is_valid_detection, is_target_class)
  instance_inds = tf.where(valid_detections_with_kpt_class)[:, 0]
  return instance_inds
def _postprocess_keypoints_for_class_and_image(
    self, keypoint_heatmap, keypoint_offsets, keypoint_regression, classes,
    y_indices, x_indices, boxes, indices_with_kpt_class, batch_index,
    kp_params):
  """Postprocess keypoints for a single image and class.

  This function performs the following postprocessing operations on a single
  image and single keypoint class:
  - Converts keypoints scores to range [0, 1] with sigmoid.
  - Restricts all batched inputs to the example at `batch_index`.
  - Gathers the regressed keypoints at the detection (i.e. box) centers.
  - Gathers keypoint candidates from the keypoint heatmaps.
  - Snaps regressed keypoints to nearby keypoint candidates.

  Args:
    keypoint_heatmap: A [batch_size, height, width, num_keypoints] float32
      tensor with keypoint heatmaps.
    keypoint_offsets: A [batch_size, height, width, 2] float32 tensor with
      local offsets to keypoint centers.
    keypoint_regression: A [batch_size, height, width, 2 * num_keypoints]
      float32 tensor with regressed offsets to all keypoints.
    classes: A [batch_size, max_detections] int tensor with class indices for
      all detected objects. Not read by this method.
    y_indices: A [batch_size, max_detections] int tensor with y indices for
      all object centers.
    x_indices: A [batch_size, max_detections] int tensor with x indices for
      all object centers.
    boxes: A [batch_size, max_detections, 4] float32 tensor with detected
      boxes in the output (strided) frame.
    indices_with_kpt_class: A [num_instances] int tensor where each element
      indicates the instance location within the `classes` tensor. This is
      useful to associate the refined keypoints with the original detections
      (i.e. boxes)
    batch_index: An integer specifying the index for an example in the batch.
    kp_params: A `KeypointEstimationParams` object with parameters for a
      single keypoint class.

  Returns:
    A tuple of
    refined_keypoints: A [1, num_instances, num_keypoints, 2] float32 tensor
      with refined keypoints for a single class in a single image, expressed
      in the output (strided) coordinate frame. Note that `num_instances` is a
      dynamic dimension, and corresponds to the number of valid detections
      for the specific class.
    refined_scores: A [1, num_instances, num_keypoints] float32 tensor with
      keypoint scores.
  """
  num_keypoints = len(kp_params.keypoint_indices)

  # Restrict every batched input to the requested example while keeping a
  # leading batch dimension of 1.
  keypoint_heatmap = tf.nn.sigmoid(
      keypoint_heatmap[batch_index:batch_index+1, ...])
  keypoint_offsets = keypoint_offsets[batch_index:batch_index+1, ...]
  keypoint_regression = keypoint_regression[batch_index:batch_index+1, ...]
  y_indices = y_indices[batch_index:batch_index+1, ...]
  x_indices = x_indices[batch_index:batch_index+1, ...]
  # The boxes must be sliced like the other inputs; otherwise the gather below
  # keeps the boxes of every example in the batch and the refinement step
  # receives boxes whose batch dimension does not match the keypoints.
  boxes = boxes[batch_index:batch_index+1, ...]

  # Gather the feature map locations corresponding to the object class.
  y_indices_for_kpt_class = tf.gather(y_indices, indices_with_kpt_class,
                                      axis=1)
  x_indices_for_kpt_class = tf.gather(x_indices, indices_with_kpt_class,
                                      axis=1)
  boxes_for_kpt_class = tf.gather(boxes, indices_with_kpt_class, axis=1)

  # Gather the regressed keypoints. Final tensor has shape
  # [1, num_instances, num_keypoints, 2].
  regressed_keypoints_for_objects = regressed_keypoints_at_object_centers(
      keypoint_regression, y_indices_for_kpt_class, x_indices_for_kpt_class)
  regressed_keypoints_for_objects = tf.reshape(
      regressed_keypoints_for_objects, [1, -1, num_keypoints, 2])

  # Get the candidate keypoints and scores.
  # The shape of keypoint_candidates and keypoint_scores is:
  # [1, num_candidates_per_keypoint, num_keypoints, 2] and
  # [1, num_candidates_per_keypoint, num_keypoints], respectively.
  keypoint_candidates, keypoint_scores, num_keypoint_candidates = (
      prediction_tensors_to_keypoint_candidates(
          keypoint_heatmap, keypoint_offsets,
          keypoint_score_threshold=(
              kp_params.keypoint_candidate_score_threshold),
          max_pool_kernel_size=kp_params.peak_max_pool_kernel_size,
          max_candidates=kp_params.num_candidates_per_keypoint))

  # Get the refined keypoints and scores, of shape
  # [1, num_instances, num_keypoints, 2] and
  # [1, num_instances, num_keypoints], respectively.
  refined_keypoints, refined_scores = refine_keypoints(
      regressed_keypoints_for_objects, keypoint_candidates, keypoint_scores,
      num_keypoint_candidates, bboxes=boxes_for_kpt_class,
      unmatched_keypoint_score=kp_params.unmatched_keypoint_score,
      box_scale=kp_params.box_scale,
      candidate_search_scale=kp_params.candidate_search_scale,
      candidate_ranking_mode=kp_params.candidate_ranking_mode)
  return refined_keypoints, refined_scores
def regularization_losses(self):
  """Returns the regularization losses, which is always an empty list."""
  return list()
def restore_map(self, fine_tune_checkpoint_type='classification',
                load_all_detection_checkpoint_vars=False):
  """Returns a map of objects to restore from a fine-tuning checkpoint.

  Args:
    fine_tune_checkpoint_type: Either 'classification', which selects the
      feature extractor's base model, or 'detection', which selects the
      feature extractor's model.
    load_all_detection_checkpoint_vars: Not used in this method body.

  Returns:
    A dictionary with the single key 'feature_extractor' mapped to the
    selected model object.

  Raises:
    ValueError: If fine_tune_checkpoint_type is neither 'classification' nor
      'detection'.
  """
  if fine_tune_checkpoint_type == 'classification':
    restored_model = self._feature_extractor.get_base_model()
  elif fine_tune_checkpoint_type == 'detection':
    restored_model = self._feature_extractor.get_model()
  else:
    raise ValueError('Unknown fine tune checkpoint type - {}'.format(
        fine_tune_checkpoint_type))
  return {'feature_extractor': restored_model}
def updates(self):
  """Always raises, because updates() is not supported for this model.

  Raises:
    RuntimeError: Unconditionally.
  """
  message = ('This model is intended to be used with model_lib_v2 '
             'which does not support updates()')
  raise RuntimeError(message)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment