Unverified Commit ca552843 authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-segmentation

parents 7e2f7a35 6b90e134
...@@ -16,9 +16,8 @@ ...@@ -16,9 +16,8 @@
import orbit import orbit
import tensorflow as tf import tensorflow as tf
from official.modeling import grad_utils
from official.modeling import performance from official.modeling import performance
from official.staging.training import grad_utils
from official.utils.flags import core as flags_core from official.utils.flags import core as flags_core
from official.vision.image_classification.resnet import common from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import imagenet_preprocessing from official.vision.image_classification.resnet import imagenet_preprocessing
......
...@@ -21,9 +21,11 @@ import tensorflow as tf ...@@ -21,9 +21,11 @@ import tensorflow as tf
class SpatialPyramidPooling(tf.keras.layers.Layer): class SpatialPyramidPooling(tf.keras.layers.Layer):
"""Implements the Atrous Spatial Pyramid Pooling. """Implements the Atrous Spatial Pyramid Pooling.
Reference: References:
[Rethinking Atrous Convolution for Semantic Image Segmentation]( [Rethinking Atrous Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1706.05587.pdf) https://arxiv.org/pdf/1706.05587.pdf)
[Encoder-Decoder with Atrous Separable Convolution for Semantic Image
Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
""" """
def __init__( def __init__(
...@@ -39,6 +41,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -39,6 +41,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
kernel_initializer='glorot_uniform', kernel_initializer='glorot_uniform',
kernel_regularizer=None, kernel_regularizer=None,
interpolation='bilinear', interpolation='bilinear',
use_depthwise_convolution=False,
**kwargs): **kwargs):
"""Initializes `SpatialPyramidPooling`. """Initializes `SpatialPyramidPooling`.
...@@ -60,6 +63,10 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -60,6 +63,10 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
kernel_regularizer: Kernel regularizer for conv layers. Defaults to None. kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
interpolation: The interpolation method for upsampling. Defaults to interpolation: The interpolation method for upsampling. Defaults to
`bilinear`. `bilinear`.
use_depthwise_convolution: Allows spatial pooling to use separable
depthwise convolutions. [Encoder-Decoder with Atrous Separable
Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1802.02611.pdf)
**kwargs: Other keyword arguments for the layer. **kwargs: Other keyword arguments for the layer.
""" """
super(SpatialPyramidPooling, self).__init__(**kwargs) super(SpatialPyramidPooling, self).__init__(**kwargs)
...@@ -76,6 +83,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -76,6 +83,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
self.interpolation = interpolation self.interpolation = interpolation
self.input_spec = tf.keras.layers.InputSpec(ndim=4) self.input_spec = tf.keras.layers.InputSpec(ndim=4)
self.pool_kernel_size = pool_kernel_size self.pool_kernel_size = pool_kernel_size
self.use_depthwise_convolution = use_depthwise_convolution
def build(self, input_shape): def build(self, input_shape):
height = input_shape[1] height = input_shape[1]
...@@ -109,9 +117,20 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -109,9 +117,20 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
self.aspp_layers.append(conv_sequential) self.aspp_layers.append(conv_sequential)
for dilation_rate in self.dilation_rates: for dilation_rate in self.dilation_rates:
conv_sequential = tf.keras.Sequential([ leading_layers = []
kernel_size = (3, 3)
if self.use_depthwise_convolution:
leading_layers += [
tf.keras.layers.DepthwiseConv2D(
depth_multiplier=1, kernel_size=kernel_size,
padding='same', depthwise_regularizer=self.kernel_regularizer,
depthwise_initializer=self.kernel_initializer,
dilation_rate=dilation_rate, use_bias=False)
]
kernel_size = (1, 1)
conv_sequential = tf.keras.Sequential(leading_layers + [
tf.keras.layers.Conv2D( tf.keras.layers.Conv2D(
filters=self.output_channels, kernel_size=(3, 3), filters=self.output_channels, kernel_size=kernel_size,
padding='same', kernel_regularizer=self.kernel_regularizer, padding='same', kernel_regularizer=self.kernel_regularizer,
kernel_initializer=self.kernel_initializer, kernel_initializer=self.kernel_initializer,
dilation_rate=dilation_rate, use_bias=False), dilation_rate=dilation_rate, use_bias=False),
......
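For readers skimming this hunk: below is a minimal, standalone sketch (not the library code) of the ASPP branch that gets built when use_depthwise_convolution=True. A dilated 3x3 depthwise convolution gathers spatial context and a 1x1 pointwise convolution mixes channels, approximating the original dilated 3x3 Conv2D at lower cost; batch norm, activation, and regularizers are omitted here for brevity.

import tensorflow as tf

def separable_atrous_branch(output_channels, dilation_rate):
  # Depthwise 3x3 with dilation, followed by a 1x1 pointwise projection.
  return tf.keras.Sequential([
      tf.keras.layers.DepthwiseConv2D(
          kernel_size=(3, 3), depth_multiplier=1, padding='same',
          dilation_rate=dilation_rate, use_bias=False),
      tf.keras.layers.Conv2D(
          filters=output_channels, kernel_size=(1, 1), padding='same',
          use_bias=False),
  ])

branch = separable_atrous_branch(output_channels=256, dilation_rate=6)
print(branch(tf.zeros([1, 33, 33, 64])).shape)  # (1, 33, 33, 256)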
...@@ -91,10 +91,10 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator): ...@@ -91,10 +91,10 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator):
super().__init__(eval_dataset=dataset, options=options) super().__init__(eval_dataset=dataset, options=options)
def eval_begin(self): def eval_begin(self):
return tf.constant((0.0,)) return {"logits": tf.constant((0.0,))}
def eval_reduce(self, state, step_outputs): def eval_reduce(self, state, step_outputs):
state = tf.concat([state, step_outputs], 0) state["logits"] = tf.concat([state["logits"], step_outputs], 0)
return state return state
def eval_step(self, iterator): def eval_step(self, iterator):
...@@ -107,7 +107,7 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator): ...@@ -107,7 +107,7 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator):
self.strategy.run(replica_step, args=(next(iterator),))) self.strategy.run(replica_step, args=(next(iterator),)))
def eval_end(self, outputs): def eval_end(self, outputs):
return tf.reduce_sum(outputs) return tf.reduce_sum(outputs["logits"])
class StandardRunnerTest(parameterized.TestCase): class StandardRunnerTest(parameterized.TestCase):
......
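A tiny standalone sketch (plain TF, no Orbit) of the dict-shaped evaluation state this test now exercises: eval_begin seeds the state, eval_reduce concatenates each step's outputs, and eval_end reduces them.

import tensorflow as tf

state = {"logits": tf.constant([0.0])}                                # eval_begin
for step_outputs in [tf.constant([1.0, 2.0]), tf.constant([3.0])]:
  state["logits"] = tf.concat([state["logits"], step_outputs], 0)     # eval_reduce
print(tf.reduce_sum(state["logits"]).numpy())                         # eval_end -> 6.0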
...@@ -159,6 +159,21 @@ def create_tf_while_loop_fn_with_state(step_fn): ...@@ -159,6 +159,21 @@ def create_tf_while_loop_fn_with_state(step_fn):
"`num_steps` should be a `tf.Tensor`. Passing a Python value can " "`num_steps` should be a `tf.Tensor`. Passing a Python value can "
"cause unnecessary retracing when wrapped by `tf.function`.") "cause unnecessary retracing when wrapped by `tf.function`.")
def _get_relaxed_tensor_shape(t):
"""Returns a `TensorShape` with all `None` dimensions."""
if not tf.is_tensor(t):
return None
shape = t.shape
if shape.rank is not None and shape.rank > 0:
return tf.TensorShape([None] * shape.rank)
return shape
def _get_relaxed_shape_structure(s):
"""Returns the relaxed shape of the input nested structure `s`."""
return tf.nest.pack_sequence_as(
s, [_get_relaxed_tensor_shape(t) for t in tf.nest.flatten(s)])
for _ in tf.range(num_steps): for _ in tf.range(num_steps):
# Clear out the outer name scope so the ops created inside `tf.while_loop` # Clear out the outer name scope so the ops created inside `tf.while_loop`
# don't get "while/" as name prefix. # don't get "while/" as name prefix.
...@@ -167,9 +182,7 @@ def create_tf_while_loop_fn_with_state(step_fn): ...@@ -167,9 +182,7 @@ def create_tf_while_loop_fn_with_state(step_fn):
# across iterations. This is useful to aggregate outputs from each step # across iterations. This is useful to aggregate outputs from each step
# and concat to `state`. # and concat to `state`.
tf.autograph.experimental.set_loop_options( tf.autograph.experimental.set_loop_options(
shape_invariants=[(t, tf.TensorShape([None] * t.shape.rank)) shape_invariants=[(state, _get_relaxed_shape_structure(state))])
for t in tf.nest.flatten(state)
if tf.is_tensor(t)])
outputs = step_fn(iterator) outputs = step_fn(iterator)
state = reduce_fn(state, outputs) state = reduce_fn(state, outputs)
return state return state
......
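A minimal sketch (not taken from the diff) of why the relaxed shape invariants are needed: the aggregation state grows along axis 0 on every iteration, so inside the tf.while_loop its static shape has to be declared with None dimensions, which is exactly what _get_relaxed_shape_structure computes for an arbitrary nested state.

import tensorflow as tf

@tf.function
def accumulate(num_steps):
  state = tf.zeros([0], dtype=tf.float32)
  for i in tf.range(num_steps):
    # Without this, AutoGraph would require `state` to keep a fixed shape
    # across loop iterations.
    tf.autograph.experimental.set_loop_options(
        shape_invariants=[(state, tf.TensorShape([None]))])
    state = tf.concat([state, tf.expand_dims(tf.cast(i, tf.float32), 0)], axis=0)
  return state

print(accumulate(tf.constant(3)).numpy())  # [0. 1. 2.]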
...@@ -46,7 +46,7 @@ def char_accuracy(predictions, targets, rej_char, streaming=False): ...@@ -46,7 +46,7 @@ def char_accuracy(predictions, targets, rej_char, streaming=False):
correct_chars, weights), axis=1), correct_chars, weights), axis=1),
tf.reduce_sum(input_tensor=weights, axis=1)) tf.reduce_sum(input_tensor=weights, axis=1))
if streaming: if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example) return tf.metrics.mean(accuracy_per_example)
else: else:
return tf.reduce_mean(input_tensor=accuracy_per_example) return tf.reduce_mean(input_tensor=accuracy_per_example)
...@@ -87,6 +87,6 @@ def sequence_accuracy(predictions, targets, rej_char, streaming=False): ...@@ -87,6 +87,6 @@ def sequence_accuracy(predictions, targets, rej_char, streaming=False):
accuracy_per_example = tf.cast( accuracy_per_example = tf.cast(
tf.equal(correct_chars_counts, target_chars_counts), dtype=tf.float32) tf.equal(correct_chars_counts, target_chars_counts), dtype=tf.float32)
if streaming: if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example) return tf.metrics.mean(accuracy_per_example)
else: else:
return tf.reduce_mean(input_tensor=accuracy_per_example) return tf.reduce_mean(input_tensor=accuracy_per_example)
...@@ -44,20 +44,24 @@ def log(msg): ...@@ -44,20 +44,24 @@ def log(msg):
class YAMNet(tf.Module): class YAMNet(tf.Module):
"''A TF2 Module wrapper around YAMNet.""" """A TF2 Module wrapper around YAMNet."""
def __init__(self, weights_path, params): def __init__(self, weights_path, params):
super().__init__() super().__init__()
self._yamnet = yamnet.yamnet_frames_model(params) self._yamnet = yamnet.yamnet_frames_model(params)
self._yamnet.load_weights(weights_path) self._yamnet.load_weights(weights_path)
self._class_map_asset = tf.saved_model.Asset('yamnet_class_map.csv') self._class_map_asset = tf.saved_model.Asset('yamnet_class_map.csv')
@tf.function @tf.function(input_signature=[])
def class_map_path(self): def class_map_path(self):
return self._class_map_asset.asset_path return self._class_map_asset.asset_path
@tf.function(input_signature=(tf.TensorSpec(shape=[None], dtype=tf.float32),)) @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)])
def __call__(self, waveform): def __call__(self, waveform):
return self._yamnet(waveform) predictions, embeddings, log_mel_spectrogram = self._yamnet(waveform)
return {'predictions': predictions,
'embeddings': embeddings,
'log_mel_spectrogram': log_mel_spectrogram}
def check_model(model_fn, class_map_path, params): def check_model(model_fn, class_map_path, params):
...@@ -65,7 +69,10 @@ def check_model(model_fn, class_map_path, params): ...@@ -65,7 +69,10 @@ def check_model(model_fn, class_map_path, params):
"""Applies yamnet_test's sanity checks to an instance of YAMNet.""" """Applies yamnet_test's sanity checks to an instance of YAMNet."""
def clip_test(waveform, expected_class_name, top_n=10): def clip_test(waveform, expected_class_name, top_n=10):
predictions, embeddings, log_mel_spectrogram = model_fn(waveform) results = model_fn(waveform=waveform)
predictions = results['predictions']
embeddings = results['embeddings']
log_mel_spectrogram = results['log_mel_spectrogram']
clip_predictions = np.mean(predictions, axis=0) clip_predictions = np.mean(predictions, axis=0)
top_n_indices = np.argsort(clip_predictions)[-top_n:] top_n_indices = np.argsort(clip_predictions)[-top_n:]
top_n_scores = clip_predictions[top_n_indices] top_n_scores = clip_predictions[top_n_indices]
...@@ -106,7 +113,9 @@ def make_tf2_export(weights_path, export_dir): ...@@ -106,7 +113,9 @@ def make_tf2_export(weights_path, export_dir):
# Make TF2 SavedModel export. # Make TF2 SavedModel export.
log('Making TF2 SavedModel export ...') log('Making TF2 SavedModel export ...')
tf.saved_model.save(yamnet, export_dir) tf.saved_model.save(
yamnet, export_dir,
signatures={'serving_default': yamnet.__call__.get_concrete_function()})
log('Done') log('Done')
# Check export with TF-Hub in TF2. # Check export with TF-Hub in TF2.
...@@ -143,7 +152,9 @@ def make_tflite_export(weights_path, export_dir): ...@@ -143,7 +152,9 @@ def make_tflite_export(weights_path, export_dir):
log('Making TF-Lite SavedModel export ...') log('Making TF-Lite SavedModel export ...')
saved_model_dir = os.path.join(export_dir, 'saved_model') saved_model_dir = os.path.join(export_dir, 'saved_model')
os.makedirs(saved_model_dir) os.makedirs(saved_model_dir)
tf.saved_model.save(yamnet, saved_model_dir) tf.saved_model.save(
yamnet, saved_model_dir,
signatures={'serving_default': yamnet.__call__.get_concrete_function()})
log('Done') log('Done')
# Check that the export can be loaded and works. # Check that the export can be loaded and works.
...@@ -154,7 +165,8 @@ def make_tflite_export(weights_path, export_dir): ...@@ -154,7 +165,8 @@ def make_tflite_export(weights_path, export_dir):
# Make a TF-Lite model from the SavedModel. # Make a TF-Lite model from the SavedModel.
log('Making TF-Lite model ...') log('Making TF-Lite model ...')
tflite_converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir) tflite_converter = tf.lite.TFLiteConverter.from_saved_model(
saved_model_dir, signature_keys=['serving_default'])
tflite_model = tflite_converter.convert() tflite_model = tflite_converter.convert()
tflite_model_path = os.path.join(export_dir, 'yamnet.tflite') tflite_model_path = os.path.join(export_dir, 'yamnet.tflite')
with open(tflite_model_path, 'wb') as f: with open(tflite_model_path, 'wb') as f:
...@@ -164,19 +176,8 @@ def make_tflite_export(weights_path, export_dir): ...@@ -164,19 +176,8 @@ def make_tflite_export(weights_path, export_dir):
# Check the TF-Lite export. # Check the TF-Lite export.
log('Checking TF-Lite model ...') log('Checking TF-Lite model ...')
interpreter = tf.lite.Interpreter(tflite_model_path) interpreter = tf.lite.Interpreter(tflite_model_path)
audio_input_index = interpreter.get_input_details()[0]['index'] runner = interpreter.get_signature_runner('serving_default')
scores_output_index = interpreter.get_output_details()[0]['index'] check_model(runner, 'yamnet_class_map.csv', params)
embeddings_output_index = interpreter.get_output_details()[1]['index']
spectrogram_output_index = interpreter.get_output_details()[2]['index']
def run_model(waveform):
interpreter.resize_tensor_input(audio_input_index, [len(waveform)], strict=True)
interpreter.allocate_tensors()
interpreter.set_tensor(audio_input_index, waveform)
interpreter.invoke()
return (interpreter.get_tensor(scores_output_index),
interpreter.get_tensor(embeddings_output_index),
interpreter.get_tensor(spectrogram_output_index))
check_model(run_model, 'yamnet_class_map.csv', params)
log('Done') log('Done')
return saved_model_dir return saved_model_dir
......
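For context, this is roughly how the exported TF-Lite model is now driven through its named signature instead of raw tensor indices. The 'waveform' argument name and the output keys follow the export code above; the file name and the 16 kHz input length are assumptions for illustration.

import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter('yamnet.tflite')
runner = interpreter.get_signature_runner('serving_default')
waveform = np.zeros(3 * 16000, dtype=np.float32)  # 3 seconds of silence (assumed 16 kHz)
outputs = runner(waveform=waveform)
print(outputs['predictions'].shape, outputs['embeddings'].shape)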
# DeepLab: Deep Labelling for Semantic Image Segmentation # DeepLab: Deep Labelling for Semantic Image Segmentation
**To new and existing DeepLab users**: We have released a unified codebase for
dense pixel labeling tasks in TensorFlow 2 at https://github.com/google-research/deeplab2.
Please consider switching to the newer codebase for better support.
DeepLab is a state-of-art deep learning model for semantic image segmentation, DeepLab is a state-of-art deep learning model for semantic image segmentation,
where the goal is to assign semantic labels (e.g., person, dog, cat and so on) where the goal is to assign semantic labels (e.g., person, dog, cat and so on)
to every pixel in the input image. Current implementation includes the following to every pixel in the input image. Current implementation includes the following
......
...@@ -263,7 +263,8 @@ def _build_classification_loss(loss_config): ...@@ -263,7 +263,8 @@ def _build_classification_loss(loss_config):
elif loss_type == 'weighted_dice_classification_loss': elif loss_type == 'weighted_dice_classification_loss':
config = loss_config.weighted_dice_classification_loss config = loss_config.weighted_dice_classification_loss
return losses.WeightedDiceClassificationLoss( return losses.WeightedDiceClassificationLoss(
squared_normalization=config.squared_normalization) squared_normalization=config.squared_normalization,
is_prediction_probability=config.is_prediction_probability)
else: else:
raise ValueError('Empty loss config.') raise ValueError('Empty loss config.')
...@@ -916,7 +916,9 @@ def keypoint_proto_to_params(kp_config, keypoint_map_dict): ...@@ -916,7 +916,9 @@ def keypoint_proto_to_params(kp_config, keypoint_map_dict):
regress_head_kernel_sizes=regress_head_kernel_sizes, regress_head_kernel_sizes=regress_head_kernel_sizes,
score_distance_multiplier=kp_config.score_distance_multiplier, score_distance_multiplier=kp_config.score_distance_multiplier,
std_dev_multiplier=kp_config.std_dev_multiplier, std_dev_multiplier=kp_config.std_dev_multiplier,
rescoring_threshold=kp_config.rescoring_threshold) rescoring_threshold=kp_config.rescoring_threshold,
gaussian_denom_ratio=kp_config.gaussian_denom_ratio,
argmax_postprocessing=kp_config.argmax_postprocessing)
def object_detection_proto_to_params(od_config): def object_detection_proto_to_params(od_config):
...@@ -981,7 +983,8 @@ def object_center_proto_to_params(oc_config): ...@@ -981,7 +983,8 @@ def object_center_proto_to_params(oc_config):
use_labeled_classes=oc_config.use_labeled_classes, use_labeled_classes=oc_config.use_labeled_classes,
keypoint_weights_for_center=keypoint_weights_for_center, keypoint_weights_for_center=keypoint_weights_for_center,
center_head_num_filters=center_head_num_filters, center_head_num_filters=center_head_num_filters,
center_head_kernel_sizes=center_head_kernel_sizes) center_head_kernel_sizes=center_head_kernel_sizes,
peak_max_pool_kernel_size=oc_config.peak_max_pool_kernel_size)
def mask_proto_to_params(mask_config): def mask_proto_to_params(mask_config):
......
...@@ -126,6 +126,8 @@ class ModelBuilderTF2Test( ...@@ -126,6 +126,8 @@ class ModelBuilderTF2Test(
score_distance_multiplier: 11.0 score_distance_multiplier: 11.0
std_dev_multiplier: 2.8 std_dev_multiplier: 2.8
rescoring_threshold: 0.5 rescoring_threshold: 0.5
gaussian_denom_ratio: 0.3
argmax_postprocessing: True
""" """
if customize_head_params: if customize_head_params:
task_proto_txt += """ task_proto_txt += """
...@@ -158,6 +160,7 @@ class ModelBuilderTF2Test( ...@@ -158,6 +160,7 @@ class ModelBuilderTF2Test(
beta: 4.0 beta: 4.0
} }
} }
peak_max_pool_kernel_size: 5
""" """
if customize_head_params: if customize_head_params:
proto_txt += """ proto_txt += """
...@@ -319,6 +322,7 @@ class ModelBuilderTF2Test( ...@@ -319,6 +322,7 @@ class ModelBuilderTF2Test(
else: else:
self.assertEqual(model._center_params.center_head_num_filters, [256]) self.assertEqual(model._center_params.center_head_num_filters, [256])
self.assertEqual(model._center_params.center_head_kernel_sizes, [3]) self.assertEqual(model._center_params.center_head_kernel_sizes, [3])
self.assertEqual(model._center_params.peak_max_pool_kernel_size, 5)
# Check object detection related parameters. # Check object detection related parameters.
self.assertAlmostEqual(model._od_params.offset_loss_weight, 0.1) self.assertAlmostEqual(model._od_params.offset_loss_weight, 0.1)
...@@ -376,6 +380,8 @@ class ModelBuilderTF2Test( ...@@ -376,6 +380,8 @@ class ModelBuilderTF2Test(
self.assertEqual(kp_params.heatmap_head_kernel_sizes, [3]) self.assertEqual(kp_params.heatmap_head_kernel_sizes, [3])
self.assertEqual(kp_params.offset_head_num_filters, [256]) self.assertEqual(kp_params.offset_head_num_filters, [256])
self.assertEqual(kp_params.offset_head_kernel_sizes, [3]) self.assertEqual(kp_params.offset_head_kernel_sizes, [3])
self.assertAlmostEqual(kp_params.gaussian_denom_ratio, 0.3)
self.assertEqual(kp_params.argmax_postprocessing, True)
# Check mask related parameters. # Check mask related parameters.
self.assertAlmostEqual(model._mask_params.task_loss_weight, 0.7) self.assertAlmostEqual(model._mask_params.task_loss_weight, 0.7)
......
...@@ -244,10 +244,10 @@ ...@@ -244,10 +244,10 @@
"\r\n", "\r\n",
" interpreter.invoke()\r\n", " interpreter.invoke()\r\n",
"\r\n", "\r\n",
" boxes = interpreter.get_tensor(output_details[0]['index'])\r\n", " scores = interpreter.get_tensor(output_details[0]['index'])\r\n",
" classes = interpreter.get_tensor(output_details[1]['index'])\r\n", " boxes = interpreter.get_tensor(output_details[1]['index'])\r\n",
" scores = interpreter.get_tensor(output_details[2]['index'])\r\n", " num_detections = interpreter.get_tensor(output_details[2]['index'])\r\n",
" num_detections = interpreter.get_tensor(output_details[3]['index'])\r\n", " classes = interpreter.get_tensor(output_details[3]['index'])\r\n",
"\r\n", "\r\n",
" if include_keypoint:\r\n", " if include_keypoint:\r\n",
" kpts = interpreter.get_tensor(output_details[4]['index'])\r\n", " kpts = interpreter.get_tensor(output_details[4]['index'])\r\n",
...@@ -759,4 +759,4 @@ ...@@ -759,4 +759,4 @@
] ]
} }
] ]
} }
\ No newline at end of file
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "qENhcLrkK9hX"
},
"source": [
"# Generate SSD anchor box aspect ratios using k-means clustering\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KD164da8WQ0U"
},
"source": [
"Many object detection models use anchor boxes as a region-sampling strategy, so that during training, the model learns to match one of several pre-defined anchor boxes to the ground truth bounding boxes. To optimize the accuracy and efficiency of your object detection model, it's helpful if you tune these anchor boxes to fit your model dataset, because the configuration files that comes with TensorFlow's trained checkpoints include aspect ratios that are intended to cover a very broad set of objects.\n",
"\n",
"So in this notebook tutorial, you'll learn how to discover a set of aspect ratios that are custom-fit for your dataset, as discovered through k-means clustering of all the ground-truth bounding-box ratios.\n",
"\n",
"For demonstration purpsoses, we're using a subset of the [PETS dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/) (cats and dogs), which matches some other model training tutorials out there (such as [this one for the Edge TPU](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb#scrollTo=LvEMJSafnyEC)), but you can use this script with a different dataset, and we'll show how to tune it to meet your model's goals, including how to optimize speed over accuracy or accuracy over speed.\n",
"\n",
"The result of this notebook is a new [pipeline `.config` file](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md) that you can copy into your model training script. With the new customized anchor box configuration, you should observe a faster training pipeline and slightly improved model accuracy.\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cNBjMwIvCrhf"
},
"source": [
"## Get the required libraries"
]
},
{
"cell_type": "code",
"metadata": {
"id": "hCQlBGJkZTR2"
},
"source": [
"import tensorflow as tf"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "aw-Ba-5RUhMs"
},
"source": [
"# Install the tensorflow Object Detection API...\n",
"# If you're running this offline, you also might need to install the protobuf-compiler:\n",
"# apt-get install protobuf-compiler\n",
"\n",
"! git clone -n https://github.com/tensorflow/models.git\n",
"%cd models\n",
"!git checkout 461b3587ef38b42cda151fa3b7d37706d77e4244\n",
"%cd research\n",
"! protoc object_detection/protos/*.proto --python_out=.\n",
"\n",
"# Install TensorFlow Object Detection API\n",
"%cp object_detection/packages/tf2/setup.py .\n",
"! python -m pip install --upgrade pip\n",
"! python -m pip install --use-feature=2020-resolver .\n",
"\n",
"# Test the installation\n",
"! python object_detection/builders/model_builder_tf2_test.py"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "InjvvtaMECr9"
},
"source": [
"## Prepare the dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T62-oddjEH8r"
},
"source": [
"Although this notebook does not perform model training, you need to use the same dataset here that you'll use when training the model.\n",
"\n",
"To find the best anchor box ratios, you should use all of your training dataset (or as much of it as is reasonable). That's because, as mentioned in the introduction, you want to measure the precise variety of images that you expect your model to encounter—anything less and the anchor boxes might not cover the variety of objects you model encounters, so it might have weak accuracy. (Whereas the alternative, in which the ratios are based on data that is beyond the scope of your model's application, usually creates an inefficient model that can also have weaker accuracy.)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "sKYfhq7CKZ4B"
},
"source": [
"%mkdir /content/dataset\n",
"%cd /content/dataset\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz\n",
"! tar zxf images.tar.gz\n",
"! tar zxf annotations.tar.gz"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "44vtL0nsAqXg"
},
"source": [
"In this case, we want to reduce the PETS dataset to match the collection of cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)):\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "8gcUoBU2K_s7"
},
"source": [
"! cp /content/dataset/annotations/list.txt /content/dataset/annotations/list_petsdataset.txt\n",
"! cp /content/dataset/annotations/trainval.txt /content/dataset/annotations/trainval_petsdataset.txt\n",
"! cp /content/dataset/annotations/test.txt /content/dataset/annotations/test_petsdataset.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/list_petsdataset.txt > /content/dataset/annotations/list.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/list_petsdataset.txt >> /content/dataset/annotations/list.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/trainval_petsdataset.txt > /content/dataset/annotations/trainval.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/trainval_petsdataset.txt >> /content/dataset/annotations/trainval.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/test_petsdataset.txt > /content/dataset/annotations/test.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/test_petsdataset.txt >> /content/dataset/annotations/test.txt"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Cs_71ZXMOctb"
},
"source": [
"## Find the aspect ratios using k-means"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R3k5WrMYHPyL"
},
"source": [
"We are trying to find a group of aspect ratios that overlap the majority of object shapes in the dataset. We do that by finding common clusters of bounding boxes of the dataset, using the k-means clustering algorithm to find centroids of these clusters.\n",
"\n",
"To help with this, we need to calculate following:\n",
"\n",
"+ The k-means cluster centroids of the given bounding boxes\n",
"(see the `kmeans_aspect_ratios()` function below).\n",
"\n",
"+ The average intersection of bounding boxes with given aspect ratios.\n",
"(see the `average_iou()` function below).\n",
"This does not affect the outcome of the final box ratios, but serves as a useful metric for you to decide whether the selected boxes are effective and whether you want to try with more/fewer aspect ratios. (We'll discuss this score more below.)\n",
"\n",
"**NOTE:**\n",
"The term \"centroid\" used here refers to the center of the k-means cluster (the boxes (height,width) vector)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "vCB8Dfs0Xlyv"
},
"source": [
"import sys\n",
"import glob\n",
"import numpy as np\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"def xml_to_boxes(path, classes, rescale_width=None, rescale_height=None):\n",
" \"\"\"Extracts bounding-box widths and heights from ground-truth dataset.\n",
"\n",
" Args:\n",
" path : Path to .xml annotation files for your dataset.\n",
" classes : List of classes that are part of dataset.\n",
" rescale_width : Scaling factor to rescale width of bounding box.\n",
" rescale_height : Scaling factor to rescale height of bounding box.\n",
"\n",
" Returns:\n",
" bboxes : A numpy array with pairs of box dimensions as [width, height].\n",
" \"\"\"\n",
"\n",
" xml_list = []\n",
" for clss in classes:\n",
" for xml_file in glob.glob(path + '/'+clss+'*'):\n",
" if xml_file.endswith('.xml'):\n",
" tree = ET.parse(xml_file)\n",
" root = tree.getroot()\n",
" for member in root.findall('object'):\n",
" bndbox = member.find('bndbox')\n",
" bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
" bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
" if rescale_width and rescale_height:\n",
" size = root.find('size')\n",
" bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
" bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
"\n",
" xml_list.append([bbox_width, bbox_height])\n",
" else:\n",
" continue\n",
" bboxes = np.array(xml_list)\n",
" return bboxes\n",
"\n",
"\n",
"def average_iou(bboxes, anchors):\n",
" \"\"\"Calculates the Intersection over Union (IoU) between bounding boxes and\n",
" anchors.\n",
"\n",
" Args:\n",
" bboxes : Array of bounding boxes in [width, height] format.\n",
" anchors : Array of aspect ratios [n, 2] format.\n",
"\n",
" Returns:\n",
" avg_iou_perc : A Float value, average of IOU scores from each aspect ratio\n",
" \"\"\"\n",
" intersection_width = np.minimum(anchors[:, [0]], bboxes[:, 0]).T\n",
" intersection_height = np.minimum(anchors[:, [1]], bboxes[:, 1]).T\n",
"\n",
" if np.any(intersection_width == 0) or np.any(intersection_height == 0):\n",
" raise ValueError(\"Some boxes have zero size.\")\n",
"\n",
" intersection_area = intersection_width * intersection_height\n",
" boxes_area = np.prod(bboxes, axis=1, keepdims=True)\n",
" anchors_area = np.prod(anchors, axis=1, keepdims=True).T\n",
" union_area = boxes_area + anchors_area - intersection_area\n",
" avg_iou_perc = np.mean(np.max(intersection_area / union_area, axis=1)) * 100\n",
"\n",
" return avg_iou_perc\n",
"\n",
"def kmeans_aspect_ratios(bboxes, kmeans_max_iter, num_aspect_ratios):\n",
" \"\"\"Calculate the centroid of bounding boxes clusters using Kmeans algorithm.\n",
"\n",
" Args:\n",
" bboxes : Array of bounding boxes in [width, height] format.\n",
" kmeans_max_iter : Maximum number of iterations to find centroids.\n",
" num_aspect_ratios : Number of centroids to optimize kmeans.\n",
"\n",
" Returns:\n",
" aspect_ratios : Centroids of cluster (optmised for dataset).\n",
" avg_iou_prec : Average score of bboxes intersecting with new aspect ratios.\n",
" \"\"\"\n",
"\n",
" assert len(bboxes), \"You must provide bounding boxes\"\n",
"\n",
" normalized_bboxes = bboxes / np.sqrt(bboxes.prod(axis=1, keepdims=True))\n",
"\n",
" # Using kmeans to find centroids of the width/height clusters\n",
" kmeans = KMeans(\n",
" init='random', n_clusters=num_aspect_ratios,random_state=0, max_iter=kmeans_max_iter)\n",
" kmeans.fit(X=normalized_bboxes)\n",
" ar = kmeans.cluster_centers_\n",
"\n",
" assert len(ar), \"Unable to find k-means centroid, try increasing kmeans_max_iter.\"\n",
"\n",
" avg_iou_perc = average_iou(normalized_bboxes, ar)\n",
"\n",
" if not np.isfinite(avg_iou_perc):\n",
" sys.exit(\"Failed to get aspect ratios due to numerical errors in k-means\")\n",
"\n",
" aspect_ratios = [w/h for w,h in ar]\n",
"\n",
" return aspect_ratios, avg_iou_perc"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "eU2SuLvu55Ds"
},
"source": [
"In the next code block, we'll call the above functions to discover the ideal anchor box aspect ratios.\n",
"\n",
"You can tune the parameters below to suit your performance objectives.\n",
"\n",
"Most importantly, you should consider the number of aspect ratios you want to generate. At opposite ends of the decision spectrum, there are two objectives you might seek:\n",
"\n",
"1. **Low accuracy and fast inference**: Try 2-3 aspect ratios. \n",
" * This is if your application is okay with accuracy or confidence scores around/below 80%.\n",
" * The average IOU score (from `avg_iou_perc`) will be around 70-85.\n",
" * This reduces the model's overall computations during inference, which makes inference faster.\n",
"\n",
"2. **High accuracy and slow inference**: Try 5-6 aspect ratios.\n",
" * This is if your application requires accuracy or confidence scores around 95%.\n",
" * The average IOU score (from `avg_iou_perc`) should be over 95.\n",
" * This increases the model's overall computations during inference, which makes inference slower.\n",
"\n",
"The initial configuration below aims somewhere in between: it searches for 4 aspect ratios.\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "cNw-vX3nfl1g"
},
"source": [
"classes = ['Abyssinian','american_bulldog']\n",
"xml_path = '/content/dataset/annotations/xmls'\n",
"\n",
"# Tune this based on your accuracy/speed goals as described above\n",
"num_aspect_ratios = 4 # can be [2,3,4,5,6]\n",
"\n",
"# Tune the iterations based on the size and distribution of your dataset\n",
"# You can check avg_iou_prec every 100 iterations to see how centroids converge\n",
"kmeans_max_iter = 500\n",
"\n",
"# These should match the training pipeline config ('fixed_shape_resizer' param)\n",
"width = 320\n",
"height = 320\n",
"\n",
"# Get the ground-truth bounding boxes for our dataset\n",
"bboxes = xml_to_boxes(path=xml_path, classes=classes,\n",
" rescale_width=width, rescale_height=height)\n",
"\n",
"aspect_ratios, avg_iou_perc = kmeans_aspect_ratios(\n",
" bboxes=bboxes,\n",
" kmeans_max_iter=kmeans_max_iter,\n",
" num_aspect_ratios=num_aspect_ratios)\n",
"\n",
"aspect_ratios = sorted(aspect_ratios)\n",
"\n",
"print('Aspect ratios generated:', [round(ar,2) for ar in aspect_ratios])\n",
"print('Average IOU with anchors:', avg_iou_perc)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0xHqOpuxgmD0"
},
"source": [
"## Generate a new pipeline config file"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZB6jqVT6gpmT"
},
"source": [
"That's it. Now we just need the `.config` file your model started with, and we'll merge the new `ssd_anchor_generator` properties into it."
]
},
{
"cell_type": "code",
"metadata": {
"id": "AlMffd3rgKW2"
},
"source": [
"import tensorflow as tf\n",
"from google.protobuf import text_format\n",
"from object_detection.protos import pipeline_pb2\n",
"\n",
"pipeline = pipeline_pb2.TrainEvalPipelineConfig()\n",
"config_path = '/content/models/research/object_detection/samples/configs/ssdlite_mobiledet_edgetpu_320x320_coco_sync_4x4.config'\n",
"pipeline_save = '/content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config'\n",
"with tf.io.gfile.GFile(config_path, \"r\") as f:\n",
" proto_str = f.read()\n",
" text_format.Merge(proto_str, pipeline)\n",
"pipeline.model.ssd.num_classes = 2\n",
"while pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios:\n",
" pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios.pop()\n",
"\n",
"for i in range(len(aspect_ratios)):\n",
" pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios.append(aspect_ratios[i])\n",
"\n",
"config_text = text_format.MessageToString(pipeline)\n",
"with tf.io.gfile.GFile(pipeline_save, \"wb\") as f:\n",
" f.write(config_text)\n",
"# Check for updated aspect ratios in the config\n",
"!cat /content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "3kzWdu7ai1om"
},
"source": [
"## Summary and next steps"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FltDhShbi06h"
},
"source": [
"If you look at the new `.config` file printed above, you'll find the `anchor_generator` specification, which includes the new `aspect_ratio` values that we generated with the k-means code above.\n",
"\n",
"The original config file ([`ssdlite_mobiledet_edgetpu_320x320_coco_sync_4x4.config`](https://github.com/tensorflow/models/blob/master/research/object_detection/samples/configs/ssd_mobilenet_v1_pets.config)) did have some default anchor box aspect ratios already, but we've replaced those with values that are optimized for our dataset. These new anchor boxes should improve the model accuracy (compared to the default anchors) and speed up the training process.\n",
"\n",
"If you want to use this configuration to train a model, then check out this tutorial to [retrain MobileDet for the Coral Edge TPU](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb), which uses this exact cats/dogs dataset. Just copy the `.config` file printed above and add it to that training notebook. (Or download the file from the **Files** panel on the left side of the Colab UI: it's called `ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config`.)\n",
"\n",
"For more information about the pipeline configuration file, read [Configuring the Object Detection Training Pipeline](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md).\n",
"\n",
"### About anchor scales...\n",
"\n",
"This notebook is focused on anchor box aspect ratios because that's often the most difficult to tune for each dataset. But you should also consider different configurations for the anchor box scales, which specify the number of different anchor box sizes and their min/max sizes—which affects how well your model detects objects of varying sizes.\n",
"\n",
"Tuning the anchor scales is much easier to do by hand, by estimating the min/max sizes you expect the model to encounter in your application environment. Just like when choosing the number of aspect ratios above, the number of different box sizes also affects your model accuracy and speed (using more box scales is more accurate, but also slower).\n",
"\n",
"You can also read more about anchor scales in [Configuring the Object Detection Training Pipeline](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md).\n",
"\n"
]
}
]
}
\ No newline at end of file
...@@ -286,15 +286,19 @@ class WeightedDiceClassificationLoss(Loss): ...@@ -286,15 +286,19 @@ class WeightedDiceClassificationLoss(Loss):
""" """
def __init__(self, squared_normalization): def __init__(self, squared_normalization, is_prediction_probability=False):
"""Initializes the loss object. """Initializes the loss object.
Args: Args:
squared_normalization: boolean, if set, we square the probabilities in the squared_normalization: boolean, if set, we square the probabilities in the
denominator term used for normalization. denominator term used for normalization.
is_prediction_probability: boolean, whether or not the input
prediction_tensor represents a probability. If false, it is
first converted to a probability by applying sigmoid.
""" """
self._squared_normalization = squared_normalization self._squared_normalization = squared_normalization
self.is_prediction_probability = is_prediction_probability
super(WeightedDiceClassificationLoss, self).__init__() super(WeightedDiceClassificationLoss, self).__init__()
def _compute_loss(self, def _compute_loss(self,
...@@ -332,7 +336,10 @@ class WeightedDiceClassificationLoss(Loss): ...@@ -332,7 +336,10 @@ class WeightedDiceClassificationLoss(Loss):
tf.shape(prediction_tensor)[2]), tf.shape(prediction_tensor)[2]),
[1, 1, -1]) [1, 1, -1])
prob_tensor = tf.nn.sigmoid(prediction_tensor) if self.is_prediction_probability:
prob_tensor = prediction_tensor
else:
prob_tensor = tf.nn.sigmoid(prediction_tensor)
if self._squared_normalization: if self._squared_normalization:
prob_tensor = tf.pow(prob_tensor, 2) prob_tensor = tf.pow(prob_tensor, 2)
......
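A tiny sketch of the branch this hunk introduces: when is_prediction_probability is set the tensor is used as-is, otherwise logits are first pushed through a sigmoid. The helper name here is illustrative, not the loss class itself.

import tensorflow as tf

def to_probabilities(prediction_tensor, is_prediction_probability=False):
  if is_prediction_probability:
    return prediction_tensor               # already in [0, 1], use directly
  return tf.nn.sigmoid(prediction_tensor)  # convert logits to probabilities

print(to_probabilities(tf.constant([0.0, 2.0])).numpy())         # sigmoid applied
print(to_probabilities(tf.constant([0.25, 0.9]), True).numpy())  # passed through unchanged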
...@@ -388,6 +388,28 @@ def _clip_window_prune_boxes(sorted_boxes, clip_window, pad_to_max_output_size, ...@@ -388,6 +388,28 @@ def _clip_window_prune_boxes(sorted_boxes, clip_window, pad_to_max_output_size,
return sorted_boxes, num_valid_nms_boxes_cumulative return sorted_boxes, num_valid_nms_boxes_cumulative
def _clip_boxes(boxes, clip_window):
"""Clips boxes to the given window.
Args:
boxes: A [batch, num_boxes, 4] float32 tensor containing box coordinates in
[ymin, xmin, ymax, xmax] form.
clip_window: A [batch, 4] float32 tensor with the top-left and bottom-right
coordinates of the window in [ymin, xmin, ymax, xmax] form.
Returns:
A [batch, num_boxes, 4] float32 tensor containing boxes clipped to the given
window.
"""
ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
clipped_ymin = tf.maximum(ymin, clip_window[:, 0, tf.newaxis])
clipped_xmin = tf.maximum(xmin, clip_window[:, 1, tf.newaxis])
clipped_ymax = tf.minimum(ymax, clip_window[:, 2, tf.newaxis])
clipped_xmax = tf.minimum(xmax, clip_window[:, 3, tf.newaxis])
return tf.stack([clipped_ymin, clipped_xmin, clipped_ymax, clipped_xmax],
axis=-1)
class NullContextmanager(object): class NullContextmanager(object):
def __enter__(self): def __enter__(self):
...@@ -985,10 +1007,10 @@ def batch_multiclass_non_max_suppression(boxes, ...@@ -985,10 +1007,10 @@ def batch_multiclass_non_max_suppression(boxes,
raise ValueError('Soft NMS is not supported by combined_nms.') raise ValueError('Soft NMS is not supported by combined_nms.')
if use_class_agnostic_nms: if use_class_agnostic_nms:
raise ValueError('class-agnostic NMS is not supported by combined_nms.') raise ValueError('class-agnostic NMS is not supported by combined_nms.')
if clip_window is not None: if clip_window is None:
tf.logging.warning( tf.logging.warning(
'clip_window is not supported by combined_nms unless it is' 'A default clip window of [0. 0. 1. 1.] will be applied for the '
' [0. 0. 1. 1.] for each image.') 'boxes.')
if additional_fields is not None: if additional_fields is not None:
tf.logging.warning('additional_fields is not supported by combined_nms.') tf.logging.warning('additional_fields is not supported by combined_nms.')
if parallel_iterations != 32: if parallel_iterations != 32:
...@@ -1007,7 +1029,14 @@ def batch_multiclass_non_max_suppression(boxes, ...@@ -1007,7 +1029,14 @@ def batch_multiclass_non_max_suppression(boxes,
max_total_size=max_total_size, max_total_size=max_total_size,
iou_threshold=iou_thresh, iou_threshold=iou_thresh,
score_threshold=score_thresh, score_threshold=score_thresh,
clip_boxes=(True if clip_window is None else False),
pad_per_class=use_static_shapes) pad_per_class=use_static_shapes)
if clip_window is not None:
if clip_window.shape.ndims == 1:
boxes_shape = boxes.shape
batch_size = shape_utils.get_dim_as_int(boxes_shape[0])
clip_window = tf.tile(clip_window[tf.newaxis, :], [batch_size, 1])
batch_nmsed_boxes = _clip_boxes(batch_nmsed_boxes, clip_window)
# Not supported by combined_non_max_suppression. # Not supported by combined_non_max_suppression.
batch_nmsed_masks = None batch_nmsed_masks = None
# Not supported by combined_non_max_suppression. # Not supported by combined_non_max_suppression.
......
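A worked standalone example of the clipping path added above: a rank-1 clip window is tiled to [batch, 4] and each box coordinate is clamped to it, mirroring what _clip_boxes does after combined NMS.

import tensorflow as tf

boxes = tf.constant([[[-0.1, 0.2, 0.5, 1.3],
                      [0.3, 0.4, 0.9, 0.8]]])    # [1, 2, 4] in [ymin, xmin, ymax, xmax]
clip_window = tf.constant([0.0, 0.0, 1.0, 1.0])  # shared by every image in the batch
clip_window = tf.tile(clip_window[tf.newaxis, :], [tf.shape(boxes)[0], 1])
ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
clipped = tf.stack([tf.maximum(ymin, clip_window[:, 0, tf.newaxis]),
                    tf.maximum(xmin, clip_window[:, 1, tf.newaxis]),
                    tf.minimum(ymax, clip_window[:, 2, tf.newaxis]),
                    tf.minimum(xmax, clip_window[:, 3, tf.newaxis])], axis=-1)
print(clipped.numpy()[0, 0])  # [0.  0.2 0.5 1. ]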
...@@ -961,7 +961,8 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -961,7 +961,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
width, width,
gt_boxes_list, gt_boxes_list,
gt_classes_list, gt_classes_list,
gt_weights_list=None): gt_weights_list=None,
maximum_normalized_coordinate=1.1):
"""Computes the object center heatmap target. """Computes the object center heatmap target.
Args: Args:
...@@ -977,6 +978,9 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -977,6 +978,9 @@ class CenterNetCenterHeatmapTargetAssigner(object):
in the gt_boxes_list. in the gt_boxes_list.
gt_weights_list: A list of float tensors with shape [num_boxes] gt_weights_list: A list of float tensors with shape [num_boxes]
representing the weight of each groundtruth detection box. representing the weight of each groundtruth detection box.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns: Returns:
heatmap: A Tensor of size [batch_size, output_height, output_width, heatmap: A Tensor of size [batch_size, output_height, output_width,
...@@ -1002,7 +1006,8 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -1002,7 +1006,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
boxes = box_list_ops.to_absolute_coordinates( boxes = box_list_ops.to_absolute_coordinates(
boxes, boxes,
tf.maximum(height // self._stride, 1), tf.maximum(height // self._stride, 1),
tf.maximum(width // self._stride, 1)) tf.maximum(width // self._stride, 1),
maximum_normalized_coordinate=maximum_normalized_coordinate)
# Get the box center coordinates. Each returned tensors have the shape of # Get the box center coordinates. Each returned tensors have the shape of
# [num_instances] # [num_instances]
(y_center, x_center, boxes_height, (y_center, x_center, boxes_height,
......
...@@ -782,6 +782,269 @@ def prediction_to_single_instance_keypoints( ...@@ -782,6 +782,269 @@ def prediction_to_single_instance_keypoints(
return keypoint_candidates, keypoint_scores, None return keypoint_candidates, keypoint_scores, None
def _gaussian_weighted_map_const_multi(
y_grid, x_grid, heatmap, points_y, points_x, boxes,
gaussian_denom_ratio):
"""Rescores heatmap using the distance information.
The function is called when the candidate_ranking_mode in the
KeypointEstimationParams is set to be 'gaussian_weighted_const'. The
keypoint candidates are ranked using the formula:
heatmap_score * exp((-distances^2) / (gaussian_denom))
where 'gaussian_denom' is determined by:
min(output_feature_height, output_feature_width) * gaussian_denom_ratio
and the 'distances' are the distances between the grid coordinates and the target
points.
Note that the postfix 'const' refers to the fact that the denominator is a
constant given the input image size, not scaled by the size of each of the
instances.
Args:
y_grid: A float tensor with shape [height, width] representing the
y-coordinate of each pixel grid.
x_grid: A float tensor with shape [height, width] representing the
x-coordinate of each pixel grid.
heatmap: A float tensor with shape [batch_size, height, width,
num_keypoints] representing the heatmap to be rescored.
points_y: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the y coordinates of the target points for
each channel.
points_x: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the x coordinates of the target points for
each channel.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
gaussian_denom_ratio: A constant used in the above formula that determines
the denominator of the Gaussian kernel.
Returns:
A float tensor with shape [batch_size, height, width, channel] representing
the rescored heatmap.
"""
batch_size, num_instances, _ = _get_shape(boxes, 3)
_, height, width, num_keypoints = _get_shape(heatmap, 4)
# [batch_size, height, width, num_instances, num_keypoints].
# Note that we intentionally avoid using tf.newaxis as TfLite converter
# doesn't like it.
y_diff = (
tf.reshape(y_grid, [1, height, width, 1, 1]) -
tf.reshape(points_y, [batch_size, 1, 1, num_instances, num_keypoints]))
x_diff = (
tf.reshape(x_grid, [1, height, width, 1, 1]) -
tf.reshape(points_x, [batch_size, 1, 1, num_instances, num_keypoints]))
distance_square = y_diff**2 + x_diff**2
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=2)
# Make the mask with all 1.0 in the box regions.
# Shape: [batch_size, height, width, num_instances]
in_boxes = tf.math.logical_and(
tf.math.logical_and(
tf.reshape(y_grid, [1, height, width, 1]) >= tf.reshape(
y_min, [batch_size, 1, 1, num_instances]),
tf.reshape(y_grid, [1, height, width, 1]) < tf.reshape(
y_max, [batch_size, 1, 1, num_instances])),
tf.math.logical_and(
tf.reshape(x_grid, [1, height, width, 1]) >= tf.reshape(
x_min, [batch_size, 1, 1, num_instances]),
tf.reshape(x_grid, [1, height, width, 1]) < tf.reshape(
x_max, [batch_size, 1, 1, num_instances])))
in_boxes = tf.cast(in_boxes, dtype=tf.float32)
gaussian_denom = tf.cast(
tf.minimum(height, width), dtype=tf.float32) * gaussian_denom_ratio
# shape: [batch_size, height, width, num_instances, num_keypoints]
gaussian_map = tf.exp((-1 * distance_square) / gaussian_denom)
return tf.expand_dims(
heatmap, axis=3) * gaussian_map * tf.reshape(
in_boxes, [batch_size, height, width, num_instances, 1])
def prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap_predictions,
keypoint_heatmap_offsets,
keypoint_score_heatmap=None):
"""Converts keypoint heatmap predictions and offsets to keypoint candidates.
This function is similar to the 'prediction_tensors_to_single_instance_kpts'
function except that the input keypoint_heatmap_predictions is prepared to
have an additional 'num_instances' dimension for multi-instance prediction.
Args:
keypoint_heatmap_predictions: A float tensor of shape [batch_size, height,
width, num_instances, num_keypoints] representing the per-keypoint and
per-instance heatmaps which is used for finding the best keypoint
candidate locations.
keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
width, 2 * num_keypoints] representing the per-keypoint offsets.
keypoint_score_heatmap: (optional) A float tensor of shape [batch_size,
height, width, num_keypoints] representing the heatmap
which is used for reporting the confidence scores. If not provided, then
the values in the keypoint_heatmap_predictions will be used.
Returns:
keypoint_candidates: A tensor of shape
[batch_size, max_candidates, num_keypoints, 2] holding the
location of keypoint candidates in [y, x] format (expressed in absolute
coordinates in the output coordinate frame).
keypoint_scores: A float tensor of shape
[batch_size, max_candidates, num_keypoints] with the scores for each
keypoint candidate. The scores come directly from the heatmap predictions.
"""
batch_size, height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 5)
# [batch_size, height * width, num_instances * num_keypoints].
feature_map_flattened = tf.reshape(
keypoint_heatmap_predictions,
[batch_size, -1, num_instances * num_keypoints])
# [batch_size, num_instances * num_keypoints].
peak_flat_indices = tf.math.argmax(
feature_map_flattened, axis=1, output_type=tf.dtypes.int32)
# Get x and y indices corresponding to the top indices in the flat array.
y_indices, x_indices = (
row_col_indices_from_flattened_indices(peak_flat_indices, width))
# [batch_size * num_instances * num_keypoints].
y_indices = tf.reshape(y_indices, [-1])
x_indices = tf.reshape(x_indices, [-1])
# Prepare the indices to gather the offsets from the keypoint_heatmap_offsets.
batch_idx = _multi_range(
limit=batch_size, value_repetitions=num_keypoints * num_instances)
kpts_idx = _multi_range(
limit=num_keypoints, value_repetitions=1,
range_repetitions=batch_size * num_instances)
combined_indices = tf.stack([
batch_idx,
y_indices,
x_indices,
kpts_idx
], axis=1)
keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, 2])
# Retrieve the keypoint offsets: shape:
# [batch_size * num_instance * num_keypoints, 2].
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
keypoint_candidates = tf.stack([
tf.cast(y_indices, dtype=tf.float32) + tf.expand_dims(y_offsets, axis=0),
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2)
keypoint_candidates = tf.reshape(
keypoint_candidates, [batch_size, num_instances, num_keypoints, 2])
if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd(
tf.reduce_max(keypoint_heatmap_predictions, axis=3), combined_indices)
else:
keypoint_scores = tf.gather_nd(keypoint_score_heatmap, combined_indices)
return keypoint_candidates, tf.reshape(
keypoint_scores, [batch_size, num_instances, num_keypoints])
def prediction_to_keypoints_argmax(
prediction_dict,
object_y_indices,
object_x_indices,
boxes,
task_name,
kp_params):
"""Postprocess function to predict multi instance keypoints with argmax op.
This is a different implementation of the original keypoint postprocessing
function such that it avoids using topk op (replaced by argmax) as it runs
much slower in the browser.
Args:
prediction_dict: a dictionary holding predicted tensors, returned from the
predict() method. This dictionary should contain keypoint prediction
feature maps for each keypoint task.
object_y_indices: A float tensor of shape [batch_size, max_instances]
representing the location indices of the object centers.
object_x_indices: A float tensor of shape [batch_size, max_instances]
representing the location indices of the object centers.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
task_name: string, the name of the task this namedtuple corresponds to.
Note that it should be a unique identifier of the task.
kp_params: A `KeypointEstimationParams` object with parameters for a single
keypoint class.
Returns:
A tuple of two tensors:
keypoint_candidates: A float tensor with shape [batch_size,
num_instances, num_keypoints, 2] representing the yx-coordinates of
the keypoints in the output feature map space.
keypoint_scores: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the keypoint prediction scores.
Raises:
ValueError: if the candidate_ranking_mode is not supported.
"""
keypoint_heatmap = tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1])
keypoint_offset = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
keypoint_regression = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
batch_size, height, width, num_keypoints = _get_shape(keypoint_heatmap, 4)
# Create the y,x grids: [height, width]
(y_grid, x_grid) = ta_utils.image_shape_to_grids(height, width)
# Prepare the indices to retrieve the information from object centers.
num_instances = _get_shape(object_y_indices, 2)[1]
combined_obj_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_instances),
tf.reshape(object_y_indices, [-1]),
tf.reshape(object_x_indices, [-1])
], axis=1)
# Select the regression vectors from the object center.
selected_regression_flat = tf.gather_nd(
keypoint_regression, combined_obj_indices)
selected_regression = tf.reshape(
selected_regression_flat, [batch_size, num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=3)
# shape: [batch_size, num_instances, num_keypoints].
y_regressed = tf.cast(
tf.reshape(object_y_indices, [batch_size, num_instances, 1]),
dtype=tf.float32) + y_reg
x_regressed = tf.cast(
tf.reshape(object_x_indices, [batch_size, num_instances, 1]),
dtype=tf.float32) + x_reg
if kp_params.candidate_ranking_mode == 'gaussian_weighted_const':
rescored_heatmap = _gaussian_weighted_map_const_multi(
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed, boxes,
kp_params.gaussian_denom_ratio)
# shape: [batch_size, height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=3)
else:
raise ValueError(
'Unsupported ranking mode in the multipose no topk method: %s' %
kp_params.candidate_ranking_mode)
(keypoint_candidates,
keypoint_scores) = prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap_predictions=rescored_heatmap,
keypoint_heatmap_offsets=keypoint_offset,
keypoint_score_heatmap=keypoint_score_heatmap)
return keypoint_candidates, keypoint_scores
def regressed_keypoints_at_object_centers(regressed_keypoint_predictions, def regressed_keypoints_at_object_centers(regressed_keypoint_predictions,
y_indices, x_indices): y_indices, x_indices):
"""Returns the regressed keypoints at specified object centers. """Returns the regressed keypoints at specified object centers.
...@@ -1533,15 +1796,9 @@ def convert_strided_predictions_to_normalized_keypoints( ...@@ -1533,15 +1796,9 @@ def convert_strided_predictions_to_normalized_keypoints(
keypoints, window = inputs keypoints, window = inputs
return keypoint_ops.clip_to_window(keypoints, window) return keypoint_ops.clip_to_window(keypoints, window)
# Specify the TensorSpec explicitly in the tf.map_fn to make it tf.lite keypoint_coords_normalized = shape_utils.static_or_dynamic_map_fn(
# compatible. clip_to_window, [keypoint_coords_normalized, batch_window],
kpts_dims = _get_shape(keypoint_coords_normalized, 4) dtype=tf.float32, back_prop=False)
output_spec = tf.TensorSpec(
shape=[kpts_dims[1], kpts_dims[2], kpts_dims[3]], dtype=tf.float32)
keypoint_coords_normalized = tf.map_fn(
clip_to_window, (keypoint_coords_normalized, batch_window),
dtype=tf.float32, back_prop=False,
fn_output_signature=output_spec)
keypoint_scores = tf.where(valid_indices, keypoint_scores, keypoint_scores = tf.where(valid_indices, keypoint_scores,
tf.zeros_like(keypoint_scores)) tf.zeros_like(keypoint_scores))
return keypoint_coords_normalized, keypoint_scores return keypoint_coords_normalized, keypoint_scores
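The change above pins the per-element output shape of tf.map_fn with an explicit tf.TensorSpec, which is what keeps the clipping op convertible to TFLite. A self-contained sketch of the same idiom, using an illustrative clip function rather than the module's keypoint_ops.clip_to_window:

import tensorflow as tf

def clip_to_unit_window(inputs):
  keypoints, _window = inputs  # stand-in: clip (y, x) pairs to [0, 1]
  return tf.clip_by_value(keypoints, 0.0, 1.0)

keypoints = tf.random.uniform([2, 5, 17, 2]) * 1.2  # [batch, instances, kpts, 2]
windows = tf.tile(tf.constant([[0.0, 0.0, 1.0, 1.0]]), [2, 1])

# Declaring the element shape up front avoids dynamic shapes in the converter.
output_spec = tf.TensorSpec(shape=[5, 17, 2], dtype=tf.float32)
clipped = tf.map_fn(clip_to_unit_window, (keypoints, windows),
                    fn_output_signature=output_spec)  # [2, 5, 17, 2]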
...@@ -1900,7 +2157,8 @@ class KeypointEstimationParams( ...@@ -1900,7 +2157,8 @@ class KeypointEstimationParams(
'heatmap_head_kernel_sizes', 'offset_head_num_filters', 'heatmap_head_kernel_sizes', 'offset_head_num_filters',
'offset_head_kernel_sizes', 'regress_head_num_filters', 'offset_head_kernel_sizes', 'regress_head_num_filters',
'regress_head_kernel_sizes', 'score_distance_multiplier', 'regress_head_kernel_sizes', 'score_distance_multiplier',
'std_dev_multiplier', 'rescoring_threshold' 'std_dev_multiplier', 'rescoring_threshold', 'gaussian_denom_ratio',
'argmax_postprocessing'
])): ])):
"""Namedtuple to host object detection related parameters. """Namedtuple to host object detection related parameters.
...@@ -1948,7 +2206,9 @@ class KeypointEstimationParams( ...@@ -1948,7 +2206,9 @@ class KeypointEstimationParams(
regress_head_kernel_sizes=(3), regress_head_kernel_sizes=(3),
score_distance_multiplier=0.1, score_distance_multiplier=0.1,
std_dev_multiplier=1.0, std_dev_multiplier=1.0,
rescoring_threshold=0.0): rescoring_threshold=0.0,
argmax_postprocessing=False,
gaussian_denom_ratio=0.1):
"""Constructor with default values for KeypointEstimationParams. """Constructor with default values for KeypointEstimationParams.
Args: Args:
...@@ -2049,6 +2309,12 @@ class KeypointEstimationParams( ...@@ -2049,6 +2309,12 @@ class KeypointEstimationParams(
True. The detection score of an instance is set to be the average over True. The detection score of an instance is set to be the average over
the scores of the keypoints which their scores higher than the the scores of the keypoints which their scores higher than the
threshold. threshold.
argmax_postprocessing: Whether to use the keypoint postprocessing logic
that replaces the topk op with argmax. Usually used when exporting the
model for predicting keypoints of multiple instances in the browser.
gaussian_denom_ratio: The ratio multiplied by the image size to determine
the denominator of the Gaussian formula. Only applicable when
candidate_ranking_mode is set to 'gaussian_weighted_const'.
Returns: Returns:
An initialized KeypointEstimationParams namedtuple. An initialized KeypointEstimationParams namedtuple.
...@@ -2067,7 +2333,8 @@ class KeypointEstimationParams( ...@@ -2067,7 +2333,8 @@ class KeypointEstimationParams(
heatmap_head_num_filters, heatmap_head_kernel_sizes, heatmap_head_num_filters, heatmap_head_kernel_sizes,
offset_head_num_filters, offset_head_kernel_sizes, offset_head_num_filters, offset_head_kernel_sizes,
regress_head_num_filters, regress_head_kernel_sizes, regress_head_num_filters, regress_head_kernel_sizes,
score_distance_multiplier, std_dev_multiplier, rescoring_threshold) score_distance_multiplier, std_dev_multiplier, rescoring_threshold,
argmax_postprocessing, gaussian_denom_ratio)
class ObjectCenterParams( class ObjectCenterParams(
...@@ -2075,7 +2342,7 @@ class ObjectCenterParams( ...@@ -2075,7 +2342,7 @@ class ObjectCenterParams(
'classification_loss', 'object_center_loss_weight', 'heatmap_bias_init', 'classification_loss', 'object_center_loss_weight', 'heatmap_bias_init',
'min_box_overlap_iou', 'max_box_predictions', 'use_labeled_classes', 'min_box_overlap_iou', 'max_box_predictions', 'use_labeled_classes',
'keypoint_weights_for_center', 'center_head_num_filters', 'keypoint_weights_for_center', 'center_head_num_filters',
'center_head_kernel_sizes' 'center_head_kernel_sizes', 'peak_max_pool_kernel_size'
])): ])):
"""Namedtuple to store object center prediction related parameters.""" """Namedtuple to store object center prediction related parameters."""
...@@ -2090,7 +2357,8 @@ class ObjectCenterParams( ...@@ -2090,7 +2357,8 @@ class ObjectCenterParams(
use_labeled_classes=False, use_labeled_classes=False,
keypoint_weights_for_center=None, keypoint_weights_for_center=None,
center_head_num_filters=(256), center_head_num_filters=(256),
center_head_kernel_sizes=(3)): center_head_kernel_sizes=(3),
peak_max_pool_kernel_size=3):
"""Constructor with default values for ObjectCenterParams. """Constructor with default values for ObjectCenterParams.
Args: Args:
...@@ -2115,6 +2383,8 @@ class ObjectCenterParams( ...@@ -2115,6 +2383,8 @@ class ObjectCenterParams(
by the object center prediction head. by the object center prediction head.
center_head_kernel_sizes: kernel size of the convolutional layers used center_head_kernel_sizes: kernel size of the convolutional layers used
by the object center prediction head. by the object center prediction head.
peak_max_pool_kernel_size: Max pool kernel size to use to pull off peak
score locations in a neighborhood for the object detection heatmap.
Returns: Returns:
An initialized ObjectCenterParams namedtuple. An initialized ObjectCenterParams namedtuple.
""" """
...@@ -2123,7 +2393,8 @@ class ObjectCenterParams( ...@@ -2123,7 +2393,8 @@ class ObjectCenterParams(
object_center_loss_weight, heatmap_bias_init, object_center_loss_weight, heatmap_bias_init,
min_box_overlap_iou, max_box_predictions, min_box_overlap_iou, max_box_predictions,
use_labeled_classes, keypoint_weights_for_center, use_labeled_classes, keypoint_weights_for_center,
center_head_num_filters, center_head_kernel_sizes) center_head_num_filters, center_head_kernel_sizes,
peak_max_pool_kernel_size)
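peak_max_pool_kernel_size feeds top_k_feature_map_locations, which keeps only heatmap cells that equal the maximum of their local window before taking the top-k scores. A hedged sketch of that peak-picking idea (illustrative only, not the library function itself):

import tensorflow as tf

def local_peaks(heatmap, kernel_size=3):
  # Zero out every location that is not the maximum of its k x k neighborhood.
  pooled = tf.nn.max_pool2d(heatmap, ksize=kernel_size, strides=1, padding='SAME')
  return tf.where(tf.equal(heatmap, pooled), heatmap, tf.zeros_like(heatmap))

heatmap = tf.random.uniform([1, 32, 32, 1])
peaks = local_peaks(heatmap, kernel_size=3)
scores, flat_indices = tf.math.top_k(tf.reshape(peaks, [1, -1]), k=5)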
class MaskParams( class MaskParams(
...@@ -2627,16 +2898,12 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2627,16 +2898,12 @@ class CenterNetMetaArch(model.DetectionModel):
self.track_reid_classification_net = tf.keras.Sequential() self.track_reid_classification_net = tf.keras.Sequential()
for _ in range(self._track_params.num_fc_layers - 1): for _ in range(self._track_params.num_fc_layers - 1):
self.track_reid_classification_net.add( self.track_reid_classification_net.add(
tf.keras.layers.Dense(self._track_params.reid_embed_size, tf.keras.layers.Dense(self._track_params.reid_embed_size))
input_shape=(
self._track_params.reid_embed_size,)))
self.track_reid_classification_net.add( self.track_reid_classification_net.add(
tf.keras.layers.BatchNormalization()) tf.keras.layers.BatchNormalization())
self.track_reid_classification_net.add(tf.keras.layers.ReLU()) self.track_reid_classification_net.add(tf.keras.layers.ReLU())
self.track_reid_classification_net.add( self.track_reid_classification_net.add(
tf.keras.layers.Dense(self._track_params.num_track_ids, tf.keras.layers.Dense(self._track_params.num_track_ids))
input_shape=(
self._track_params.reid_embed_size,)))
if self._temporal_offset_params is not None: if self._temporal_offset_params is not None:
prediction_heads[TEMPORAL_OFFSET] = self._make_prediction_net_list( prediction_heads[TEMPORAL_OFFSET] = self._make_prediction_net_list(
num_feature_outputs, NUM_OFFSET_CHANNELS, name='temporal_offset', num_feature_outputs, NUM_OFFSET_CHANNELS, name='temporal_offset',
...@@ -2714,7 +2981,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2714,7 +2981,8 @@ class CenterNetMetaArch(model.DetectionModel):
return target_assigners return target_assigners
def _compute_object_center_loss(self, input_height, input_width, def _compute_object_center_loss(self, input_height, input_width,
object_center_predictions, per_pixel_weights): object_center_predictions, per_pixel_weights,
maximum_normalized_coordinate=1.1):
"""Computes the object center loss. """Computes the object center loss.
Args: Args:
...@@ -2726,6 +2994,9 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2726,6 +2994,9 @@ class CenterNetMetaArch(model.DetectionModel):
per_pixel_weights: A float tensor of shape [batch_size, per_pixel_weights: A float tensor of shape [batch_size,
out_height * out_width, 1] with 1s in locations where the spatial out_height * out_width, 1] with 1s in locations where the spatial
coordinates fall within the height and width in true_image_shapes. coordinates fall within the height and width in true_image_shapes.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns: Returns:
A float scalar tensor representing the object center loss per instance. A float scalar tensor representing the object center loss per instance.
...@@ -2752,7 +3023,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2752,7 +3023,8 @@ class CenterNetMetaArch(model.DetectionModel):
width=input_width, width=input_width,
gt_classes_list=gt_classes_list, gt_classes_list=gt_classes_list,
gt_keypoints_list=gt_keypoints_list, gt_keypoints_list=gt_keypoints_list,
gt_weights_list=gt_weights_list) gt_weights_list=gt_weights_list,
maximum_normalized_coordinate=maximum_normalized_coordinate)
else: else:
gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes) gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
heatmap_targets = assigner.assign_center_targets_from_boxes( heatmap_targets = assigner.assign_center_targets_from_boxes(
...@@ -2760,7 +3032,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2760,7 +3032,8 @@ class CenterNetMetaArch(model.DetectionModel):
width=input_width, width=input_width,
gt_boxes_list=gt_boxes_list, gt_boxes_list=gt_boxes_list,
gt_classes_list=gt_classes_list, gt_classes_list=gt_classes_list,
gt_weights_list=gt_weights_list) gt_weights_list=gt_weights_list,
maximum_normalized_coordinate=maximum_normalized_coordinate)
flattened_heatmap_targets = _flatten_spatial_dimensions(heatmap_targets) flattened_heatmap_targets = _flatten_spatial_dimensions(heatmap_targets)
num_boxes = _to_float32(get_num_instances_from_weights(gt_weights_list)) num_boxes = _to_float32(get_num_instances_from_weights(gt_weights_list))
...@@ -3577,7 +3850,9 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3577,7 +3850,9 @@ class CenterNetMetaArch(model.DetectionModel):
self._batched_prediction_tensor_names = predictions.keys() self._batched_prediction_tensor_names = predictions.keys()
return predictions return predictions
def loss(self, prediction_dict, true_image_shapes, scope=None): def loss(
self, prediction_dict, true_image_shapes, scope=None,
maximum_normalized_coordinate=1.1):
"""Computes scalar loss tensors with respect to provided groundtruth. """Computes scalar loss tensors with respect to provided groundtruth.
This function implements the various CenterNet losses. This function implements the various CenterNet losses.
...@@ -3589,6 +3864,9 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3589,6 +3864,9 @@ class CenterNetMetaArch(model.DetectionModel):
the form [height, width, channels] indicating the shapes of true images the form [height, width, channels] indicating the shapes of true images
in the resized images, as resized images can be padded with zeros. in the resized images, as resized images can be padded with zeros.
scope: Optional scope name. scope: Optional scope name.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns: Returns:
A dictionary mapping the keys [ A dictionary mapping the keys [
...@@ -3616,7 +3894,7 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3616,7 +3894,7 @@ class CenterNetMetaArch(model.DetectionModel):
# TODO(vighneshb) Explore whether using floor here is safe. # TODO(vighneshb) Explore whether using floor here is safe.
output_true_image_shapes = tf.ceil( output_true_image_shapes = tf.ceil(
tf.to_float(true_image_shapes) / self._stride) tf.cast(true_image_shapes, tf.float32) / self._stride)
valid_anchor_weights = get_valid_anchor_weights_in_flattened_image( valid_anchor_weights = get_valid_anchor_weights_in_flattened_image(
output_true_image_shapes, output_height, output_width) output_true_image_shapes, output_height, output_width)
valid_anchor_weights = tf.expand_dims(valid_anchor_weights, 2) valid_anchor_weights = tf.expand_dims(valid_anchor_weights, 2)
...@@ -3625,7 +3903,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3625,7 +3903,8 @@ class CenterNetMetaArch(model.DetectionModel):
object_center_predictions=prediction_dict[OBJECT_CENTER], object_center_predictions=prediction_dict[OBJECT_CENTER],
input_height=input_height, input_height=input_height,
input_width=input_width, input_width=input_width,
per_pixel_weights=valid_anchor_weights) per_pixel_weights=valid_anchor_weights,
maximum_normalized_coordinate=maximum_normalized_coordinate)
losses = { losses = {
OBJECT_CENTER: OBJECT_CENTER:
self._center_params.object_center_loss_weight * object_center_loss self._center_params.object_center_loss_weight * object_center_loss
...@@ -3742,21 +4021,32 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3742,21 +4021,32 @@ class CenterNetMetaArch(model.DetectionModel):
""" """
object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1]) object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
# Mask object centers by true_image_shape. [batch, h, w, 1] if true_image_shapes is None:
object_center_mask = mask_from_true_image_shape( # If true_image_shapes is not provided, we assume the whole image is valid
_get_shape(object_center_prob, 4), true_image_shapes) # and infer the true_image_shapes from the object_center_prob shape.
object_center_prob *= object_center_mask batch_size, strided_height, strided_width, _ = _get_shape(
object_center_prob, 4)
true_image_shapes = tf.stack(
[strided_height * self._stride, strided_width * self._stride,
tf.constant(len(self._feature_extractor._channel_means))]) # pylint: disable=protected-access
true_image_shapes = tf.stack([true_image_shapes] * batch_size, axis=0)
else:
# Mask object centers by true_image_shape. [batch, h, w, 1]
object_center_mask = mask_from_true_image_shape(
_get_shape(object_center_prob, 4), true_image_shapes)
object_center_prob *= object_center_mask
# Get x, y and channel indices corresponding to the top indices in the class # Get x, y and channel indices corresponding to the top indices in the class
# center predictions. # center predictions.
detection_scores, y_indices, x_indices, channel_indices = ( detection_scores, y_indices, x_indices, channel_indices = (
top_k_feature_map_locations( top_k_feature_map_locations(
object_center_prob, max_pool_kernel_size=3, object_center_prob,
max_pool_kernel_size=self._center_params.peak_max_pool_kernel_size,
k=self._center_params.max_box_predictions)) k=self._center_params.max_box_predictions))
multiclass_scores = tf.gather_nd( multiclass_scores = tf.gather_nd(
object_center_prob, tf.stack([y_indices, x_indices], -1), batch_dims=1) object_center_prob, tf.stack([y_indices, x_indices], -1), batch_dims=1)
num_detections = tf.reduce_sum(
num_detections = tf.reduce_sum(tf.to_int32(detection_scores > 0), axis=1) tf.cast(detection_scores > 0, tf.int32), axis=1)
postprocess_dict = { postprocess_dict = {
fields.DetectionResultFields.detection_scores: detection_scores, fields.DetectionResultFields.detection_scores: detection_scores,
fields.DetectionResultFields.detection_multiclass_scores: fields.DetectionResultFields.detection_multiclass_scores:
...@@ -3786,10 +4076,22 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3786,10 +4076,22 @@ class CenterNetMetaArch(model.DetectionModel):
# the ops that are supported by tf.lite on GPU. # the ops that are supported by tf.lite on GPU.
clip_keypoints = self._should_clip_keypoints() clip_keypoints = self._should_clip_keypoints()
if len(self._kp_params_dict) == 1 and self._num_classes == 1: if len(self._kp_params_dict) == 1 and self._num_classes == 1:
(keypoints, keypoint_scores, task_name, kp_params = next(iter(self._kp_params_dict.items()))
keypoint_depths) = self._postprocess_keypoints_single_class( keypoint_depths = None
prediction_dict, channel_indices, y_indices, x_indices, if kp_params.argmax_postprocessing:
boxes_strided, num_detections) keypoints, keypoint_scores = (
prediction_to_keypoints_argmax(
prediction_dict,
object_y_indices=y_indices,
object_x_indices=x_indices,
boxes=boxes_strided,
task_name=task_name,
kp_params=kp_params))
else:
(keypoints, keypoint_scores,
keypoint_depths) = self._postprocess_keypoints_single_class(
prediction_dict, channel_indices, y_indices, x_indices,
boxes_strided, num_detections)
keypoints, keypoint_scores = ( keypoints, keypoint_scores = (
convert_strided_predictions_to_normalized_keypoints( convert_strided_predictions_to_normalized_keypoints(
keypoints, keypoint_scores, self._stride, true_image_shapes, keypoints, keypoint_scores, self._stride, true_image_shapes,
...@@ -4073,9 +4375,13 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -4073,9 +4375,13 @@ class CenterNetMetaArch(model.DetectionModel):
kpt_coords_for_example_list = [] kpt_coords_for_example_list = []
kpt_scores_for_example_list = [] kpt_scores_for_example_list = []
for ex_ind in range(batch_size): for ex_ind in range(batch_size):
kpt_coords_for_class_list = [] # The tensors that host the keypoint coordinates and scores for all
kpt_scores_for_class_list = [] # instances and all keypoints. They will be updated by scatter_nd_add for
instance_inds_for_class_list = [] # each keypoint task.
kpt_coords_for_example_all_det = tf.zeros(
[max_detections, total_num_keypoints, 2])
kpt_scores_for_example_all_det = tf.zeros(
[max_detections, total_num_keypoints])
for task_name, kp_params in self._kp_params_dict.items(): for task_name, kp_params in self._kp_params_dict.items():
keypoint_heatmap = prediction_dict[ keypoint_heatmap = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1] get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]
...@@ -4085,77 +4391,62 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -4085,77 +4391,62 @@ class CenterNetMetaArch(model.DetectionModel):
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1] get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
instance_inds = self._get_instance_indices( instance_inds = self._get_instance_indices(
classes, num_detections, ex_ind, kp_params.class_id) classes, num_detections, ex_ind, kp_params.class_id)
num_ind = _get_shape(instance_inds, 1)
# Gather the feature map locations corresponding to the object class.
def true_fn(keypoint_heatmap, keypoint_offsets, keypoint_regression, y_indices_for_kpt_class = tf.gather(y_indices, instance_inds, axis=1)
classes, y_indices, x_indices, boxes, instance_inds, ex_ind, x_indices_for_kpt_class = tf.gather(x_indices, instance_inds, axis=1)
kp_params): if boxes is None:
"""Logics to execute when instance_inds is not an empty set.""" boxes_for_kpt_class = None
# Gather the feature map locations corresponding to the object class. else:
y_indices_for_kpt_class = tf.gather(y_indices, instance_inds, axis=1) boxes_for_kpt_class = tf.gather(boxes, instance_inds, axis=1)
x_indices_for_kpt_class = tf.gather(x_indices, instance_inds, axis=1)
if boxes is None: # Postprocess keypoints and scores for class and single image. Shapes
boxes_for_kpt_class = None # are [1, num_instances_i, num_keypoints_i, 2] and
else: # [1, num_instances_i, num_keypoints_i], respectively. Note that
boxes_for_kpt_class = tf.gather(boxes, instance_inds, axis=1) # num_instances_i and num_keypoints_i refers to the number of
# instances and keypoints for class i, respectively.
# Postprocess keypoints and scores for class and single image. Shapes (kpt_coords_for_class, kpt_scores_for_class, _) = (
# are [1, num_instances_i, num_keypoints_i, 2] and self._postprocess_keypoints_for_class_and_image(
# [1, num_instances_i, num_keypoints_i], respectively. Note that keypoint_heatmap,
# num_instances_i and num_keypoints_i refers to the number of keypoint_offsets,
# instances and keypoints for class i, respectively. keypoint_regression,
(kpt_coords_for_class, kpt_scores_for_class, _) = ( classes,
self._postprocess_keypoints_for_class_and_image( y_indices_for_kpt_class,
keypoint_heatmap, x_indices_for_kpt_class,
keypoint_offsets, boxes_for_kpt_class,
keypoint_regression, ex_ind,
classes, kp_params,
y_indices_for_kpt_class, ))
x_indices_for_kpt_class,
boxes_for_kpt_class, # Prepare the indices for scatter_nd. The resulting combined_inds has
ex_ind, # the shape of [num_instances_i * num_keypoints_i, 2], where the first
kp_params, # column corresponds to the instance IDs and the second column
)) # corresponds to the keypoint IDs.
kpt_inds = tf.constant(kp_params.keypoint_indices, dtype=tf.int32)
# Expand keypoint dimension (with padding) so that coordinates and kpt_inds = tf.expand_dims(kpt_inds, axis=0)
# scores have shape [1, num_instances_i, num_total_keypoints, 2] and instance_inds_expand = tf.expand_dims(instance_inds, axis=-1)
# [1, num_instances_i, num_total_keypoints], respectively. kpt_inds_expand = kpt_inds * tf.ones_like(instance_inds_expand)
kpts_coords_for_class_padded, kpt_scores_for_class_padded = ( instance_inds_expand = instance_inds_expand * tf.ones_like(kpt_inds)
_pad_to_full_keypoint_dim(kpt_coords_for_class, combined_inds = tf.stack(
kpt_scores_for_class, [instance_inds_expand, kpt_inds_expand], axis=2)
kp_params.keypoint_indices, combined_inds = tf.reshape(combined_inds, [-1, 2])
total_num_keypoints))
return kpts_coords_for_class_padded, kpt_scores_for_class_padded # Reshape the keypoint coordinates/scores to [num_instances_i *
# num_keypoints_i, 2]/[num_instances_i * num_keypoints_i] to be used
def false_fn(): # by scatter_nd_add.
"""Logics to execute when the instance_inds is an empty set.""" kpt_coords_for_class = tf.reshape(kpt_coords_for_class, [-1, 2])
return (tf.zeros([1, 0, total_num_keypoints, 2], dtype=tf.float32), kpt_scores_for_class = tf.reshape(kpt_scores_for_class, [-1])
tf.zeros([1, 0, total_num_keypoints], dtype=tf.float32)) kpt_coords_for_example_all_det = tf.tensor_scatter_nd_add(
kpt_coords_for_example_all_det,
true_fn = functools.partial( combined_inds, kpt_coords_for_class)
true_fn, keypoint_heatmap, keypoint_offsets, keypoint_regression, kpt_scores_for_example_all_det = tf.tensor_scatter_nd_add(
classes, y_indices, x_indices, boxes, instance_inds, ex_ind, kpt_scores_for_example_all_det,
kp_params) combined_inds, kpt_scores_for_class)
# Use dimension values instead of tf.size for tf.lite compatibility.
results = tf.cond(num_ind[0] > 0, true_fn, false_fn) kpt_coords_for_example_list.append(
tf.expand_dims(kpt_coords_for_example_all_det, axis=0))
kpt_coords_for_class_list.append(results[0]) kpt_scores_for_example_list.append(
kpt_scores_for_class_list.append(results[1]) tf.expand_dims(kpt_scores_for_example_all_det, axis=0))
instance_inds_for_class_list.append(instance_inds)
# Concatenate all keypoints across all classes (single example).
kpt_coords_for_example = tf.concat(kpt_coords_for_class_list, axis=1)
kpt_scores_for_example = tf.concat(kpt_scores_for_class_list, axis=1)
instance_inds_for_example = tf.concat(instance_inds_for_class_list,
axis=0)
(kpt_coords_for_example_all_det,
kpt_scores_for_example_all_det) = self._scatter_keypoints_to_batch(
num_ind, kpt_coords_for_example, kpt_scores_for_example,
instance_inds_for_example, max_detections, total_num_keypoints)
kpt_coords_for_example_list.append(kpt_coords_for_example_all_det)
kpt_scores_for_example_list.append(kpt_scores_for_example_all_det)
# Concatenate all keypoints and scores from all examples in the batch. # Concatenate all keypoints and scores from all examples in the batch.
# Shapes are [batch_size, max_detections, num_total_keypoints, 2] and # Shapes are [batch_size, max_detections, num_total_keypoints, 2] and
......
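The loop above now accumulates every task's keypoints into fixed-size tensors with tf.tensor_scatter_nd_add instead of concatenating per-class lists inside tf.cond branches. A small standalone sketch of that scatter pattern (toy sizes; tf.meshgrid is used here in place of the broadcasted ones_like construction):

import tensorflow as tf

max_detections, total_num_keypoints = 4, 5
scores_all = tf.zeros([max_detections, total_num_keypoints])

instance_inds = tf.constant([0, 2])  # detections belonging to this task's class
kpt_inds = tf.constant([1, 3])       # keypoint slots owned by this task

# Every (instance, keypoint) pair: shape [num_instances * num_task_kpts, 2].
ii, kk = tf.meshgrid(instance_inds, kpt_inds, indexing='ij')
combined_inds = tf.stack([tf.reshape(ii, [-1]), tf.reshape(kk, [-1])], axis=1)

task_scores = tf.constant([[0.9, 0.8], [0.7, 0.6]])  # [instances, task kpts]
scores_all = tf.tensor_scatter_nd_add(
    scores_all, combined_inds, tf.reshape(task_scores, [-1]))
# Rows 0 and 2 now hold the scores in columns 1 and 3; all other entries stay 0.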
...@@ -807,6 +807,77 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase): ...@@ -807,6 +807,77 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands) np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands)
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores) np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
@parameterized.parameters({'provide_keypoint_score': True},
{'provide_keypoint_score': False})
def test_prediction_to_multi_instance_keypoints(self, provide_keypoint_score):
image_size = (9, 9)
keypoint_heatmap_np = np.zeros((1, image_size[0], image_size[1], 3, 4),
dtype=np.float32)
# Instance 0.
keypoint_heatmap_np[0, 1, 1, 0, 0] = 0.9
keypoint_heatmap_np[0, 1, 7, 0, 1] = 0.9
keypoint_heatmap_np[0, 7, 1, 0, 2] = 0.9
keypoint_heatmap_np[0, 7, 7, 0, 3] = 0.9
# Instance 1.
keypoint_heatmap_np[0, 2, 2, 1, 0] = 0.8
keypoint_heatmap_np[0, 2, 8, 1, 1] = 0.8
keypoint_heatmap_np[0, 8, 2, 1, 2] = 0.8
keypoint_heatmap_np[0, 8, 8, 1, 3] = 0.8
keypoint_offset_np = np.zeros((1, image_size[0], image_size[1], 8),
dtype=np.float32)
keypoint_offset_np[0, 1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[0, 7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[0, 2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[0, 8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_offset = tf.constant(keypoint_offset_np, dtype=tf.float32)
if provide_keypoint_score:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset,
tf.reduce_max(keypoint_heatmap, axis=3)))
else:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset))
return keypoint_cands, keypoint_scores
(keypoint_cands, keypoint_scores) = self.execute(graph_fn, [])
expected_keypoint_candidates_0 = [
[1.5, 1.5], # top-left
[1.5, 6.5], # top-right
[6.5, 1.5], # bottom-left
[6.5, 6.5], # bottom-right
]
expected_keypoint_scores_0 = [0.9, 0.9, 0.9, 0.9]
expected_keypoint_candidates_1 = [
[2.3, 2.3], # top-left
[2.3, 7.7], # top-right
[7.7, 2.3], # bottom-left
[7.7, 7.7], # bottom-right
]
expected_keypoint_scores_1 = [0.8, 0.8, 0.8, 0.8]
np.testing.assert_allclose(
expected_keypoint_candidates_0, keypoint_cands[0, 0, :, :])
np.testing.assert_allclose(
expected_keypoint_candidates_1, keypoint_cands[0, 1, :, :])
np.testing.assert_allclose(
expected_keypoint_scores_0, keypoint_scores[0, 0, :])
np.testing.assert_allclose(
expected_keypoint_scores_1, keypoint_scores[0, 1, :])
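# In other words, the expected candidates are the integer heatmap peaks shifted
# by their offsets: peak (1, 1) + offset (0.5, 0.5) -> (1.5, 1.5) for instance 0,
# and peak (2, 2) + offset (0.3, 0.3) -> (2.3, 2.3) for instance 1.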
def test_keypoint_candidate_prediction_per_keypoints(self): def test_keypoint_candidate_prediction_per_keypoints(self):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32) keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0 keypoint_heatmap_np[0, 0, 0, 0] = 1.0
...@@ -1644,7 +1715,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100, ...@@ -1644,7 +1715,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100,
predict_depth=False, predict_depth=False,
per_keypoint_depth=False, per_keypoint_depth=False,
peak_radius=0, peak_radius=0,
candidate_ranking_mode='min_distance'): candidate_ranking_mode='min_distance',
argmax_postprocessing=False):
"""Returns the fake keypoint estimation parameter namedtuple.""" """Returns the fake keypoint estimation parameter namedtuple."""
return cnma.KeypointEstimationParams( return cnma.KeypointEstimationParams(
task_name=_TASK_NAME, task_name=_TASK_NAME,
...@@ -1660,7 +1732,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100, ...@@ -1660,7 +1732,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100,
predict_depth=predict_depth, predict_depth=predict_depth,
per_keypoint_depth=per_keypoint_depth, per_keypoint_depth=per_keypoint_depth,
offset_peak_radius=peak_radius, offset_peak_radius=peak_radius,
candidate_ranking_mode=candidate_ranking_mode) candidate_ranking_mode=candidate_ranking_mode,
argmax_postprocessing=argmax_postprocessing)
def get_fake_mask_params(): def get_fake_mask_params():
...@@ -1715,7 +1788,8 @@ def build_center_net_meta_arch(build_resnet=False, ...@@ -1715,7 +1788,8 @@ def build_center_net_meta_arch(build_resnet=False,
per_keypoint_depth=False, per_keypoint_depth=False,
peak_radius=0, peak_radius=0,
keypoint_only=False, keypoint_only=False,
candidate_ranking_mode='min_distance'): candidate_ranking_mode='min_distance',
argmax_postprocessing=False):
"""Builds the CenterNet meta architecture.""" """Builds the CenterNet meta architecture."""
if build_resnet: if build_resnet:
feature_extractor = ( feature_extractor = (
...@@ -1762,7 +1836,8 @@ def build_center_net_meta_arch(build_resnet=False, ...@@ -1762,7 +1836,8 @@ def build_center_net_meta_arch(build_resnet=False,
get_fake_kp_params(num_candidates_per_keypoint, get_fake_kp_params(num_candidates_per_keypoint,
per_keypoint_offset, predict_depth, per_keypoint_offset, predict_depth,
per_keypoint_depth, peak_radius, per_keypoint_depth, peak_radius,
candidate_ranking_mode) candidate_ranking_mode,
argmax_postprocessing)
}, },
non_max_suppression_fn=non_max_suppression_fn) non_max_suppression_fn=non_max_suppression_fn)
elif detection_only: elif detection_only:
...@@ -1790,7 +1865,8 @@ def build_center_net_meta_arch(build_resnet=False, ...@@ -1790,7 +1865,8 @@ def build_center_net_meta_arch(build_resnet=False,
get_fake_kp_params(num_candidates_per_keypoint, get_fake_kp_params(num_candidates_per_keypoint,
per_keypoint_offset, predict_depth, per_keypoint_offset, predict_depth,
per_keypoint_depth, peak_radius, per_keypoint_depth, peak_radius,
candidate_ranking_mode) candidate_ranking_mode,
argmax_postprocessing)
}, },
non_max_suppression_fn=non_max_suppression_fn) non_max_suppression_fn=non_max_suppression_fn)
else: else:
...@@ -2056,10 +2132,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2056,10 +2132,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
cnma.TEMPORAL_OFFSET)]) cnma.TEMPORAL_OFFSET)])
@parameterized.parameters( @parameterized.parameters(
{'target_class_id': 1}, {'target_class_id': 1, 'with_true_image_shape': True},
{'target_class_id': 2}, {'target_class_id': 2, 'with_true_image_shape': True},
{'target_class_id': 1, 'with_true_image_shape': False},
) )
def test_postprocess(self, target_class_id): def test_postprocess(self, target_class_id, with_true_image_shape):
"""Test the postprocess function.""" """Test the postprocess function."""
model = build_center_net_meta_arch() model = build_center_net_meta_arch()
max_detection = model._center_params.max_box_predictions max_detection = model._center_params.max_box_predictions
...@@ -2140,8 +2217,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2140,8 +2217,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
} }
def graph_fn(): def graph_fn():
detections = model.postprocess(prediction_dict, if with_true_image_shape:
tf.constant([[128, 128, 3]])) detections = model.postprocess(prediction_dict,
tf.constant([[128, 128, 3]]))
else:
detections = model.postprocess(prediction_dict, None)
return detections return detections
detections = self.execute_cpu(graph_fn, []) detections = self.execute_cpu(graph_fn, [])
...@@ -2320,17 +2400,32 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2320,17 +2400,32 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllClose(expected_multiclass_scores, self.assertAllClose(expected_multiclass_scores,
detections['detection_multiclass_scores'][0][0]) detections['detection_multiclass_scores'][0][0])
def test_postprocess_single_class(self): @parameterized.parameters(
{
'candidate_ranking_mode': 'min_distance',
'argmax_postprocessing': False
},
{
'candidate_ranking_mode': 'gaussian_weighted_const',
'argmax_postprocessing': True
})
def test_postprocess_single_class(self, candidate_ranking_mode,
argmax_postprocessing):
"""Test the postprocess function.""" """Test the postprocess function."""
model = build_center_net_meta_arch(num_classes=1) model = build_center_net_meta_arch(
num_classes=1, max_box_predictions=5, per_keypoint_offset=True,
candidate_ranking_mode=candidate_ranking_mode,
argmax_postprocessing=argmax_postprocessing)
max_detection = model._center_params.max_box_predictions max_detection = model._center_params.max_box_predictions
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices) num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 1), dtype=np.float32) class_center = np.zeros((1, 32, 32, 1), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32) height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32) offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32) keypoint_heatmaps = np.ones(
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32) (1, 32, 32, num_keypoints), dtype=np.float32) * _logit(0.01)
keypoint_offsets = np.zeros(
(1, 32, 32, num_keypoints * 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2) keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(1) class_probs = np.zeros(1)
...@@ -2383,6 +2478,9 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2383,6 +2478,9 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertEqual(detections['num_detections'], [5]) self.assertEqual(detections['num_detections'], [5])
self.assertAllEqual([1, max_detection, num_keypoints, 2], self.assertAllEqual([1, max_detection, num_keypoints, 2],
detections['detection_keypoints'].shape) detections['detection_keypoints'].shape)
self.assertAllClose(
[[0.4375, 0.4375], [0.4375, 0.5625], [0.5625, 0.4375]],
detections['detection_keypoints'][0, 0, 0:3, :])
self.assertAllEqual([1, max_detection, num_keypoints], self.assertAllEqual([1, max_detection, num_keypoints],
detections['detection_keypoint_scores'].shape) detections['detection_keypoint_scores'].shape)
......
...@@ -36,7 +36,8 @@ class DeepMACParams( ...@@ -36,7 +36,8 @@ class DeepMACParams(
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples', 'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels', 'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size', 'predict_full_resolution_masks', 'postprocess_crop_size',
'max_roi_jitter_ratio', 'roi_jitter_mode', 'box_consistency_loss_weight' 'max_roi_jitter_ratio', 'roi_jitter_mode',
'box_consistency_loss_weight',
])): ])):
"""Class holding the DeepMAC network configutration.""" """Class holding the DeepMAC network configutration."""
...@@ -125,6 +126,9 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None): ...@@ -125,6 +126,9 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None):
raise ValueError('Mask size must be set.') raise ValueError('Mask size must be set.')
return FullyConnectedMaskHead(num_init_channels, mask_size) return FullyConnectedMaskHead(num_init_channels, mask_size)
elif name == 'embedding_projection':
return tf.keras.layers.Lambda(lambda x: x)
elif name.startswith('resnet'): elif name.startswith('resnet'):
return ResNetMaskNetwork(name, num_init_channels) return ResNetMaskNetwork(name, num_init_channels)
...@@ -262,6 +266,24 @@ def fill_boxes(boxes, height, width): ...@@ -262,6 +266,24 @@ def fill_boxes(boxes, height, width):
return tf.cast(filled_boxes, tf.float32) return tf.cast(filled_boxes, tf.float32)
def embedding_projection(x, y):
"""Compute dot product between two given embeddings.
Args:
x: [num_instances, height, width, dimension] float tensor input.
y: [num_instances, height, width, dimension] or
[num_instances, 1, 1, dimension] float tensor input. When the height
and width dimensions are 1, TF will broadcast it.
Returns:
dist: A float tensor of shape [num_instances, height, width, 1] containing
the per-pixel embedding projection (dot product).
"""
dot = tf.reduce_sum(x * y, axis=3, keepdims=True)
return dot
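Because y may have shape [num_instances, 1, 1, dimension], the multiply inside embedding_projection broadcasts a single per-instance embedding across every pixel before the channel-wise sum. A quick usage sketch under that assumption:

import tensorflow as tf

pixel_emb = tf.random.normal([3, 16, 16, 8])    # per-pixel embeddings
instance_emb = tf.random.normal([3, 1, 1, 8])   # one embedding per instance

# Broadcasts instance_emb over height/width, then sums over the channel axis.
dot = tf.reduce_sum(pixel_emb * instance_emb, axis=3, keepdims=True)  # [3, 16, 16, 1]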
class ResNetMaskNetwork(tf.keras.layers.Layer): class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks.""" """A small wrapper around ResNet blocks to predict masks."""
...@@ -341,6 +363,92 @@ class FullyConnectedMaskHead(tf.keras.layers.Layer): ...@@ -341,6 +363,92 @@ class FullyConnectedMaskHead(tf.keras.layers.Layer):
[num_instances, self.mask_size, self.mask_size, 1]) [num_instances, self.mask_size, self.mask_size, 1])
class DenseResidualBlock(tf.keras.layers.Layer):
"""Residual block for 1D inputs.
This class implements the pre-activation version of the ResNet block.
"""
def __init__(self, hidden_size, use_shortcut_linear):
"""Residual Block for 1D inputs.
Args:
hidden_size: size of the hidden layer.
use_shortcut_linear: bool, whether or not to use a linear layer for
shortcut.
"""
super(DenseResidualBlock, self).__init__()
self.bn_0 = tf.keras.layers.experimental.SyncBatchNormalization(axis=-1)
self.bn_1 = tf.keras.layers.experimental.SyncBatchNormalization(axis=-1)
self.fc_0 = tf.keras.layers.Dense(
hidden_size, activation=None)
self.fc_1 = tf.keras.layers.Dense(
hidden_size, activation=None, kernel_initializer='zeros')
self.activation = tf.keras.layers.Activation('relu')
if use_shortcut_linear:
self.shortcut = tf.keras.layers.Dense(
hidden_size, activation=None, use_bias=False)
else:
self.shortcut = tf.keras.layers.Lambda(lambda x: x)
def __call__(self, inputs):
"""Layer's forward pass.
Args:
inputs: input tensor.
Returns:
Tensor after residual block w/ CondBatchNorm.
"""
out = self.fc_0(self.activation(self.bn_0(inputs)))
residual_inp = self.fc_1(self.activation(self.bn_1(out)))
skip = self.shortcut(inputs)
return residual_inp + skip
class DenseResNet(tf.keras.layers.Layer):
"""Resnet with dense layers."""
def __init__(self, num_layers, hidden_size, output_size):
"""Resnet with dense layers.
Args:
num_layers: int, the number of layers.
hidden_size: size of the hidden layer.
output_size: size of the output.
"""
super(DenseResNet, self).__init__()
self.input_proj = DenseResidualBlock(hidden_size, use_shortcut_linear=True)
if num_layers < 4:
raise ValueError(
'Cannot construct a DenseResNet with less than 4 layers')
num_blocks = (num_layers - 2) // 2
if ((num_blocks * 2) + 2) != num_layers:
raise ValueError(('DenseResNet depth has to be of the form (2n + 2). '
f'Found {num_layers}'))
self._num_blocks = num_blocks
blocks = [DenseResidualBlock(hidden_size, use_shortcut_linear=False)
for _ in range(num_blocks)]
self.resnet = tf.keras.Sequential(blocks)
self.out_conv = tf.keras.layers.Dense(output_size)
def __call__(self, inputs):
net = self.input_proj(inputs)
return self.out_conv(self.resnet(net))
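DenseResNet only accepts depths of the form 2n + 2 with num_layers >= 4 (so 4, 6, 8, ...); anything else raises a ValueError. A brief usage sketch, assuming the import path used by the tests below:

import tensorflow as tf
from object_detection.meta_architectures import deepmac_meta_arch  # assumed path

net = deepmac_meta_arch.DenseResNet(num_layers=8, hidden_size=16, output_size=4)
out = net(tf.zeros([2, 24]))  # any trailing feature size; blocks project to hidden_size
print(out.shape)              # (2, 4)
# deepmac_meta_arch.DenseResNet(7, 16, 4) would raise: depth has to be (2n + 2).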
class MaskHeadNetwork(tf.keras.layers.Layer): class MaskHeadNetwork(tf.keras.layers.Layer):
"""Mask head class for DeepMAC.""" """Mask head class for DeepMAC."""
...@@ -366,8 +474,18 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -366,8 +474,18 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
network_type, num_init_channels, mask_size) network_type, num_init_channels, mask_size)
self._use_instance_embedding = use_instance_embedding self._use_instance_embedding = use_instance_embedding
self.project_out = tf.keras.layers.Conv2D( self._network_type = network_type
filters=1, kernel_size=1, activation=None)
if (self._use_instance_embedding and
(self._network_type == 'embedding_projection')):
raise ValueError(('Cannot feed instance embedding to mask head when '
'computing embedding projection.'))
if network_type == 'embedding_projection':
self.project_out = tf.keras.layers.Lambda(lambda x: x)
else:
self.project_out = tf.keras.layers.Conv2D(
filters=1, kernel_size=1, activation=None)
def __call__(self, instance_embedding, pixel_embedding, training): def __call__(self, instance_embedding, pixel_embedding, training):
"""Returns mask logits given object center and spatial embeddings. """Returns mask logits given object center and spatial embeddings.
...@@ -388,10 +506,9 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -388,10 +506,9 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
height = tf.shape(pixel_embedding)[1] height = tf.shape(pixel_embedding)[1]
width = tf.shape(pixel_embedding)[2] width = tf.shape(pixel_embedding)[2]
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
instance_embedding = tf.tile(instance_embedding, [1, height, width, 1])
if self._use_instance_embedding: if self._use_instance_embedding:
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
instance_embedding = tf.tile(instance_embedding, [1, height, width, 1])
inputs = tf.concat([pixel_embedding, instance_embedding], axis=3) inputs = tf.concat([pixel_embedding, instance_embedding], axis=3)
else: else:
inputs = pixel_embedding inputs = pixel_embedding
...@@ -400,6 +517,10 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -400,6 +517,10 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
if isinstance(out, list): if isinstance(out, list):
out = out[-1] out = out[-1]
if self._network_type == 'embedding_projection':
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
out = embedding_projection(instance_embedding, out)
if out.shape[-1] > 1: if out.shape[-1] > 1:
out = self.project_out(out) out = self.project_out(out)
...@@ -466,6 +587,21 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -466,6 +587,21 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
if self._deepmac_params.mask_num_subsamples > 0: if self._deepmac_params.mask_num_subsamples > 0:
raise ValueError('Subsampling masks is currently not supported.') raise ValueError('Subsampling masks is currently not supported.')
if self._deepmac_params.network_type == 'embedding_projection':
if self._deepmac_params.use_xy:
raise ValueError(
'Cannot use x/y coordinates when using embedding projection.')
pixel_embedding_dim = self._deepmac_params.pixel_embedding_dim
dim = self._deepmac_params.dim
if dim != pixel_embedding_dim:
raise ValueError(
'When using embedding projection mask head, '
f'pixel_embedding_dim({pixel_embedding_dim}) '
f'must be same as dim({dim}).')
loss = self._deepmac_params.classification_loss
super(DeepMACMetaArch, self).__init__( super(DeepMACMetaArch, self).__init__(
is_training=is_training, add_summaries=add_summaries, is_training=is_training, add_summaries=add_summaries,
num_classes=num_classes, feature_extractor=feature_extractor, num_classes=num_classes, feature_extractor=feature_extractor,
......
...@@ -61,7 +61,10 @@ class MockMaskNet(tf.keras.layers.Layer): ...@@ -61,7 +61,10 @@ class MockMaskNet(tf.keras.layers.Layer):
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
mask_num_subsamples=-1): use_instance_embedding=True, mask_num_subsamples=-1,
network_type='hourglass10', use_xy=True,
pixel_embedding_dim=2,
dice_loss_prediction_probability=False):
"""Builds the DeepMAC meta architecture.""" """Builds the DeepMAC meta architecture."""
feature_extractor = DummyFeatureExtractor( feature_extractor = DummyFeatureExtractor(
...@@ -84,7 +87,9 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -84,7 +87,9 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
use_labeled_classes=False) use_labeled_classes=False)
if use_dice_loss: if use_dice_loss:
classification_loss = losses.WeightedDiceClassificationLoss(False) classification_loss = losses.WeightedDiceClassificationLoss(
squared_normalization=False,
is_prediction_probability=dice_loss_prediction_probability)
else: else:
classification_loss = losses.WeightedSigmoidClassificationLoss() classification_loss = losses.WeightedSigmoidClassificationLoss()
...@@ -92,13 +97,13 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -92,13 +97,13 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
classification_loss=classification_loss, classification_loss=classification_loss,
dim=8, dim=8,
task_loss_weight=1.0, task_loss_weight=1.0,
pixel_embedding_dim=2, pixel_embedding_dim=pixel_embedding_dim,
allowed_masked_classes_ids=[], allowed_masked_classes_ids=[],
mask_size=16, mask_size=16,
mask_num_subsamples=mask_num_subsamples, mask_num_subsamples=mask_num_subsamples,
use_xy=True, use_xy=use_xy,
network_type='hourglass10', network_type=network_type,
use_instance_embedding=True, use_instance_embedding=use_instance_embedding,
num_init_channels=8, num_init_channels=8,
predict_full_resolution_masks=predict_full_resolution_masks, predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128, postprocess_crop_size=128,
...@@ -125,7 +130,7 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -125,7 +130,7 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACUtilsTest(tf.test.TestCase): class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
def test_subsample_trivial(self): def test_subsample_trivial(self):
"""Test subsampling masks.""" """Test subsampling masks."""
...@@ -169,12 +174,41 @@ class DeepMACUtilsTest(tf.test.TestCase): ...@@ -169,12 +174,41 @@ class DeepMACUtilsTest(tf.test.TestCase):
features, boxes, 32) features, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32, 7)) self.assertEqual(output.shape, (5, 32, 32, 7))
def test_embedding_projection_prob_shape(self):
dist = deepmac_meta_arch.embedding_projection(
tf.ones((4, 32, 32, 8)), tf.zeros((4, 32, 32, 8)))
self.assertEqual(dist.shape, (4, 32, 32, 1))
@parameterized.parameters([1e-20, 1e20])
def test_embedding_projection_value(self, value):
dist = deepmac_meta_arch.embedding_projection(
tf.zeros((1, 1, 1, 8)), value + tf.zeros((1, 1, 1, 8))).numpy()
max_float = np.finfo(dist.dtype).max
self.assertLess(dist.max(), max_float)
self.assertGreater(dist.max(), -max_float)
@parameterized.named_parameters(
[('no_conv_shortcut', False),
('conv_shortcut', True)]
)
def test_res_dense_block(self, conv_shortcut):
net = deepmac_meta_arch.DenseResidualBlock(32, conv_shortcut)
out = net(tf.zeros((2, 32)))
self.assertEqual(out.shape, (2, 32))
@parameterized.parameters(
[4, 8, 20]
)
def test_dense_resnet(self, num_layers):
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') net = deepmac_meta_arch.DenseResNet(num_layers, 16, 8)
class DeepMACMetaArchTest(tf.test.TestCase): out = net(tf.zeros((2, 24)))
self.assertEqual(out.shape, (2, 8))
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch() @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
def test_mask_network(self): def test_mask_network(self):
net = deepmac_meta_arch.MaskHeadNetwork('hourglass10', 8) net = deepmac_meta_arch.MaskHeadNetwork('hourglass10', 8)
...@@ -203,6 +237,38 @@ class DeepMACMetaArchTest(tf.test.TestCase): ...@@ -203,6 +237,38 @@ class DeepMACMetaArchTest(tf.test.TestCase):
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True) out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32)) self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_embedding_projection_zero(self):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=8,
use_instance_embedding=False)
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 7)), tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
def test_mask_network_embedding_projection_small(self):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=-1,
use_instance_embedding=False)
call_func = tf.function(net.__call__)
out = call_func(1e6 + tf.zeros((2, 7)),
tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch()
def test_get_mask_head_input(self): def test_get_mask_head_input(self):
boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]], boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
...@@ -349,6 +415,36 @@ class DeepMACMetaArchTest(tf.test.TestCase): ...@@ -349,6 +415,36 @@ class DeepMACMetaArchTest(tf.test.TestCase):
prob = tf.nn.sigmoid(0.9).numpy() prob = tf.nn.sigmoid(0.9).numpy()
self.assertAllClose(masks, prob * np.ones((2, 3, 16, 16))) self.assertAllClose(masks, prob * np.ones((2, 3, 16, 16)))
def test_postprocess_emb_proj(self):
model = build_meta_arch(network_type='embedding_projection',
use_instance_embedding=False,
use_xy=False, pixel_embedding_dim=8,
use_dice_loss=True,
dice_loss_prediction_probability=True)
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes[:, :, [0, 2]] = 0.0
boxes[:, :, [1, 3]] = 8.0
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
self.assertEqual(masks.shape, (2, 3, 16, 16))
def test_postprocess_emb_proj_fullres(self):
model = build_meta_arch(network_type='embedding_projection',
predict_full_resolution_masks=True,
use_instance_embedding=False,
pixel_embedding_dim=8, use_xy=False,
use_dice_loss=True)
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
self.assertEqual(masks.shape, (2, 3, 128, 128))
def test_postprocess_no_crop_resize_shape(self): def test_postprocess_no_crop_resize_shape(self):
model = build_meta_arch(predict_full_resolution_masks=True) model = build_meta_arch(predict_full_resolution_masks=True)
...@@ -494,7 +590,7 @@ class FullyConnectedMaskHeadTest(tf.test.TestCase): ...@@ -494,7 +590,7 @@ class FullyConnectedMaskHeadTest(tf.test.TestCase):
class ResNetMaskHeadTest(tf.test.TestCase, parameterized.TestCase): class ResNetMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(['resnet4', 'resnet8', 'resnet20']) @parameterized.parameters(['resnet4', 'resnet8', 'resnet20'])
def test_pass(self, name): def test_forward(self, name):
net = deepmac_meta_arch.ResNetMaskNetwork(name, 8) net = deepmac_meta_arch.ResNetMaskNetwork(name, 8)
out = net(tf.zeros((3, 32, 32, 16))) out = net(tf.zeros((3, 32, 32, 16)))
self.assertEqual(out.shape[:3], (3, 32, 32)) self.assertEqual(out.shape[:3], (3, 32, 32))
......
...@@ -21,11 +21,7 @@ REQUIRED_PACKAGES = [ ...@@ -21,11 +21,7 @@ REQUIRED_PACKAGES = [
'lvis', 'lvis',
'scipy', 'scipy',
'pandas', 'pandas',
# tensorflow 2.5.0 requires grpcio~=1.34.0. 'tf-models-official>=2.5.1',
# tf-models-official (which requires google-could-bigquery) ends
# up installing the latest grpcio which causes problems later.
'google-cloud-bigquery==1.21.0',
'tf-models-official',
] ]
setup( setup(
......