Unverified Commit 0225b135 authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub
Browse files

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 7479dbb8 4c571a3c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory methods to build models."""
from typing import Optional
import tensorflow as tf
from official.vision.configs import image_classification as classification_cfg
from official.vision.configs import maskrcnn as maskrcnn_cfg
from official.vision.configs import retinanet as retinanet_cfg
from official.vision.configs import semantic_segmentation as segmentation_cfg
from official.vision.modeling import backbones
from official.vision.modeling import classification_model
from official.vision.modeling import decoders
from official.vision.modeling import maskrcnn_model
from official.vision.modeling import retinanet_model
from official.vision.modeling import segmentation_model
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.heads import instance_heads
from official.vision.modeling.heads import segmentation_heads
from official.vision.modeling.layers import detection_generator
from official.vision.modeling.layers import mask_sampler
from official.vision.modeling.layers import roi_aligner
from official.vision.modeling.layers import roi_generator
from official.vision.modeling.layers import roi_sampler
def build_classification_model(
input_specs: tf.keras.layers.InputSpec,
model_config: classification_cfg.ImageClassificationModel,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
skip_logits_layer: bool = False,
backbone: Optional[tf.keras.Model] = None) -> tf.keras.Model:
"""Builds the classification model."""
norm_activation_config = model_config.norm_activation
if not backbone:
backbone = backbones.factory.build_backbone(
input_specs=input_specs,
backbone_config=model_config.backbone,
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
model = classification_model.ClassificationModel(
backbone=backbone,
num_classes=model_config.num_classes,
input_specs=input_specs,
dropout_rate=model_config.dropout_rate,
kernel_initializer=model_config.kernel_initializer,
kernel_regularizer=l2_regularizer,
add_head_batch_norm=model_config.add_head_batch_norm,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
skip_logits_layer=skip_logits_layer)
return model
def build_maskrcnn(input_specs: tf.keras.layers.InputSpec,
model_config: maskrcnn_cfg.MaskRCNN,
l2_regularizer: Optional[
tf.keras.regularizers.Regularizer] = None,
backbone: Optional[tf.keras.Model] = None,
decoder: Optional[tf.keras.Model] = None) -> tf.keras.Model:
"""Builds Mask R-CNN model."""
norm_activation_config = model_config.norm_activation
if not backbone:
backbone = backbones.factory.build_backbone(
input_specs=input_specs,
backbone_config=model_config.backbone,
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
backbone_features = backbone(tf.keras.Input(input_specs.shape[1:]))
if not decoder:
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
rpn_head_config = model_config.rpn_head
roi_generator_config = model_config.roi_generator
roi_sampler_config = model_config.roi_sampler
roi_aligner_config = model_config.roi_aligner
detection_head_config = model_config.detection_head
generator_config = model_config.detection_generator
num_anchors_per_location = (
len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales)
rpn_head = dense_prediction_heads.RPNHead(
min_level=model_config.min_level,
max_level=model_config.max_level,
num_anchors_per_location=num_anchors_per_location,
num_convs=rpn_head_config.num_convs,
num_filters=rpn_head_config.num_filters,
use_separable_conv=rpn_head_config.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
detection_head = instance_heads.DetectionHead(
num_classes=model_config.num_classes,
num_convs=detection_head_config.num_convs,
num_filters=detection_head_config.num_filters,
use_separable_conv=detection_head_config.use_separable_conv,
num_fcs=detection_head_config.num_fcs,
fc_dims=detection_head_config.fc_dims,
class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer,
name='detection_head')
if decoder:
decoder_features = decoder(backbone_features)
rpn_head(decoder_features)
if roi_sampler_config.cascade_iou_thresholds:
detection_head_cascade = [detection_head]
for cascade_num in range(len(roi_sampler_config.cascade_iou_thresholds)):
detection_head = instance_heads.DetectionHead(
num_classes=model_config.num_classes,
num_convs=detection_head_config.num_convs,
num_filters=detection_head_config.num_filters,
use_separable_conv=detection_head_config.use_separable_conv,
num_fcs=detection_head_config.num_fcs,
fc_dims=detection_head_config.fc_dims,
class_agnostic_bbox_pred=detection_head_config
.class_agnostic_bbox_pred,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer,
name='detection_head_{}'.format(cascade_num + 1))
detection_head_cascade.append(detection_head)
detection_head = detection_head_cascade
roi_generator_obj = roi_generator.MultilevelROIGenerator(
pre_nms_top_k=roi_generator_config.pre_nms_top_k,
pre_nms_score_threshold=roi_generator_config.pre_nms_score_threshold,
pre_nms_min_size_threshold=(
roi_generator_config.pre_nms_min_size_threshold),
nms_iou_threshold=roi_generator_config.nms_iou_threshold,
num_proposals=roi_generator_config.num_proposals,
test_pre_nms_top_k=roi_generator_config.test_pre_nms_top_k,
test_pre_nms_score_threshold=(
roi_generator_config.test_pre_nms_score_threshold),
test_pre_nms_min_size_threshold=(
roi_generator_config.test_pre_nms_min_size_threshold),
test_nms_iou_threshold=roi_generator_config.test_nms_iou_threshold,
test_num_proposals=roi_generator_config.test_num_proposals,
use_batched_nms=roi_generator_config.use_batched_nms)
roi_sampler_cascade = []
roi_sampler_obj = roi_sampler.ROISampler(
mix_gt_boxes=roi_sampler_config.mix_gt_boxes,
num_sampled_rois=roi_sampler_config.num_sampled_rois,
foreground_fraction=roi_sampler_config.foreground_fraction,
foreground_iou_threshold=roi_sampler_config.foreground_iou_threshold,
background_iou_high_threshold=(
roi_sampler_config.background_iou_high_threshold),
background_iou_low_threshold=(
roi_sampler_config.background_iou_low_threshold))
roi_sampler_cascade.append(roi_sampler_obj)
# Initialize addtional roi simplers for cascade heads.
if roi_sampler_config.cascade_iou_thresholds:
for iou in roi_sampler_config.cascade_iou_thresholds:
roi_sampler_obj = roi_sampler.ROISampler(
mix_gt_boxes=False,
num_sampled_rois=roi_sampler_config.num_sampled_rois,
foreground_iou_threshold=iou,
background_iou_high_threshold=iou,
background_iou_low_threshold=0.0,
skip_subsampling=True)
roi_sampler_cascade.append(roi_sampler_obj)
roi_aligner_obj = roi_aligner.MultilevelROIAligner(
crop_size=roi_aligner_config.crop_size,
sample_offset=roi_aligner_config.sample_offset)
detection_generator_obj = detection_generator.DetectionGenerator(
apply_nms=generator_config.apply_nms,
pre_nms_top_k=generator_config.pre_nms_top_k,
pre_nms_score_threshold=generator_config.pre_nms_score_threshold,
nms_iou_threshold=generator_config.nms_iou_threshold,
max_num_detections=generator_config.max_num_detections,
nms_version=generator_config.nms_version,
use_cpu_nms=generator_config.use_cpu_nms,
soft_nms_sigma=generator_config.soft_nms_sigma)
if model_config.include_mask:
mask_head = instance_heads.MaskHead(
num_classes=model_config.num_classes,
upsample_factor=model_config.mask_head.upsample_factor,
num_convs=model_config.mask_head.num_convs,
num_filters=model_config.mask_head.num_filters,
use_separable_conv=model_config.mask_head.use_separable_conv,
activation=model_config.norm_activation.activation,
norm_momentum=model_config.norm_activation.norm_momentum,
norm_epsilon=model_config.norm_activation.norm_epsilon,
kernel_regularizer=l2_regularizer,
class_agnostic=model_config.mask_head.class_agnostic)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=(
model_config.mask_roi_aligner.crop_size *
model_config.mask_head.upsample_factor),
num_sampled_masks=model_config.mask_sampler.num_sampled_masks)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
crop_size=model_config.mask_roi_aligner.crop_size,
sample_offset=model_config.mask_roi_aligner.sample_offset)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone=backbone,
decoder=decoder,
rpn_head=rpn_head,
detection_head=detection_head,
roi_generator=roi_generator_obj,
roi_sampler=roi_sampler_cascade,
roi_aligner=roi_aligner_obj,
detection_generator=detection_generator_obj,
mask_head=mask_head,
mask_sampler=mask_sampler_obj,
mask_roi_aligner=mask_roi_aligner_obj,
class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
cascade_class_ensemble=detection_head_config.cascade_class_ensemble,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_scales=model_config.anchor.num_scales,
aspect_ratios=model_config.anchor.aspect_ratios,
anchor_size=model_config.anchor.anchor_size)
return model
def build_retinanet(
input_specs: tf.keras.layers.InputSpec,
model_config: retinanet_cfg.RetinaNet,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
backbone: Optional[tf.keras.Model] = None,
decoder: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds RetinaNet model."""
norm_activation_config = model_config.norm_activation
if not backbone:
backbone = backbones.factory.build_backbone(
input_specs=input_specs,
backbone_config=model_config.backbone,
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
backbone_features = backbone(tf.keras.Input(input_specs.shape[1:]))
if not decoder:
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
head_config = model_config.head
generator_config = model_config.detection_generator
num_anchors_per_location = (
len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales)
head = dense_prediction_heads.RetinaNetHead(
min_level=model_config.min_level,
max_level=model_config.max_level,
num_classes=model_config.num_classes,
num_anchors_per_location=num_anchors_per_location,
num_convs=head_config.num_convs,
num_filters=head_config.num_filters,
attribute_heads=[
cfg.as_dict() for cfg in (head_config.attribute_heads or [])
],
use_separable_conv=head_config.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
# Builds decoder and head so that their trainable weights are initialized
if decoder:
decoder_features = decoder(backbone_features)
_ = head(decoder_features)
detection_generator_obj = detection_generator.MultilevelDetectionGenerator(
apply_nms=generator_config.apply_nms,
pre_nms_top_k=generator_config.pre_nms_top_k,
pre_nms_score_threshold=generator_config.pre_nms_score_threshold,
nms_iou_threshold=generator_config.nms_iou_threshold,
max_num_detections=generator_config.max_num_detections,
nms_version=generator_config.nms_version,
use_cpu_nms=generator_config.use_cpu_nms,
soft_nms_sigma=generator_config.soft_nms_sigma,
tflite_post_processing_config=generator_config.tflite_post_processing
.as_dict())
model = retinanet_model.RetinaNetModel(
backbone,
decoder,
head,
detection_generator_obj,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_scales=model_config.anchor.num_scales,
aspect_ratios=model_config.anchor.aspect_ratios,
anchor_size=model_config.anchor.anchor_size)
return model
def build_segmentation_model(
input_specs: tf.keras.layers.InputSpec,
model_config: segmentation_cfg.SemanticSegmentationModel,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
backbone: Optional[tf.keras.regularizers.Regularizer] = None,
decoder: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds Segmentation model."""
norm_activation_config = model_config.norm_activation
if not backbone:
backbone = backbones.factory.build_backbone(
input_specs=input_specs,
backbone_config=model_config.backbone,
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
if not decoder:
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
head_config = model_config.head
head = segmentation_heads.SegmentationHead(
num_classes=model_config.num_classes,
level=head_config.level,
num_convs=head_config.num_convs,
prediction_kernel_size=head_config.prediction_kernel_size,
num_filters=head_config.num_filters,
use_depthwise_convolution=head_config.use_depthwise_convolution,
upsample_factor=head_config.upsample_factor,
feature_fusion=head_config.feature_fusion,
low_level=head_config.low_level,
low_level_num_filters=head_config.low_level_num_filters,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
mask_scoring_head = None
if model_config.mask_scoring_head:
mask_scoring_head = segmentation_heads.MaskScoring(
num_classes=model_config.num_classes,
**model_config.mask_scoring_head.as_dict(),
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
model = segmentation_model.SegmentationModel(
backbone, decoder, head, mask_scoring_head=mask_scoring_head)
return model
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory methods to build models."""
# Import libraries
import tensorflow as tf
from official.core import registry
from official.vision.configs import video_classification as video_classification_cfg
from official.vision.modeling import video_classification_model
from official.vision.modeling import backbones
_REGISTERED_MODEL_CLS = {}
def register_model_builder(key: str):
"""Decorates a builder of model class.
The builder should be a Callable (a class or a function).
This decorator supports registration of backbone builder as follows:
```
class MyModel(tf.keras.Model):
pass
@register_backbone_builder('mybackbone')
def builder(input_specs, config, l2_reg):
return MyModel(...)
# Builds a MyModel object.
my_backbone = build_backbone_3d(input_specs, config, l2_reg)
```
Args:
key: the key to look up the builder.
Returns:
A callable for use as class decorator that registers the decorated class
for creation from an instance of model class.
"""
return registry.register(_REGISTERED_MODEL_CLS, key)
def build_model(
model_type: str,
input_specs: tf.keras.layers.InputSpec,
model_config: video_classification_cfg.hyperparams.Config,
num_classes: int,
l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
"""Builds backbone from a config.
Args:
model_type: string name of model type. It should be consistent with
ModelConfig.model_type.
input_specs: tf.keras.layers.InputSpec.
model_config: a OneOfConfig. Model config.
num_classes: number of classes.
l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
Returns:
tf.keras.Model instance of the backbone.
"""
model_builder = registry.lookup(_REGISTERED_MODEL_CLS, model_type)
return model_builder(input_specs, model_config, num_classes, l2_regularizer)
@register_model_builder('video_classification')
def build_video_classification_model(
input_specs: tf.keras.layers.InputSpec,
model_config: video_classification_cfg.VideoClassificationModel,
num_classes: int,
l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
"""Builds the video classification model."""
input_specs_dict = {'image': input_specs}
norm_activation_config = model_config.norm_activation
backbone = backbones.factory.build_backbone(
input_specs=input_specs,
backbone_config=model_config.backbone,
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
model = video_classification_model.VideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs=input_specs_dict,
dropout_rate=model_config.dropout_rate,
aggregate_endpoints=model_config.aggregate_endpoints,
kernel_regularizer=l2_regularizer,
require_endpoints=model_config.require_endpoints)
return model
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for factory.py."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.configs import backbones
from official.vision.configs import backbones_3d
from official.vision.configs import image_classification as classification_cfg
from official.vision.configs import maskrcnn as maskrcnn_cfg
from official.vision.configs import retinanet as retinanet_cfg
from official.vision.configs import video_classification as video_classification_cfg
from official.vision.modeling import factory
from official.vision.modeling import factory_3d
class ClassificationModelBuilderTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
('resnet', (224, 224), 5e-5),
('resnet', (224, 224), None),
('resnet', (None, None), 5e-5),
('resnet', (None, None), None),
)
def test_builder(self, backbone_type, input_size, weight_decay):
num_classes = 2
input_specs = tf.keras.layers.InputSpec(
shape=[None, input_size[0], input_size[1], 3])
model_config = classification_cfg.ImageClassificationModel(
num_classes=num_classes,
backbone=backbones.Backbone(type=backbone_type))
l2_regularizer = (
tf.keras.regularizers.l2(weight_decay) if weight_decay else None)
_ = factory.build_classification_model(
input_specs=input_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
class MaskRCNNBuilderTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
('resnet', (640, 640)),
('resnet', (None, None)),
)
def test_builder(self, backbone_type, input_size):
num_classes = 2
input_specs = tf.keras.layers.InputSpec(
shape=[None, input_size[0], input_size[1], 3])
model_config = maskrcnn_cfg.MaskRCNN(
num_classes=num_classes,
backbone=backbones.Backbone(type=backbone_type))
l2_regularizer = tf.keras.regularizers.l2(5e-5)
_ = factory.build_maskrcnn(
input_specs=input_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
class RetinaNetBuilderTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
('resnet', (640, 640), False),
('resnet', (None, None), True),
)
def test_builder(self, backbone_type, input_size, has_att_heads):
num_classes = 2
input_specs = tf.keras.layers.InputSpec(
shape=[None, input_size[0], input_size[1], 3])
if has_att_heads:
attribute_heads_config = [
retinanet_cfg.AttributeHead(name='att1'),
retinanet_cfg.AttributeHead(
name='att2', type='classification', size=2),
]
else:
attribute_heads_config = None
model_config = retinanet_cfg.RetinaNet(
num_classes=num_classes,
backbone=backbones.Backbone(type=backbone_type),
head=retinanet_cfg.RetinaNetHead(
attribute_heads=attribute_heads_config))
l2_regularizer = tf.keras.regularizers.l2(5e-5)
_ = factory.build_retinanet(
input_specs=input_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
if has_att_heads:
self.assertEqual(model_config.head.attribute_heads[0].as_dict(),
dict(name='att1', type='regression', size=1))
self.assertEqual(model_config.head.attribute_heads[1].as_dict(),
dict(name='att2', type='classification', size=2))
class VideoClassificationModelBuilderTest(parameterized.TestCase,
tf.test.TestCase):
@parameterized.parameters(
('resnet_3d', (8, 224, 224), 5e-5),
('resnet_3d', (None, None, None), 5e-5),
)
def test_builder(self, backbone_type, input_size, weight_decay):
input_specs = tf.keras.layers.InputSpec(
shape=[None, input_size[0], input_size[1], input_size[2], 3])
model_config = video_classification_cfg.VideoClassificationModel(
backbone=backbones_3d.Backbone3D(type=backbone_type))
l2_regularizer = (
tf.keras.regularizers.l2(weight_decay) if weight_decay else None)
_ = factory_3d.build_video_classification_model(
input_specs=input_specs,
model_config=model_config,
num_classes=2,
l2_regularizer=l2_regularizer)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Heads package definition."""
from official.vision.modeling.heads.dense_prediction_heads import RetinaNetHead
from official.vision.modeling.heads.dense_prediction_heads import RPNHead
from official.vision.modeling.heads.instance_heads import DetectionHead
from official.vision.modeling.heads.instance_heads import MaskHead
from official.vision.modeling.heads.segmentation_heads import SegmentationHead
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of dense prediction heads."""
from typing import Any, Dict, List, Mapping, Optional, Union
# Import libraries
import numpy as np
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetHead(tf.keras.layers.Layer):
"""Creates a RetinaNet head."""
def __init__(
self,
min_level: int,
max_level: int,
num_classes: int,
num_anchors_per_location: int,
num_convs: int = 4,
num_filters: int = 256,
attribute_heads: Optional[List[Dict[str, Any]]] = None,
use_separable_conv: bool = False,
activation: str = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
num_params_per_anchor: int = 4,
**kwargs):
"""Initializes a RetinaNet head.
Args:
min_level: An `int` number of minimum feature level.
max_level: An `int` number of maximum feature level.
num_classes: An `int` number of classes to predict.
num_anchors_per_location: An `int` number of number of anchors per pixel
location.
num_convs: An `int` number that represents the number of the intermediate
conv layers before the prediction.
num_filters: An `int` number that represents the number of filters of the
intermediate conv layers.
attribute_heads: If not None, a list that contains a dict for each
additional attribute head. Each dict consists of 3 key-value pairs:
`name`, `type` ('regression' or 'classification'), and `size` (number
of predicted values for each instance).
use_separable_conv: A `bool` that indicates whether the separable
convolution layers is used.
activation: A `str` that indicates which activation is used, e.g. 'relu',
'swish', etc.
use_sync_bn: A `bool` that indicates whether to use synchronized batch
normalization across different replicas.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default is None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
num_params_per_anchor: Number of parameters required to specify an anchor
box. For example, `num_params_per_anchor` would be 4 for axis-aligned
anchor boxes specified by their y-centers, x-centers, heights, and
widths.
**kwargs: Additional keyword arguments to be passed.
"""
super(RetinaNetHead, self).__init__(**kwargs)
self._config_dict = {
'min_level': min_level,
'max_level': max_level,
'num_classes': num_classes,
'num_anchors_per_location': num_anchors_per_location,
'num_convs': num_convs,
'num_filters': num_filters,
'attribute_heads': attribute_heads,
'use_separable_conv': use_separable_conv,
'activation': activation,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
'num_params_per_anchor': num_params_per_anchor,
}
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation = tf_utils.get_activation(activation)
def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
"""Creates the variables of the head."""
conv_op = (tf.keras.layers.SeparableConv2D
if self._config_dict['use_separable_conv']
else tf.keras.layers.Conv2D)
conv_kwargs = {
'filters': self._config_dict['num_filters'],
'kernel_size': 3,
'padding': 'same',
'bias_initializer': tf.zeros_initializer(),
'bias_regularizer': self._config_dict['bias_regularizer'],
}
if not self._config_dict['use_separable_conv']:
conv_kwargs.update({
'kernel_initializer': tf.keras.initializers.RandomNormal(
stddev=0.01),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
})
bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
if self._config_dict['use_sync_bn']
else tf.keras.layers.BatchNormalization)
bn_kwargs = {
'axis': self._bn_axis,
'momentum': self._config_dict['norm_momentum'],
'epsilon': self._config_dict['norm_epsilon'],
}
# Class net.
self._cls_convs = []
self._cls_norms = []
for level in range(
self._config_dict['min_level'], self._config_dict['max_level'] + 1):
this_level_cls_norms = []
for i in range(self._config_dict['num_convs']):
if level == self._config_dict['min_level']:
cls_conv_name = 'classnet-conv_{}'.format(i)
self._cls_convs.append(conv_op(name=cls_conv_name, **conv_kwargs))
cls_norm_name = 'classnet-conv-norm_{}_{}'.format(level, i)
this_level_cls_norms.append(bn_op(name=cls_norm_name, **bn_kwargs))
self._cls_norms.append(this_level_cls_norms)
classifier_kwargs = {
'filters': (
self._config_dict['num_classes'] *
self._config_dict['num_anchors_per_location']),
'kernel_size': 3,
'padding': 'same',
'bias_initializer': tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
'bias_regularizer': self._config_dict['bias_regularizer'],
}
if not self._config_dict['use_separable_conv']:
classifier_kwargs.update({
'kernel_initializer': tf.keras.initializers.RandomNormal(stddev=1e-5),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
})
self._classifier = conv_op(name='scores', **classifier_kwargs)
# Box net.
self._box_convs = []
self._box_norms = []
for level in range(
self._config_dict['min_level'], self._config_dict['max_level'] + 1):
this_level_box_norms = []
for i in range(self._config_dict['num_convs']):
if level == self._config_dict['min_level']:
box_conv_name = 'boxnet-conv_{}'.format(i)
self._box_convs.append(conv_op(name=box_conv_name, **conv_kwargs))
box_norm_name = 'boxnet-conv-norm_{}_{}'.format(level, i)
this_level_box_norms.append(bn_op(name=box_norm_name, **bn_kwargs))
self._box_norms.append(this_level_box_norms)
box_regressor_kwargs = {
'filters': (self._config_dict['num_params_per_anchor'] *
self._config_dict['num_anchors_per_location']),
'kernel_size': 3,
'padding': 'same',
'bias_initializer': tf.zeros_initializer(),
'bias_regularizer': self._config_dict['bias_regularizer'],
}
if not self._config_dict['use_separable_conv']:
box_regressor_kwargs.update({
'kernel_initializer': tf.keras.initializers.RandomNormal(
stddev=1e-5),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
})
self._box_regressor = conv_op(name='boxes', **box_regressor_kwargs)
# Attribute learning nets.
if self._config_dict['attribute_heads']:
self._att_predictors = {}
self._att_convs = {}
self._att_norms = {}
for att_config in self._config_dict['attribute_heads']:
att_name = att_config['name']
att_type = att_config['type']
att_size = att_config['size']
att_convs_i = []
att_norms_i = []
# Build conv and norm layers.
for level in range(self._config_dict['min_level'],
self._config_dict['max_level'] + 1):
this_level_att_norms = []
for i in range(self._config_dict['num_convs']):
if level == self._config_dict['min_level']:
att_conv_name = '{}-conv_{}'.format(att_name, i)
att_convs_i.append(conv_op(name=att_conv_name, **conv_kwargs))
att_norm_name = '{}-conv-norm_{}_{}'.format(att_name, level, i)
this_level_att_norms.append(bn_op(name=att_norm_name, **bn_kwargs))
att_norms_i.append(this_level_att_norms)
self._att_convs[att_name] = att_convs_i
self._att_norms[att_name] = att_norms_i
# Build the final prediction layer.
att_predictor_kwargs = {
'filters':
(att_size * self._config_dict['num_anchors_per_location']),
'kernel_size': 3,
'padding': 'same',
'bias_initializer': tf.zeros_initializer(),
'bias_regularizer': self._config_dict['bias_regularizer'],
}
if att_type == 'regression':
att_predictor_kwargs.update(
{'bias_initializer': tf.zeros_initializer()})
elif att_type == 'classification':
att_predictor_kwargs.update({
'bias_initializer':
tf.constant_initializer(-np.log((1 - 0.01) / 0.01))
})
else:
raise ValueError(
'Attribute head type {} not supported.'.format(att_type))
if not self._config_dict['use_separable_conv']:
att_predictor_kwargs.update({
'kernel_initializer':
tf.keras.initializers.RandomNormal(stddev=1e-5),
'kernel_regularizer':
self._config_dict['kernel_regularizer'],
})
self._att_predictors[att_name] = conv_op(
name='{}_attributes'.format(att_name), **att_predictor_kwargs)
super(RetinaNetHead, self).build(input_shape)
def call(self, features: Mapping[str, tf.Tensor]):
"""Forward pass of the RetinaNet head.
Args:
features: A `dict` of `tf.Tensor` where
- key: A `str` of the level of the multilevel features.
- values: A `tf.Tensor`, the feature map tensors, whose shape is
[batch, height_l, width_l, channels].
Returns:
scores: A `dict` of `tf.Tensor` which includes scores of the predictions.
- key: A `str` of the level of the multilevel predictions.
- values: A `tf.Tensor` of the box scores predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
boxes: A `dict` of `tf.Tensor` which includes coordinates of the
predictions.
- key: A `str` of the level of the multilevel predictions.
- values: A `tf.Tensor` of the box scores predicted from a particular
feature level, whose shape is
[batch, height_l, width_l,
num_params_per_anchor * num_anchors_per_location].
attributes: a dict of (attribute_name, attribute_prediction). Each
`attribute_prediction` is a dict of:
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box scores predicted from a particular feature
level, whose shape is
[batch, height_l, width_l,
attribute_size * num_anchors_per_location].
Can be an empty dictionary if no attribute learning is required.
"""
scores = {}
boxes = {}
if self._config_dict['attribute_heads']:
attributes = {
att_config['name']: {}
for att_config in self._config_dict['attribute_heads']
}
else:
attributes = {}
for i, level in enumerate(
range(self._config_dict['min_level'],
self._config_dict['max_level'] + 1)):
this_level_features = features[str(level)]
# class net.
x = this_level_features
for conv, norm in zip(self._cls_convs, self._cls_norms[i]):
x = conv(x)
x = norm(x)
x = self._activation(x)
scores[str(level)] = self._classifier(x)
# box net.
x = this_level_features
for conv, norm in zip(self._box_convs, self._box_norms[i]):
x = conv(x)
x = norm(x)
x = self._activation(x)
boxes[str(level)] = self._box_regressor(x)
# attribute nets.
if self._config_dict['attribute_heads']:
for att_config in self._config_dict['attribute_heads']:
att_name = att_config['name']
x = this_level_features
for conv, norm in zip(self._att_convs[att_name],
self._att_norms[att_name][i]):
x = conv(x)
x = norm(x)
x = self._activation(x)
attributes[att_name][str(level)] = self._att_predictors[att_name](x)
return scores, boxes, attributes
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
@tf.keras.utils.register_keras_serializable(package='Vision')
class RPNHead(tf.keras.layers.Layer):
"""Creates a Region Proposal Network (RPN) head."""
def __init__(
self,
min_level: int,
max_level: int,
num_anchors_per_location: int,
num_convs: int = 1,
num_filters: int = 256,
use_separable_conv: bool = False,
activation: str = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs):
"""Initializes a Region Proposal Network head.
Args:
min_level: An `int` number of minimum feature level.
max_level: An `int` number of maximum feature level.
num_anchors_per_location: An `int` number of number of anchors per pixel
location.
num_convs: An `int` number that represents the number of the intermediate
convolution layers before the prediction.
num_filters: An `int` number that represents the number of filters of the
intermediate convolution layers.
use_separable_conv: A `bool` that indicates whether the separable
convolution layers is used.
activation: A `str` that indicates which activation is used, e.g. 'relu',
'swish', etc.
use_sync_bn: A `bool` that indicates whether to use synchronized batch
normalization across different replicas.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default is None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
**kwargs: Additional keyword arguments to be passed.
"""
super(RPNHead, self).__init__(**kwargs)
self._config_dict = {
'min_level': min_level,
'max_level': max_level,
'num_anchors_per_location': num_anchors_per_location,
'num_convs': num_convs,
'num_filters': num_filters,
'use_separable_conv': use_separable_conv,
'activation': activation,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
}
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation = tf_utils.get_activation(activation)
def build(self, input_shape):
"""Creates the variables of the head."""
conv_op = (tf.keras.layers.SeparableConv2D
if self._config_dict['use_separable_conv']
else tf.keras.layers.Conv2D)
conv_kwargs = {
'filters': self._config_dict['num_filters'],
'kernel_size': 3,
'padding': 'same',
'bias_initializer': tf.zeros_initializer(),
'bias_regularizer': self._config_dict['bias_regularizer'],
}
if not self._config_dict['use_separable_conv']:
conv_kwargs.update({
'kernel_initializer': tf.keras.initializers.RandomNormal(
stddev=0.01),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
})
bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
if self._config_dict['use_sync_bn']
else tf.keras.layers.BatchNormalization)
bn_kwargs = {
'axis': self._bn_axis,
'momentum': self._config_dict['norm_momentum'],
'epsilon': self._config_dict['norm_epsilon'],
}
self._convs = []
self._norms = []
for level in range(
self._config_dict['min_level'], self._config_dict['max_level'] + 1):
this_level_norms = []
for i in range(self._config_dict['num_convs']):
if level == self._config_dict['min_level']:
conv_name = 'rpn-conv_{}'.format(i)
self._convs.append(conv_op(name=conv_name, **conv_kwargs))
norm_name = 'rpn-conv-norm_{}_{}'.format(level, i)
this_level_norms.append(bn_op(name=norm_name, **bn_kwargs))
self._norms.append(this_level_norms)
classifier_kwargs = {
'filters': self._config_dict['num_anchors_per_location'],
'kernel_size': 1,
'padding': 'valid',
'bias_initializer': tf.zeros_initializer(),
'bias_regularizer': self._config_dict['bias_regularizer'],
}
if not self._config_dict['use_separable_conv']:
classifier_kwargs.update({
'kernel_initializer': tf.keras.initializers.RandomNormal(
stddev=1e-5),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
})
self._classifier = conv_op(name='rpn-scores', **classifier_kwargs)
box_regressor_kwargs = {
'filters': 4 * self._config_dict['num_anchors_per_location'],
'kernel_size': 1,
'padding': 'valid',
'bias_initializer': tf.zeros_initializer(),
'bias_regularizer': self._config_dict['bias_regularizer'],
}
if not self._config_dict['use_separable_conv']:
box_regressor_kwargs.update({
'kernel_initializer': tf.keras.initializers.RandomNormal(
stddev=1e-5),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
})
self._box_regressor = conv_op(name='rpn-boxes', **box_regressor_kwargs)
super(RPNHead, self).build(input_shape)
def call(self, features: Mapping[str, tf.Tensor]):
"""Forward pass of the RPN head.
Args:
features: A `dict` of `tf.Tensor` where
- key: A `str` of the level of the multilevel features.
- values: A `tf.Tensor`, the feature map tensors, whose shape is [batch,
height_l, width_l, channels].
Returns:
scores: A `dict` of `tf.Tensor` which includes scores of the predictions.
- key: A `str` of the level of the multilevel predictions.
- values: A `tf.Tensor` of the box scores predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
boxes: A `dict` of `tf.Tensor` which includes coordinates of the
predictions.
- key: A `str` of the level of the multilevel predictions.
- values: A `tf.Tensor` of the box scores predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
"""
scores = {}
boxes = {}
for i, level in enumerate(
range(self._config_dict['min_level'],
self._config_dict['max_level'] + 1)):
x = features[str(level)]
for conv, norm in zip(self._convs, self._norms[i]):
x = conv(x)
x = norm(x)
x = self._activation(x)
scores[str(level)] = self._classifier(x)
boxes[str(level)] = self._box_regressor(x)
return scores, boxes
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for dense_prediction_heads.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling.heads import dense_prediction_heads
class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(False, False, False),
(False, True, False),
(True, False, True),
(True, True, True),
)
def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads):
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
else:
attribute_heads = None
retinanet_head = dense_prediction_heads.RetinaNetHead(
min_level=3,
max_level=4,
num_classes=3,
num_anchors_per_location=3,
num_convs=2,
num_filters=256,
attribute_heads=attribute_heads,
use_separable_conv=use_separable_conv,
activation='relu',
use_sync_bn=use_sync_bn,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
features = {
'3': np.random.rand(2, 128, 128, 16),
'4': np.random.rand(2, 64, 64, 16),
}
scores, boxes, attributes = retinanet_head(features)
self.assertAllEqual(scores['3'].numpy().shape, [2, 128, 128, 9])
self.assertAllEqual(scores['4'].numpy().shape, [2, 64, 64, 9])
self.assertAllEqual(boxes['3'].numpy().shape, [2, 128, 128, 12])
self.assertAllEqual(boxes['4'].numpy().shape, [2, 64, 64, 12])
if has_att_heads:
for att in attributes.values():
self.assertAllEqual(att['3'].numpy().shape, [2, 128, 128, 3])
self.assertAllEqual(att['4'].numpy().shape, [2, 64, 64, 3])
def test_serialize_deserialize(self):
retinanet_head = dense_prediction_heads.RetinaNetHead(
min_level=3,
max_level=7,
num_classes=3,
num_anchors_per_location=9,
num_convs=2,
num_filters=16,
attribute_heads=None,
use_separable_conv=False,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
config = retinanet_head.get_config()
new_retinanet_head = (
dense_prediction_heads.RetinaNetHead.from_config(config))
self.assertAllEqual(
retinanet_head.get_config(), new_retinanet_head.get_config())
class RpnHeadTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(False, False),
(False, True),
(True, False),
(True, True),
)
def test_forward(self, use_separable_conv, use_sync_bn):
rpn_head = dense_prediction_heads.RPNHead(
min_level=3,
max_level=4,
num_anchors_per_location=3,
num_convs=2,
num_filters=256,
use_separable_conv=use_separable_conv,
activation='relu',
use_sync_bn=use_sync_bn,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
features = {
'3': np.random.rand(2, 128, 128, 16),
'4': np.random.rand(2, 64, 64, 16),
}
scores, boxes = rpn_head(features)
self.assertAllEqual(scores['3'].numpy().shape, [2, 128, 128, 3])
self.assertAllEqual(scores['4'].numpy().shape, [2, 64, 64, 3])
self.assertAllEqual(boxes['3'].numpy().shape, [2, 128, 128, 12])
self.assertAllEqual(boxes['4'].numpy().shape, [2, 64, 64, 12])
def test_serialize_deserialize(self):
rpn_head = dense_prediction_heads.RPNHead(
min_level=3,
max_level=7,
num_anchors_per_location=9,
num_convs=2,
num_filters=16,
use_separable_conv=False,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
config = rpn_head.get_config()
new_rpn_head = dense_prediction_heads.RPNHead.from_config(config)
self.assertAllEqual(rpn_head.get_config(), new_rpn_head.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of instance prediction heads."""
from typing import List, Union, Optional
# Import libraries
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Vision')
class DetectionHead(tf.keras.layers.Layer):
"""Creates a detection head."""
def __init__(
self,
num_classes: int,
num_convs: int = 0,
num_filters: int = 256,
use_separable_conv: bool = False,
num_fcs: int = 2,
fc_dims: int = 1024,
class_agnostic_bbox_pred: bool = False,
activation: str = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs):
"""Initializes a detection head.
Args:
num_classes: An `int` for the number of classes.
num_convs: An `int` number that represents the number of the intermediate
convolution layers before the FC layers.
num_filters: An `int` number that represents the number of filters of the
intermediate convolution layers.
use_separable_conv: A `bool` that indicates whether the separable
convolution layers is used.
num_fcs: An `int` number that represents the number of FC layers before
the predictions.
fc_dims: An `int` number that represents the number of dimension of the FC
layers.
class_agnostic_bbox_pred: `bool`, indicating whether bboxes should be
predicted for every class or not.
activation: A `str` that indicates which activation is used, e.g. 'relu',
'swish', etc.
use_sync_bn: A `bool` that indicates whether to use synchronized batch
normalization across different replicas.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default is None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
**kwargs: Additional keyword arguments to be passed.
"""
super(DetectionHead, self).__init__(**kwargs)
self._config_dict = {
'num_classes': num_classes,
'num_convs': num_convs,
'num_filters': num_filters,
'use_separable_conv': use_separable_conv,
'num_fcs': num_fcs,
'fc_dims': fc_dims,
'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
'activation': activation,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
}
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation = tf_utils.get_activation(activation)
def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
"""Creates the variables of the head."""
conv_op = (tf.keras.layers.SeparableConv2D
if self._config_dict['use_separable_conv']
else tf.keras.layers.Conv2D)
conv_kwargs = {
'filters': self._config_dict['num_filters'],
'kernel_size': 3,
'padding': 'same',
}
if self._config_dict['use_separable_conv']:
conv_kwargs.update({
'depthwise_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'pointwise_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'bias_initializer': tf.zeros_initializer(),
'depthwise_regularizer': self._config_dict['kernel_regularizer'],
'pointwise_regularizer': self._config_dict['kernel_regularizer'],
'bias_regularizer': self._config_dict['bias_regularizer'],
})
else:
conv_kwargs.update({
'kernel_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'bias_initializer': tf.zeros_initializer(),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
'bias_regularizer': self._config_dict['bias_regularizer'],
})
bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
if self._config_dict['use_sync_bn']
else tf.keras.layers.BatchNormalization)
bn_kwargs = {
'axis': self._bn_axis,
'momentum': self._config_dict['norm_momentum'],
'epsilon': self._config_dict['norm_epsilon'],
}
self._convs = []
self._conv_norms = []
for i in range(self._config_dict['num_convs']):
conv_name = 'detection-conv_{}'.format(i)
self._convs.append(conv_op(name=conv_name, **conv_kwargs))
bn_name = 'detection-conv-bn_{}'.format(i)
self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs))
self._fcs = []
self._fc_norms = []
for i in range(self._config_dict['num_fcs']):
fc_name = 'detection-fc_{}'.format(i)
self._fcs.append(
tf.keras.layers.Dense(
units=self._config_dict['fc_dims'],
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=1 / 3.0, mode='fan_out', distribution='uniform'),
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'],
name=fc_name))
bn_name = 'detection-fc-bn_{}'.format(i)
self._fc_norms.append(bn_op(name=bn_name, **bn_kwargs))
self._classifier = tf.keras.layers.Dense(
units=self._config_dict['num_classes'],
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
bias_initializer=tf.zeros_initializer(),
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'],
name='detection-scores')
num_box_outputs = (4 if self._config_dict['class_agnostic_bbox_pred'] else
self._config_dict['num_classes'] * 4)
self._box_regressor = tf.keras.layers.Dense(
units=num_box_outputs,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
bias_initializer=tf.zeros_initializer(),
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'],
name='detection-boxes')
super(DetectionHead, self).build(input_shape)
def call(self, inputs: tf.Tensor, training: bool = None):
"""Forward pass of box and class branches for the Mask-RCNN model.
Args:
inputs: A `tf.Tensor` of the shape [batch_size, num_instances, roi_height,
roi_width, roi_channels], representing the ROI features.
training: a `bool` indicating whether it is in `training` mode.
Returns:
class_outputs: A `tf.Tensor` of the shape
[batch_size, num_rois, num_classes], representing the class predictions.
box_outputs: A `tf.Tensor` of the shape
[batch_size, num_rois, num_classes * 4], representing the box
predictions.
"""
roi_features = inputs
_, num_rois, height, width, filters = roi_features.get_shape().as_list()
x = tf.reshape(roi_features, [-1, height, width, filters])
for conv, bn in zip(self._convs, self._conv_norms):
x = conv(x)
x = bn(x)
x = self._activation(x)
_, _, _, filters = x.get_shape().as_list()
x = tf.reshape(x, [-1, num_rois, height * width * filters])
for fc, bn in zip(self._fcs, self._fc_norms):
x = fc(x)
x = bn(x)
x = self._activation(x)
classes = self._classifier(x)
boxes = self._box_regressor(x)
return classes, boxes
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskHead(tf.keras.layers.Layer):
"""Creates a mask head."""
def __init__(
self,
num_classes: int,
upsample_factor: int = 2,
num_convs: int = 4,
num_filters: int = 256,
use_separable_conv: bool = False,
activation: str = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
class_agnostic: bool = False,
**kwargs):
"""Initializes a mask head.
Args:
num_classes: An `int` of the number of classes.
upsample_factor: An `int` that indicates the upsample factor to generate
the final predicted masks. It should be >= 1.
num_convs: An `int` number that represents the number of the intermediate
convolution layers before the mask prediction layers.
num_filters: An `int` number that represents the number of filters of the
intermediate convolution layers.
use_separable_conv: A `bool` that indicates whether the separable
convolution layers is used.
activation: A `str` that indicates which activation is used, e.g. 'relu',
'swish', etc.
use_sync_bn: A `bool` that indicates whether to use synchronized batch
normalization across different replicas.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default is None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
class_agnostic: A `bool`. If set, we use a single channel mask head that
is shared between all classes.
**kwargs: Additional keyword arguments to be passed.
"""
super(MaskHead, self).__init__(**kwargs)
self._config_dict = {
'num_classes': num_classes,
'upsample_factor': upsample_factor,
'num_convs': num_convs,
'num_filters': num_filters,
'use_separable_conv': use_separable_conv,
'activation': activation,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
'class_agnostic': class_agnostic
}
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation = tf_utils.get_activation(activation)
def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
"""Creates the variables of the head."""
conv_op = (tf.keras.layers.SeparableConv2D
if self._config_dict['use_separable_conv']
else tf.keras.layers.Conv2D)
conv_kwargs = {
'filters': self._config_dict['num_filters'],
'kernel_size': 3,
'padding': 'same',
}
if self._config_dict['use_separable_conv']:
conv_kwargs.update({
'depthwise_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'pointwise_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'bias_initializer': tf.zeros_initializer(),
'depthwise_regularizer': self._config_dict['kernel_regularizer'],
'pointwise_regularizer': self._config_dict['kernel_regularizer'],
'bias_regularizer': self._config_dict['bias_regularizer'],
})
else:
conv_kwargs.update({
'kernel_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'bias_initializer': tf.zeros_initializer(),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
'bias_regularizer': self._config_dict['bias_regularizer'],
})
bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
if self._config_dict['use_sync_bn']
else tf.keras.layers.BatchNormalization)
bn_kwargs = {
'axis': self._bn_axis,
'momentum': self._config_dict['norm_momentum'],
'epsilon': self._config_dict['norm_epsilon'],
}
self._convs = []
self._conv_norms = []
for i in range(self._config_dict['num_convs']):
conv_name = 'mask-conv_{}'.format(i)
self._convs.append(conv_op(name=conv_name, **conv_kwargs))
bn_name = 'mask-conv-bn_{}'.format(i)
self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs))
self._deconv = tf.keras.layers.Conv2DTranspose(
filters=self._config_dict['num_filters'],
kernel_size=self._config_dict['upsample_factor'],
strides=self._config_dict['upsample_factor'],
padding='valid',
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
bias_initializer=tf.zeros_initializer(),
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'],
name='mask-upsampling')
self._deconv_bn = bn_op(name='mask-deconv-bn', **bn_kwargs)
if self._config_dict['class_agnostic']:
num_filters = 1
else:
num_filters = self._config_dict['num_classes']
conv_kwargs = {
'filters': num_filters,
'kernel_size': 1,
'padding': 'valid',
}
if self._config_dict['use_separable_conv']:
conv_kwargs.update({
'depthwise_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'pointwise_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'bias_initializer': tf.zeros_initializer(),
'depthwise_regularizer': self._config_dict['kernel_regularizer'],
'pointwise_regularizer': self._config_dict['kernel_regularizer'],
'bias_regularizer': self._config_dict['bias_regularizer'],
})
else:
conv_kwargs.update({
'kernel_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'bias_initializer': tf.zeros_initializer(),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
'bias_regularizer': self._config_dict['bias_regularizer'],
})
self._mask_regressor = conv_op(name='mask-logits', **conv_kwargs)
super(MaskHead, self).build(input_shape)
def call(self, inputs: List[tf.Tensor], training: bool = None):
"""Forward pass of mask branch for the Mask-RCNN model.
Args:
inputs: A `list` of two tensors where
inputs[0]: A `tf.Tensor` of shape [batch_size, num_instances,
roi_height, roi_width, roi_channels], representing the ROI features.
inputs[1]: A `tf.Tensor` of shape [batch_size, num_instances],
representing the classes of the ROIs.
training: A `bool` indicating whether it is in `training` mode.
Returns:
mask_outputs: A `tf.Tensor` of shape
[batch_size, num_instances, roi_height * upsample_factor,
roi_width * upsample_factor], representing the mask predictions.
"""
roi_features, roi_classes = inputs
batch_size, num_rois, height, width, filters = (
roi_features.get_shape().as_list())
if batch_size is None:
batch_size = tf.shape(roi_features)[0]
x = tf.reshape(roi_features, [-1, height, width, filters])
for conv, bn in zip(self._convs, self._conv_norms):
x = conv(x)
x = bn(x)
x = self._activation(x)
x = self._deconv(x)
x = self._deconv_bn(x)
x = self._activation(x)
logits = self._mask_regressor(x)
mask_height = height * self._config_dict['upsample_factor']
mask_width = width * self._config_dict['upsample_factor']
if self._config_dict['class_agnostic']:
logits = tf.reshape(logits, [-1, num_rois, mask_height, mask_width, 1])
else:
logits = tf.reshape(
logits,
[-1, num_rois, mask_height, mask_width,
self._config_dict['num_classes']])
batch_indices = tf.tile(
tf.expand_dims(tf.range(batch_size), axis=1), [1, num_rois])
mask_indices = tf.tile(
tf.expand_dims(tf.range(num_rois), axis=0), [batch_size, 1])
if self._config_dict['class_agnostic']:
class_gather_indices = tf.zeros_like(roi_classes, dtype=tf.int32)
else:
class_gather_indices = tf.cast(roi_classes, dtype=tf.int32)
gather_indices = tf.stack(
[batch_indices, mask_indices, class_gather_indices],
axis=2)
mask_outputs = tf.gather_nd(
tf.transpose(logits, [0, 1, 4, 2, 3]), gather_indices)
return mask_outputs
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for instance_heads.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling.heads import instance_heads
class DetectionHeadTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(0, 0, False, False),
(0, 1, False, False),
(1, 0, False, False),
(1, 1, False, False),
)
def test_forward(self, num_convs, num_fcs, use_separable_conv, use_sync_bn):
detection_head = instance_heads.DetectionHead(
num_classes=3,
num_convs=num_convs,
num_filters=16,
use_separable_conv=use_separable_conv,
num_fcs=num_fcs,
fc_dims=4,
activation='relu',
use_sync_bn=use_sync_bn,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
roi_features = np.random.rand(2, 10, 128, 128, 16)
scores, boxes = detection_head(roi_features)
self.assertAllEqual(scores.numpy().shape, [2, 10, 3])
self.assertAllEqual(boxes.numpy().shape, [2, 10, 12])
def test_serialize_deserialize(self):
detection_head = instance_heads.DetectionHead(
num_classes=91,
num_convs=0,
num_filters=256,
use_separable_conv=False,
num_fcs=2,
fc_dims=1024,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
config = detection_head.get_config()
new_detection_head = instance_heads.DetectionHead.from_config(config)
self.assertAllEqual(
detection_head.get_config(), new_detection_head.get_config())
class MaskHeadTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(1, 1, False),
(1, 2, False),
(2, 1, False),
(2, 2, False),
)
def test_forward(self, upsample_factor, num_convs, use_sync_bn):
mask_head = instance_heads.MaskHead(
num_classes=3,
upsample_factor=upsample_factor,
num_convs=num_convs,
num_filters=16,
use_separable_conv=False,
activation='relu',
use_sync_bn=use_sync_bn,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
roi_features = np.random.rand(2, 10, 14, 14, 16)
roi_classes = np.zeros((2, 10))
masks = mask_head([roi_features, roi_classes])
self.assertAllEqual(
masks.numpy().shape,
[2, 10, 14 * upsample_factor, 14 * upsample_factor])
def test_serialize_deserialize(self):
mask_head = instance_heads.MaskHead(
num_classes=3,
upsample_factor=2,
num_convs=1,
num_filters=256,
use_separable_conv=False,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
kernel_regularizer=None,
bias_regularizer=None,
)
config = mask_head.get_config()
new_mask_head = instance_heads.MaskHead.from_config(config)
self.assertAllEqual(
mask_head.get_config(), new_mask_head.get_config())
def test_forward_class_agnostic(self):
mask_head = instance_heads.MaskHead(
num_classes=3,
class_agnostic=True
)
roi_features = np.random.rand(2, 10, 14, 14, 16)
roi_classes = np.zeros((2, 10))
masks = mask_head([roi_features, roi_classes])
self.assertAllEqual(masks.numpy().shape, [2, 10, 28, 28])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of segmentation heads."""
from typing import List, Union, Optional, Mapping, Tuple, Any
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.modeling.layers import nn_layers
from official.vision.ops import spatial_transform_ops
class MaskScoring(tf.keras.Model):
"""Creates a mask scoring layer.
This implements mask scoring layer from the paper:
Zhaojin Huang, Lichao Huang, Yongchao Gong, Chang Huang, Xinggang Wang.
Mask Scoring R-CNN.
(https://arxiv.org/pdf/1903.00241.pdf)
"""
def __init__(
self,
num_classes: int,
fc_input_size: List[int],
num_convs: int = 3,
num_filters: int = 256,
fc_dims: int = 1024,
num_fcs: int = 2,
activation: str = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs):
"""Initializes mask scoring layer.
Args:
num_classes: An `int` for number of classes.
fc_input_size: A List of `int` for the input size of the
fully connected layers.
num_convs: An`int` for number of conv layers.
num_filters: An `int` for the number of filters for conv layers.
fc_dims: An `int` number of filters for each fully connected layers.
num_fcs: An `int` for number of fully connected layers.
activation: A `str` name of the activation function.
use_sync_bn: A bool, whether or not to use sync batch normalization.
norm_momentum: A float for the momentum in BatchNorm. Defaults to 0.99.
norm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
0.001.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default is None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
**kwargs: Additional keyword arguments to be passed.
"""
super(MaskScoring, self).__init__(**kwargs)
self._config_dict = {
'num_classes': num_classes,
'num_convs': num_convs,
'num_filters': num_filters,
'fc_input_size': fc_input_size,
'fc_dims': fc_dims,
'num_fcs': num_fcs,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
'activation': activation,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
}
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation = tf_utils.get_activation(activation)
def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
"""Creates the variables of the mask scoring head."""
conv_op = tf.keras.layers.Conv2D
conv_kwargs = {
'filters': self._config_dict['num_filters'],
'kernel_size': 3,
'padding': 'same',
}
conv_kwargs.update({
'kernel_initializer': tf.keras.initializers.VarianceScaling(
scale=2, mode='fan_out', distribution='untruncated_normal'),
'bias_initializer': tf.zeros_initializer(),
'kernel_regularizer': self._config_dict['kernel_regularizer'],
'bias_regularizer': self._config_dict['bias_regularizer'],
})
bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
if self._config_dict['use_sync_bn']
else tf.keras.layers.BatchNormalization)
bn_kwargs = {
'axis': self._bn_axis,
'momentum': self._config_dict['norm_momentum'],
'epsilon': self._config_dict['norm_epsilon'],
}
self._convs = []
self._conv_norms = []
for i in range(self._config_dict['num_convs']):
conv_name = 'mask-scoring_{}'.format(i)
self._convs.append(conv_op(name=conv_name, **conv_kwargs))
bn_name = 'mask-scoring-bn_{}'.format(i)
self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs))
self._fcs = []
self._fc_norms = []
for i in range(self._config_dict['num_fcs']):
fc_name = 'mask-scoring-fc_{}'.format(i)
self._fcs.append(
tf.keras.layers.Dense(
units=self._config_dict['fc_dims'],
kernel_initializer=tf.keras.initializers.VarianceScaling(
scale=1 / 3.0, mode='fan_out', distribution='uniform'),
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'],
name=fc_name))
bn_name = 'mask-scoring-fc-bn_{}'.format(i)
self._fc_norms.append(bn_op(name=bn_name, **bn_kwargs))
self._classifier = tf.keras.layers.Dense(
units=self._config_dict['num_classes'],
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
bias_initializer=tf.zeros_initializer(),
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'],
name='iou-scores')
super(MaskScoring, self).build(input_shape)
def call(self, inputs: tf.Tensor, training: bool = None):
"""Forward pass mask scoring head.
Args:
inputs: A `tf.Tensor` of the shape [batch_size, width, size, num_classes],
representing the segmentation logits.
training: a `bool` indicating whether it is in `training` mode.
Returns:
mask_scores: A `tf.Tensor` of predicted mask scores
[batch_size, num_classes].
"""
x = tf.stop_gradient(inputs)
for conv, bn in zip(self._convs, self._conv_norms):
x = conv(x)
x = bn(x)
x = self._activation(x)
# Casts feat to float32 so the resize op can be run on TPU.
x = tf.cast(x, tf.float32)
x = tf.image.resize(x, size=self._config_dict['fc_input_size'],
method=tf.image.ResizeMethod.BILINEAR)
# Casts it back to be compatible with the rest opetations.
x = tf.cast(x, inputs.dtype)
_, h, w, filters = x.get_shape().as_list()
x = tf.reshape(x, [-1, h * w * filters])
for fc, bn in zip(self._fcs, self._fc_norms):
x = fc(x)
x = bn(x)
x = self._activation(x)
ious = self._classifier(x)
return ious
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationHead(tf.keras.layers.Layer):
"""Creates a segmentation head."""
def __init__(
self,
num_classes: int,
level: Union[int, str],
num_convs: int = 2,
num_filters: int = 256,
use_depthwise_convolution: bool = False,
prediction_kernel_size: int = 1,
upsample_factor: int = 1,
feature_fusion: Optional[str] = None,
decoder_min_level: Optional[int] = None,
decoder_max_level: Optional[int] = None,
low_level: int = 2,
low_level_num_filters: int = 48,
num_decoder_filters: int = 256,
activation: str = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs):
"""Initializes a segmentation head.
Args:
num_classes: An `int` number of mask classification categories. The number
of classes does not include background class.
level: An `int` or `str`, level to use to build segmentation head.
num_convs: An `int` number of stacked convolution before the last
prediction layer.
num_filters: An `int` number to specify the number of filters used.
Default is 256.
use_depthwise_convolution: A bool to specify if use depthwise separable
convolutions.
prediction_kernel_size: An `int` number to specify the kernel size of the
prediction layer.
upsample_factor: An `int` number to specify the upsampling factor to
generate finer mask. Default 1 means no upsampling is applied.
feature_fusion: One of `deeplabv3plus`, `pyramid_fusion`,
`panoptic_fpn_fusion`, or None. If `deeplabv3plus`, features from
decoder_features[level] will be fused with low level feature maps from
backbone. If `pyramid_fusion`, multiscale features will be resized and
fused at the target level.
decoder_min_level: An `int` of minimum level from decoder to use in
feature fusion. It is only used when feature_fusion is set to
`panoptic_fpn_fusion`.
decoder_max_level: An `int` of maximum level from decoder to use in
feature fusion. It is only used when feature_fusion is set to
`panoptic_fpn_fusion`.
low_level: An `int` of backbone level to be used for feature fusion. It is
used when feature_fusion is set to `deeplabv3plus`.
low_level_num_filters: An `int` of reduced number of filters for the low
level features before fusing it with higher level features. It is only
used when feature_fusion is set to `deeplabv3plus`.
num_decoder_filters: An `int` of number of filters in the decoder outputs.
It is only used when feature_fusion is set to `panoptic_fpn_fusion`.
activation: A `str` that indicates which activation is used, e.g. 'relu',
'swish', etc.
use_sync_bn: A `bool` that indicates whether to use synchronized batch
normalization across different replicas.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default is None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
**kwargs: Additional keyword arguments to be passed.
"""
super(SegmentationHead, self).__init__(**kwargs)
self._config_dict = {
'num_classes': num_classes,
'level': level,
'num_convs': num_convs,
'num_filters': num_filters,
'use_depthwise_convolution': use_depthwise_convolution,
'prediction_kernel_size': prediction_kernel_size,
'upsample_factor': upsample_factor,
'feature_fusion': feature_fusion,
'decoder_min_level': decoder_min_level,
'decoder_max_level': decoder_max_level,
'low_level': low_level,
'low_level_num_filters': low_level_num_filters,
'num_decoder_filters': num_decoder_filters,
'activation': activation,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer
}
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation = tf_utils.get_activation(activation)
def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
"""Creates the variables of the segmentation head."""
use_depthwise_convolution = self._config_dict['use_depthwise_convolution']
random_initializer = tf.keras.initializers.RandomNormal(stddev=0.01)
conv_op = tf.keras.layers.Conv2D
conv_kwargs = {
'kernel_size': 3 if not use_depthwise_convolution else 1,
'padding': 'same',
'use_bias': False,
'kernel_initializer': random_initializer,
'kernel_regularizer': self._config_dict['kernel_regularizer'],
}
bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
if self._config_dict['use_sync_bn']
else tf.keras.layers.BatchNormalization)
bn_kwargs = {
'axis': self._bn_axis,
'momentum': self._config_dict['norm_momentum'],
'epsilon': self._config_dict['norm_epsilon'],
}
if self._config_dict['feature_fusion'] == 'deeplabv3plus':
# Deeplabv3+ feature fusion layers.
self._dlv3p_conv = conv_op(
kernel_size=1,
padding='same',
use_bias=False,
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
kernel_regularizer=self._config_dict['kernel_regularizer'],
name='segmentation_head_deeplabv3p_fusion_conv',
filters=self._config_dict['low_level_num_filters'])
self._dlv3p_norm = bn_op(
name='segmentation_head_deeplabv3p_fusion_norm', **bn_kwargs)
elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion':
self._panoptic_fpn_fusion = nn_layers.PanopticFPNFusion(
min_level=self._config_dict['decoder_min_level'],
max_level=self._config_dict['decoder_max_level'],
target_level=self._config_dict['level'],
num_filters=self._config_dict['num_filters'],
num_fpn_filters=self._config_dict['num_decoder_filters'],
activation=self._config_dict['activation'],
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'])
# Segmentation head layers.
self._convs = []
self._norms = []
for i in range(self._config_dict['num_convs']):
if use_depthwise_convolution:
self._convs.append(
tf.keras.layers.DepthwiseConv2D(
name='segmentation_head_depthwise_conv_{}'.format(i),
kernel_size=3,
padding='same',
use_bias=False,
depthwise_initializer=random_initializer,
depthwise_regularizer=self._config_dict['kernel_regularizer'],
depth_multiplier=1))
norm_name = 'segmentation_head_depthwise_norm_{}'.format(i)
self._norms.append(bn_op(name=norm_name, **bn_kwargs))
conv_name = 'segmentation_head_conv_{}'.format(i)
self._convs.append(
conv_op(
name=conv_name,
filters=self._config_dict['num_filters'],
**conv_kwargs))
norm_name = 'segmentation_head_norm_{}'.format(i)
self._norms.append(bn_op(name=norm_name, **bn_kwargs))
self._classifier = conv_op(
name='segmentation_output',
filters=self._config_dict['num_classes'],
kernel_size=self._config_dict['prediction_kernel_size'],
padding='same',
bias_initializer=tf.zeros_initializer(),
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
kernel_regularizer=self._config_dict['kernel_regularizer'],
bias_regularizer=self._config_dict['bias_regularizer'])
super().build(input_shape)
def call(self, inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
Union[tf.Tensor, Mapping[str, tf.Tensor]]]):
"""Forward pass of the segmentation head.
It supports both a tuple of 2 tensors or 2 dictionaries. The first is
backbone endpoints, and the second is decoder endpoints. When inputs are
tensors, they are from a single level of feature maps. When inputs are
dictionaries, they contain multiple levels of feature maps, where the key
is the index of feature map.
Args:
inputs: A tuple of 2 feature map tensors of shape
[batch, height_l, width_l, channels] or 2 dictionaries of tensors:
- key: A `str` of the level of the multilevel features.
- values: A `tf.Tensor` of the feature map tensors, whose shape is
[batch, height_l, width_l, channels].
The first is backbone endpoints, and the second is decoder endpoints.
Returns:
segmentation prediction mask: A `tf.Tensor` of the segmentation mask
scores predicted from input features.
"""
backbone_output = inputs[0]
decoder_output = inputs[1]
if self._config_dict['feature_fusion'] == 'deeplabv3plus':
# deeplabv3+ feature fusion
x = decoder_output[str(self._config_dict['level'])] if isinstance(
decoder_output, dict) else decoder_output
y = backbone_output[str(self._config_dict['low_level'])] if isinstance(
backbone_output, dict) else backbone_output
y = self._dlv3p_norm(self._dlv3p_conv(y))
y = self._activation(y)
x = tf.image.resize(
x, tf.shape(y)[1:3], method=tf.image.ResizeMethod.BILINEAR)
x = tf.cast(x, dtype=y.dtype)
x = tf.concat([x, y], axis=self._bn_axis)
elif self._config_dict['feature_fusion'] == 'pyramid_fusion':
if not isinstance(decoder_output, dict):
raise ValueError('Only support dictionary decoder_output.')
x = nn_layers.pyramid_feature_fusion(decoder_output,
self._config_dict['level'])
elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion':
x = self._panoptic_fpn_fusion(decoder_output)
else:
x = decoder_output[str(self._config_dict['level'])] if isinstance(
decoder_output, dict) else decoder_output
for conv, norm in zip(self._convs, self._norms):
x = conv(x)
x = norm(x)
x = self._activation(x)
if self._config_dict['upsample_factor'] > 1:
x = spatial_transform_ops.nearest_upsampling(
x, scale=self._config_dict['upsample_factor'])
return self._classifier(x)
def get_config(self):
base_config = super().get_config()
return dict(list(base_config.items()) + list(self._config_dict.items()))
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation_heads.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling.heads import segmentation_heads
class SegmentationHeadTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(2, 'pyramid_fusion', None, None),
(3, 'pyramid_fusion', None, None),
(2, 'panoptic_fpn_fusion', 2, 5),
(2, 'panoptic_fpn_fusion', 2, 6),
(3, 'panoptic_fpn_fusion', 3, 5),
(3, 'panoptic_fpn_fusion', 3, 6))
def test_forward(self, level, feature_fusion,
decoder_min_level, decoder_max_level):
backbone_features = {
'3': np.random.rand(2, 128, 128, 16),
'4': np.random.rand(2, 64, 64, 16),
'5': np.random.rand(2, 32, 32, 16),
}
decoder_features = {
'3': np.random.rand(2, 128, 128, 64),
'4': np.random.rand(2, 64, 64, 64),
'5': np.random.rand(2, 32, 32, 64),
'6': np.random.rand(2, 16, 16, 64),
}
if feature_fusion == 'panoptic_fpn_fusion':
backbone_features['2'] = np.random.rand(2, 256, 256, 16)
decoder_features['2'] = np.random.rand(2, 256, 256, 64)
head = segmentation_heads.SegmentationHead(
num_classes=10,
level=level,
feature_fusion=feature_fusion,
decoder_min_level=decoder_min_level,
decoder_max_level=decoder_max_level,
num_decoder_filters=64)
logits = head((backbone_features, decoder_features))
if level in decoder_features:
self.assertAllEqual(logits.numpy().shape, [
2, decoder_features[str(level)].shape[1],
decoder_features[str(level)].shape[2], 10
])
def test_serialize_deserialize(self):
head = segmentation_heads.SegmentationHead(num_classes=10, level=3)
config = head.get_config()
new_head = segmentation_heads.SegmentationHead.from_config(config)
self.assertAllEqual(head.get_config(), new_head.get_config())
class MaskScoringHeadTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(1, 1, 64, [4, 4]),
(2, 1, 64, [4, 4]),
(3, 1, 64, [4, 4]),
(1, 2, 32, [8, 8]),
(2, 2, 32, [8, 8]),
(3, 2, 32, [8, 8]),)
def test_forward(self, num_convs, num_fcs,
num_filters, fc_input_size):
features = np.random.rand(2, 64, 64, 16)
head = segmentation_heads.MaskScoring(
num_classes=2,
num_convs=num_convs,
num_filters=num_filters,
fc_dims=128,
fc_input_size=fc_input_size)
scores = head(features)
self.assertAllEqual(scores.numpy().shape, [2, 2])
def test_serialize_deserialize(self):
head = segmentation_heads.MaskScoring(
num_classes=2, fc_input_size=[4, 4], fc_dims=128)
config = head.get_config()
new_head = segmentation_heads.MaskScoring.from_config(config)
self.assertAllEqual(head.get_config(), new_head.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Layers package definition."""
from official.vision.modeling.layers.box_sampler import BoxSampler
from official.vision.modeling.layers.detection_generator import DetectionGenerator
from official.vision.modeling.layers.detection_generator import MultilevelDetectionGenerator
from official.vision.modeling.layers.mask_sampler import MaskSampler
from official.vision.modeling.layers.nn_blocks import BottleneckBlock
from official.vision.modeling.layers.nn_blocks import BottleneckResidualInner
from official.vision.modeling.layers.nn_blocks import DepthwiseSeparableConvBlock
from official.vision.modeling.layers.nn_blocks import InvertedBottleneckBlock
from official.vision.modeling.layers.nn_blocks import ResidualBlock
from official.vision.modeling.layers.nn_blocks import ResidualInner
from official.vision.modeling.layers.nn_blocks import ReversibleLayer
from official.vision.modeling.layers.nn_blocks_3d import BottleneckBlock3D
from official.vision.modeling.layers.nn_blocks_3d import SelfGating
from official.vision.modeling.layers.nn_layers import CausalConvMixin
from official.vision.modeling.layers.nn_layers import Conv2D
from official.vision.modeling.layers.nn_layers import Conv3D
from official.vision.modeling.layers.nn_layers import DepthwiseConv2D
from official.vision.modeling.layers.nn_layers import GlobalAveragePool3D
from official.vision.modeling.layers.nn_layers import PositionalEncoding
from official.vision.modeling.layers.nn_layers import Scale
from official.vision.modeling.layers.nn_layers import SpatialAveragePool3D
from official.vision.modeling.layers.nn_layers import SqueezeExcitation
from official.vision.modeling.layers.nn_layers import StochasticDepth
from official.vision.modeling.layers.nn_layers import TemporalSoftmaxPool
from official.vision.modeling.layers.roi_aligner import MultilevelROIAligner
from official.vision.modeling.layers.roi_generator import MultilevelROIGenerator
from official.vision.modeling.layers.roi_sampler import ROISampler
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of box sampler."""
# Import libraries
import tensorflow as tf
from official.vision.ops import sampling_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class BoxSampler(tf.keras.layers.Layer):
"""Creates a BoxSampler to sample positive and negative boxes."""
def __init__(self,
num_samples: int = 512,
foreground_fraction: float = 0.25,
**kwargs):
"""Initializes a box sampler.
Args:
num_samples: An `int` of the number of sampled boxes per image.
foreground_fraction: A `float` in [0, 1], what percentage of boxes should
be sampled from the positive examples.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'num_samples': num_samples,
'foreground_fraction': foreground_fraction,
}
super(BoxSampler, self).__init__(**kwargs)
def call(self, positive_matches: tf.Tensor, negative_matches: tf.Tensor,
ignored_matches: tf.Tensor):
"""Samples and selects positive and negative instances.
Args:
positive_matches: A `bool` tensor of shape of [batch, N] where N is the
number of instances. For each element, `True` means the instance
corresponds to a positive example.
negative_matches: A `bool` tensor of shape of [batch, N] where N is the
number of instances. For each element, `True` means the instance
corresponds to a negative example.
ignored_matches: A `bool` tensor of shape of [batch, N] where N is the
number of instances. For each element, `True` means the instance should
be ignored.
Returns:
A `tf.tensor` of shape of [batch_size, K], storing the indices of the
sampled examples, where K is `num_samples`.
"""
sample_candidates = tf.logical_and(
tf.logical_or(positive_matches, negative_matches),
tf.logical_not(ignored_matches))
sampler = sampling_ops.BalancedPositiveNegativeSampler(
positive_fraction=self._config_dict['foreground_fraction'],
is_static=True)
batch_size = sample_candidates.shape[0]
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(
sample_candidates[i],
self._config_dict['num_samples'],
positive_matches[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, selected_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=self._config_dict['num_samples'],
sorted=True)
return selected_indices
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Layers for DeepLabV3."""
import tensorflow as tf
class SpatialPyramidPooling(tf.keras.layers.Layer):
"""Implements the Atrous Spatial Pyramid Pooling.
References:
[Rethinking Atrous Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1706.05587.pdf)
[Encoder-Decoder with Atrous Separable Convolution for Semantic Image
Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
"""
def __init__(
self,
output_channels,
dilation_rates,
pool_kernel_size=None,
use_sync_bn=False,
batchnorm_momentum=0.99,
batchnorm_epsilon=0.001,
activation='relu',
dropout=0.5,
kernel_initializer='glorot_uniform',
kernel_regularizer=None,
interpolation='bilinear',
use_depthwise_convolution=False,
**kwargs):
"""Initializes `SpatialPyramidPooling`.
Args:
output_channels: Number of channels produced by SpatialPyramidPooling.
dilation_rates: A list of integers for parallel dilated conv.
pool_kernel_size: A list of integers or None. If None, global average
pooling is applied, otherwise an average pooling of pool_kernel_size
is applied.
use_sync_bn: A bool, whether or not to use sync batch normalization.
batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
0.99.
batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
0.001.
activation: A `str` for type of activation to be used. Defaults to 'relu'.
dropout: A float for the dropout rate before output. Defaults to 0.5.
kernel_initializer: Kernel initializer for conv layers. Defaults to
`glorot_uniform`.
kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
interpolation: The interpolation method for upsampling. Defaults to
`bilinear`.
use_depthwise_convolution: Allows spatial pooling to be separable
depthwise convolusions. [Encoder-Decoder with Atrous Separable
Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1802.02611.pdf)
**kwargs: Other keyword arguments for the layer.
"""
super(SpatialPyramidPooling, self).__init__(**kwargs)
self.output_channels = output_channels
self.dilation_rates = dilation_rates
self.use_sync_bn = use_sync_bn
self.batchnorm_momentum = batchnorm_momentum
self.batchnorm_epsilon = batchnorm_epsilon
self.activation = activation
self.dropout = dropout
self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self.interpolation = interpolation
self.input_spec = tf.keras.layers.InputSpec(ndim=4)
self.pool_kernel_size = pool_kernel_size
self.use_depthwise_convolution = use_depthwise_convolution
def build(self, input_shape):
height = input_shape[1]
width = input_shape[2]
channels = input_shape[3]
self.aspp_layers = []
if self.use_sync_bn:
bn_op = tf.keras.layers.experimental.SyncBatchNormalization
else:
bn_op = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = -1
else:
bn_axis = 1
conv_sequential = tf.keras.Sequential([
tf.keras.layers.Conv2D(
filters=self.output_channels, kernel_size=(1, 1),
kernel_initializer=self.kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
use_bias=False),
bn_op(
axis=bn_axis,
momentum=self.batchnorm_momentum,
epsilon=self.batchnorm_epsilon),
tf.keras.layers.Activation(self.activation)
])
self.aspp_layers.append(conv_sequential)
for dilation_rate in self.dilation_rates:
leading_layers = []
kernel_size = (3, 3)
if self.use_depthwise_convolution:
leading_layers += [
tf.keras.layers.DepthwiseConv2D(
depth_multiplier=1, kernel_size=kernel_size,
padding='same', depthwise_regularizer=self.kernel_regularizer,
depthwise_initializer=self.kernel_initializer,
dilation_rate=dilation_rate, use_bias=False)
]
kernel_size = (1, 1)
conv_sequential = tf.keras.Sequential(leading_layers + [
tf.keras.layers.Conv2D(
filters=self.output_channels, kernel_size=kernel_size,
padding='same', kernel_regularizer=self.kernel_regularizer,
kernel_initializer=self.kernel_initializer,
dilation_rate=dilation_rate, use_bias=False),
bn_op(axis=bn_axis, momentum=self.batchnorm_momentum,
epsilon=self.batchnorm_epsilon),
tf.keras.layers.Activation(self.activation)])
self.aspp_layers.append(conv_sequential)
if self.pool_kernel_size is None:
pool_sequential = tf.keras.Sequential([
tf.keras.layers.GlobalAveragePooling2D(),
tf.keras.layers.Reshape((1, 1, channels))])
else:
pool_sequential = tf.keras.Sequential([
tf.keras.layers.AveragePooling2D(self.pool_kernel_size)])
pool_sequential.add(
tf.keras.Sequential([
tf.keras.layers.Conv2D(
filters=self.output_channels,
kernel_size=(1, 1),
kernel_initializer=self.kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
use_bias=False),
bn_op(
axis=bn_axis,
momentum=self.batchnorm_momentum,
epsilon=self.batchnorm_epsilon),
tf.keras.layers.Activation(self.activation),
tf.keras.layers.experimental.preprocessing.Resizing(
height,
width,
interpolation=self.interpolation,
dtype=tf.float32)
]))
self.aspp_layers.append(pool_sequential)
self.projection = tf.keras.Sequential([
tf.keras.layers.Conv2D(
filters=self.output_channels, kernel_size=(1, 1),
kernel_initializer=self.kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
use_bias=False),
bn_op(
axis=bn_axis,
momentum=self.batchnorm_momentum,
epsilon=self.batchnorm_epsilon),
tf.keras.layers.Activation(self.activation),
tf.keras.layers.Dropout(rate=self.dropout)])
def call(self, inputs, training=None):
if training is None:
training = tf.keras.backend.learning_phase()
result = []
for layer in self.aspp_layers:
result.append(tf.cast(layer(inputs, training=training), inputs.dtype))
result = tf.concat(result, axis=-1)
result = self.projection(result, training=training)
return result
def get_config(self):
config = {
'output_channels': self.output_channels,
'dilation_rates': self.dilation_rates,
'pool_kernel_size': self.pool_kernel_size,
'use_sync_bn': self.use_sync_bn,
'batchnorm_momentum': self.batchnorm_momentum,
'batchnorm_epsilon': self.batchnorm_epsilon,
'activation': self.activation,
'dropout': self.dropout,
'kernel_initializer': tf.keras.initializers.serialize(
self.kernel_initializer),
'kernel_regularizer': tf.keras.regularizers.serialize(
self.kernel_regularizer),
'interpolation': self.interpolation,
}
base_config = super(SpatialPyramidPooling, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for ASPP."""
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized
from official.vision.modeling.layers import deeplab
@keras_parameterized.run_all_keras_modes
class DeeplabTest(keras_parameterized.TestCase):
@keras_parameterized.parameterized.parameters(
(None,),
([32, 32],),
)
def test_aspp(self, pool_kernel_size):
inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
layer = deeplab.SpatialPyramidPooling(output_channels=256,
dilation_rates=[6, 12, 18],
pool_kernel_size=None)
output = layer(inputs)
self.assertAllEqual([None, 64, 64, 256], output.shape)
def test_aspp_invalid_shape(self):
inputs = tf.keras.Input(shape=(64, 64), dtype=tf.float32)
layer = deeplab.SpatialPyramidPooling(output_channels=256,
dilation_rates=[6, 12, 18])
with self.assertRaises(ValueError):
_ = layer(inputs)
def test_config_with_custom_name(self):
layer = deeplab.SpatialPyramidPooling(256, [5], name='aspp')
config = layer.get_config()
layer_1 = deeplab.SpatialPyramidPooling.from_config(config)
self.assertEqual(layer_1.name, layer.name)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of generators to generate the final detections."""
import contextlib
from typing import Any, Dict, List, Optional, Mapping, Sequence
# Import libraries
import tensorflow as tf
from official.vision.ops import box_ops
from official.vision.ops import nms
from official.vision.ops import preprocess_ops
def _generate_detections_v1(boxes: tf.Tensor,
scores: tf.Tensor,
attributes: Optional[Mapping[str,
tf.Tensor]] = None,
pre_nms_top_k: int = 5000,
pre_nms_score_threshold: float = 0.05,
nms_iou_threshold: float = 0.5,
max_num_detections: int = 100,
soft_nms_sigma: Optional[float] = None):
"""Generates the final detections given the model outputs.
The implementation unrolls the batch dimension and process images one by one.
It required the batch dimension to be statically known and it is TPU
compatible.
Args:
boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
`[batch_size, N, 1, 4]` for box predictions on all feature levels. The
N is the number of total anchors on all levels.
scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
stacks class probability on all feature levels. The N is the number of
total anchors on all levels. The num_classes is the number of classes
predicted by the model. Note that the class_outputs here is the raw score.
attributes: None or a dict of (attribute_name, attributes) pairs. Each
attributes is a `tf.Tensor` with shape
`[batch_size, N, num_classes, attribute_size]` or
`[batch_size, N, 1, attribute_size]` for attribute predictions on all
feature levels. The N is the number of total anchors on all levels. Can
be None if no attribute learning is required.
pre_nms_top_k: An `int` number of top candidate detections per class before
NMS.
pre_nms_score_threshold: A `float` representing the threshold for deciding
when to remove boxes based on score.
nms_iou_threshold: A `float` representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
max_num_detections: A scalar representing maximum number of boxes retained
over all classes.
soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
When soft_nms_sigma=0.0 (which is default), we fall back to standard NMS.
Returns:
nms_boxes: A `float` type `tf.Tensor` of shape
`[batch_size, max_num_detections, 4]` representing top detected boxes in
`[y1, x1, y2, x2]`.
nms_scores: A `float` type `tf.Tensor` of shape
`[batch_size, max_num_detections]` representing sorted confidence scores
for detected boxes. The values are between `[0, 1]`.
nms_classes: An `int` type `tf.Tensor` of shape
`[batch_size, max_num_detections]` representing classes for detected
boxes.
valid_detections: An `int` type `tf.Tensor` of shape `[batch_size]` only the
top `valid_detections` boxes are valid detections.
nms_attributes: None or a dict of (attribute_name, attributes). Each
attribute is a `float` type `tf.Tensor` of shape
`[batch_size, max_num_detections, attribute_size]` representing attribute
predictions for detected boxes. Can be an empty dict if no attribute
learning is required.
"""
with tf.name_scope('generate_detections'):
batch_size = scores.get_shape().as_list()[0]
nmsed_boxes = []
nmsed_classes = []
nmsed_scores = []
valid_detections = []
if attributes:
nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
else:
nmsed_attributes = {}
for i in range(batch_size):
(nmsed_boxes_i, nmsed_scores_i, nmsed_classes_i, valid_detections_i,
nmsed_att_i) = _generate_detections_per_image(
boxes[i],
scores[i],
attributes={
att_name: att[i] for att_name, att in attributes.items()
} if attributes else {},
pre_nms_top_k=pre_nms_top_k,
pre_nms_score_threshold=pre_nms_score_threshold,
nms_iou_threshold=nms_iou_threshold,
max_num_detections=max_num_detections,
soft_nms_sigma=soft_nms_sigma)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
valid_detections.append(valid_detections_i)
if attributes:
for att_name in attributes.keys():
nmsed_attributes[att_name].append(nmsed_att_i[att_name])
nmsed_boxes = tf.stack(nmsed_boxes, axis=0)
nmsed_scores = tf.stack(nmsed_scores, axis=0)
nmsed_classes = tf.stack(nmsed_classes, axis=0)
valid_detections = tf.stack(valid_detections, axis=0)
if attributes:
for att_name in attributes.keys():
nmsed_attributes[att_name] = tf.stack(nmsed_attributes[att_name], axis=0)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
def _generate_detections_per_image(
boxes: tf.Tensor,
scores: tf.Tensor,
attributes: Optional[Mapping[str, tf.Tensor]] = None,
pre_nms_top_k: int = 5000,
pre_nms_score_threshold: float = 0.05,
nms_iou_threshold: float = 0.5,
max_num_detections: int = 100,
soft_nms_sigma: Optional[float] = None):
"""Generates the final detections per image given the model outputs.
Args:
boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`, which
box predictions on all feature levels. The N is the number of total
anchors on all levels.
scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class
probability on all feature levels. The N is the number of total anchors on
all levels. The num_classes is the number of classes predicted by the
model. Note that the class_outputs here is the raw score.
attributes: If not None, a dict of `tf.Tensor`. Each value is in shape
`[N, num_classes, attribute_size]` or `[N, 1, attribute_size]` of
attribute predictions on all feature levels. The N is the number of total
anchors on all levels.
pre_nms_top_k: An `int` number of top candidate detections per class before
NMS.
pre_nms_score_threshold: A `float` representing the threshold for deciding
when to remove boxes based on score.
nms_iou_threshold: A `float` representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
max_num_detections: A `scalar` representing maximum number of boxes retained
over all classes.
soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
When soft_nms_sigma=0.0, we fall back to standard NMS.
If set to None, `tf.image.non_max_suppression_padded` is called instead.
Returns:
nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]`
representing top detected boxes in `[y1, x1, y2, x2]`.
nms_scores: A `float` tf.Tensor of shape `[max_num_detections]` representing
sorted confidence scores for detected boxes. The values are between [0,
1].
nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` representing
classes for detected boxes.
valid_detections: An `int` tf.Tensor of shape [1] only the top
`valid_detections` boxes are valid detections.
nms_attributes: None or a dict. Each value is a `float` tf.Tensor of shape
`[max_num_detections, attribute_size]` representing attribute predictions
for detected boxes. Can be an empty dict if `attributes` is None.
"""
nmsed_boxes = []
nmsed_scores = []
nmsed_classes = []
num_classes_for_box = boxes.get_shape().as_list()[1]
num_classes = scores.get_shape().as_list()[1]
if attributes:
nmsed_attributes = {att_name: [] for att_name in attributes.keys()}
else:
nmsed_attributes = {}
for i in range(num_classes):
boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
scores_i = scores[:, i]
# Obtains pre_nms_top_k before running NMS.
scores_i, indices = tf.nn.top_k(
scores_i, k=tf.minimum(tf.shape(scores_i)[-1], pre_nms_top_k))
boxes_i = tf.gather(boxes_i, indices)
if soft_nms_sigma is not None:
(nmsed_indices_i,
nmsed_scores_i) = tf.image.non_max_suppression_with_scores(
tf.cast(boxes_i, tf.float32),
tf.cast(scores_i, tf.float32),
max_num_detections,
iou_threshold=nms_iou_threshold,
score_threshold=pre_nms_score_threshold,
soft_nms_sigma=soft_nms_sigma,
name='nms_detections_' + str(i))
nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
nmsed_boxes_i = preprocess_ops.clip_or_pad_to_fixed_size(
nmsed_boxes_i, max_num_detections, 0.0)
nmsed_scores_i = preprocess_ops.clip_or_pad_to_fixed_size(
nmsed_scores_i, max_num_detections, -1.0)
else:
(nmsed_indices_i,
nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
tf.cast(boxes_i, tf.float32),
tf.cast(scores_i, tf.float32),
max_num_detections,
iou_threshold=nms_iou_threshold,
score_threshold=pre_nms_score_threshold,
pad_to_max_output_size=True,
name='nms_detections_' + str(i))
nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
# Sets scores of invalid boxes to -1.
nmsed_scores_i = tf.where(
tf.less(tf.range(max_num_detections), [nmsed_num_valid_i]),
nmsed_scores_i, -tf.ones_like(nmsed_scores_i))
nmsed_classes_i = tf.fill([max_num_detections], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
if attributes:
for att_name, att in attributes.items():
num_classes_for_attr = att.get_shape().as_list()[1]
att_i = att[:, min(num_classes_for_attr - 1, i)]
att_i = tf.gather(att_i, indices)
nmsed_att_i = tf.gather(att_i, nmsed_indices_i)
nmsed_att_i = preprocess_ops.clip_or_pad_to_fixed_size(
nmsed_att_i, max_num_detections, 0.0)
nmsed_attributes[att_name].append(nmsed_att_i)
# Concats results from all classes and sort them.
nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
nmsed_scores = tf.concat(nmsed_scores, axis=0)
nmsed_classes = tf.concat(nmsed_classes, axis=0)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_num_detections, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices)
nmsed_classes = tf.gather(nmsed_classes, indices)
valid_detections = tf.reduce_sum(
tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
if attributes:
for att_name in attributes.keys():
nmsed_attributes[att_name] = tf.concat(nmsed_attributes[att_name], axis=0)
nmsed_attributes[att_name] = tf.gather(nmsed_attributes[att_name],
indices)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes
def _select_top_k_scores(scores_in: tf.Tensor, pre_nms_num_detections: int):
"""Selects top_k scores and indices for each class.
Args:
scores_in: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
stacks class logit outputs on all feature levels. The N is the number of
total anchors on all levels. The num_classes is the number of classes
predicted by the model.
pre_nms_num_detections: Number of candidates before NMS.
Returns:
scores and indices: A `tf.Tensor` with shape
`[batch_size, pre_nms_num_detections, num_classes]`.
"""
batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
if batch_size is None:
batch_size = tf.shape(scores_in)[0]
scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
scores_trans = tf.reshape(scores_trans, [-1, num_anchors])
top_k_scores, top_k_indices = tf.nn.top_k(
scores_trans, k=pre_nms_num_detections, sorted=True)
top_k_scores = tf.reshape(top_k_scores,
[batch_size, num_class, pre_nms_num_detections])
top_k_indices = tf.reshape(top_k_indices,
[batch_size, num_class, pre_nms_num_detections])
return tf.transpose(top_k_scores,
[0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1])
def _generate_detections_v2(boxes: tf.Tensor,
scores: tf.Tensor,
pre_nms_top_k: int = 5000,
pre_nms_score_threshold: float = 0.05,
nms_iou_threshold: float = 0.5,
max_num_detections: int = 100):
"""Generates the final detections given the model outputs.
This implementation unrolls classes dimension while using the tf.while_loop
to implement the batched NMS, so that it can be parallelized at the batch
dimension. It should give better performance comparing to v1 implementation.
It is TPU compatible.
Args:
boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
`[batch_size, N, 1, 4]`, which box predictions on all feature levels. The
N is the number of total anchors on all levels.
scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
stacks class probability on all feature levels. The N is the number of
total anchors on all levels. The num_classes is the number of classes
predicted by the model. Note that the class_outputs here is the raw score.
pre_nms_top_k: An `int` number of top candidate detections per class before
NMS.
pre_nms_score_threshold: A `float` representing the threshold for deciding
when to remove boxes based on score.
nms_iou_threshold: A `float` representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
max_num_detections: A `scalar` representing maximum number of boxes retained
over all classes.
Returns:
nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
representing classes for detected boxes.
valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
nmsed_boxes = []
nmsed_classes = []
nmsed_scores = []
valid_detections = []
batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
if batch_size is None:
batch_size = tf.shape(boxes)[0]
_, total_anchors, num_classes = scores.get_shape().as_list()
# Selects top pre_nms_num scores and indices before NMS.
scores, indices = _select_top_k_scores(
scores, min(total_anchors, pre_nms_top_k))
for i in range(num_classes):
boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
scores_i = scores[:, :, i]
# Obtains pre_nms_top_k before running NMS.
boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
# Filter out scores.
boxes_i, scores_i = box_ops.filter_boxes_by_scores(
boxes_i, scores_i, min_score_threshold=pre_nms_score_threshold)
(nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
tf.cast(scores_i, tf.float32),
tf.cast(boxes_i, tf.float32),
max_num_detections,
iou_threshold=nms_iou_threshold)
nmsed_classes_i = tf.fill([batch_size, max_num_detections], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
nmsed_scores = tf.concat(nmsed_scores, axis=1)
nmsed_classes = tf.concat(nmsed_classes, axis=1)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_num_detections, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
valid_detections = tf.reduce_sum(
input_tensor=tf.cast(tf.greater(nmsed_scores, 0.0), tf.int32), axis=1)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _generate_detections_batched(boxes: tf.Tensor, scores: tf.Tensor,
pre_nms_score_threshold: float,
nms_iou_threshold: float,
max_num_detections: int):
"""Generates detected boxes with scores and classes for one-stage detector.
The function takes output of multi-level ConvNets and anchor boxes and
generates detected boxes. Note that this used batched nms, which is not
supported on TPU currently.
Args:
boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or
`[batch_size, N, 1, 4]`, which box predictions on all feature levels. The
N is the number of total anchors on all levels.
scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which
stacks class probability on all feature levels. The N is the number of
total anchors on all levels. The num_classes is the number of classes
predicted by the model. Note that the class_outputs here is the raw score.
pre_nms_score_threshold: A `float` representing the threshold for deciding
when to remove boxes based on score.
nms_iou_threshold: A `float` representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
max_num_detections: A `scalar` representing maximum number of boxes retained
over all classes.
Returns:
nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections]
representing classes for detected boxes.
valid_detections: An `int` tf.Tensor of shape [batch_size] only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
tf.image.combined_non_max_suppression(
boxes,
scores,
max_output_size_per_class=max_num_detections,
max_total_size=max_num_detections,
iou_threshold=nms_iou_threshold,
score_threshold=pre_nms_score_threshold,
pad_per_class=False,
clip_boxes=False))
nmsed_classes = tf.cast(nmsed_classes, tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _generate_detections_tflite_implements_signature(
config: Dict[str, Any]) -> str:
"""Returns `experimental_implements` signature for TFLite's custom NMS op.
This signature encodes the arguments to correctly initialize TFLite's custom
post-processing op in the MLIR converter.
For details on `experimental_implements` see here:
https://www.tensorflow.org/api_docs/python/tf/function
Args:
config: A dictionary of configs defining parameters for TFLite NMS op.
Returns:
An `experimental_implements` signature string.
"""
scale_value = 1.0
implements_signature = [
'name: "%s"' % 'TFLite_Detection_PostProcess',
'attr { key: "max_detections" value { i: %d } }' %
config['max_detections'],
'attr { key: "max_classes_per_detection" value { i: %d } }' %
config['max_classes_per_detection'],
'attr { key: "use_regular_nms" value { b: %s } }' %
str(config['use_regular_nms']).lower(),
'attr { key: "nms_score_threshold" value { f: %f } }' %
config['nms_score_threshold'],
'attr { key: "nms_iou_threshold" value { f: %f } }' %
config['nms_iou_threshold'],
'attr { key: "y_scale" value { f: %f } }' % scale_value,
'attr { key: "x_scale" value { f: %f } }' % scale_value,
'attr { key: "h_scale" value { f: %f } }' % scale_value,
'attr { key: "w_scale" value { f: %f } }' % scale_value,
'attr { key: "num_classes" value { i: %d } }' % config['num_classes']
]
implements_signature = ' '.join(implements_signature)
return implements_signature
def _generate_detections_tflite(raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
config: Dict[str, Any]) -> Sequence[Any]:
"""Generate detections for conversion to TFLite.
Mathematically same as class-agnostic NMS, except that the last portion of
the TF graph constitutes a dummy `tf.function` that contains an annotation
for conversion to TFLite's custom NMS op. Using this custom op allows
features like post-training quantization & accelerator support.
NOTE: This function does NOT return a valid output, and is only meant to
generate a SavedModel for TFLite conversion via MLIR. The generated SavedModel
should not be used for inference.
For TFLite op details, see tensorflow/lite/kernels/detection_postprocess.cc
Args:
raw_boxes: A dictionary of tensors for raw boxes. Key is level of features
and value is a tensor denoting a level of boxes with shape [1, H, W, 4 *
num_anchors].
raw_scores: A dictionary of tensors for classes. Key is level of features
and value is a tensor denoting a level of logits with shape [1, H, W,
num_class * num_anchors].
anchor_boxes: A dictionary of tensors for anchor boxes. Key is level of
features and value is a tensor denoting a level of anchors with shape
[num_anchors, 4].
config: A dictionary of configs defining parameters for TFLite NMS op.
Returns:
A (dummy) tuple of (boxes, scores, classess, num_detections).
Raises:
ValueError: If the last dimension of predicted boxes is not divisible by 4,
or the last dimension of predicted scores is not divisible by number of
anchors per location.
"""
scores, boxes, anchors = [], [], []
levels = list(raw_scores.keys())
min_level = int(min(levels))
max_level = int(max(levels))
batch_size = tf.shape(raw_scores[str(min_level)])[0]
num_anchors_per_locations_times_4 = raw_boxes[str(
min_level)].get_shape().as_list()[-1]
if num_anchors_per_locations_times_4 % 4 != 0:
raise ValueError(
'The last dimension of predicted boxes should be divisible by 4.')
num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
if num_anchors_per_locations_times_4 % 4 != 0:
raise ValueError(
f'The last dimension of predicted scores should be divisible by {num_anchors_per_locations}.'
)
num_classes = raw_scores[str(
min_level)].get_shape().as_list()[-1] // num_anchors_per_locations
config.update({'num_classes': num_classes})
for i in range(min_level, max_level + 1):
scores.append(
tf.sigmoid(
tf.reshape(raw_scores[str(i)], [batch_size, -1, num_classes])))
boxes.append(tf.reshape(raw_boxes[str(i)], [batch_size, -1, 4]))
anchors.append(tf.reshape(anchor_boxes[str(i)], [-1, 4]))
scores = tf.concat(scores, 1)
boxes = tf.concat(boxes, 1)
anchors = tf.concat(anchors, 0)
ycenter_a = (anchors[..., 0] + anchors[..., 2]) / 2
xcenter_a = (anchors[..., 1] + anchors[..., 3]) / 2
ha = anchors[..., 2] - anchors[..., 0]
wa = anchors[..., 3] - anchors[..., 1]
anchors = tf.stack([ycenter_a, xcenter_a, ha, wa], axis=-1)
# There is no TF equivalent for TFLite's custom post-processing op.
# So we add an 'empty' composite function here, that is legalized to the
# custom op with MLIR.
# For details, see: tensorflow/compiler/mlir/lite/utils/nms_utils.cc
@tf.function(
experimental_implements=_generate_detections_tflite_implements_signature(
config))
# pylint: disable=g-unused-argument,unused-argument
def dummy_post_processing(input_boxes, input_scores, input_anchors):
boxes = tf.constant(0.0, dtype=tf.float32, name='boxes')
scores = tf.constant(0.0, dtype=tf.float32, name='scores')
classes = tf.constant(0.0, dtype=tf.float32, name='classes')
num_detections = tf.constant(0.0, dtype=tf.float32, name='num_detections')
return boxes, classes, scores, num_detections
return dummy_post_processing(boxes, scores, anchors)[::-1]
@tf.keras.utils.register_keras_serializable(package='Vision')
class DetectionGenerator(tf.keras.layers.Layer):
"""Generates the final detected boxes with scores and classes."""
def __init__(self,
apply_nms: bool = True,
pre_nms_top_k: int = 5000,
pre_nms_score_threshold: float = 0.05,
nms_iou_threshold: float = 0.5,
max_num_detections: int = 100,
nms_version: str = 'v2',
use_cpu_nms: bool = False,
soft_nms_sigma: Optional[float] = None,
**kwargs):
"""Initializes a detection generator.
Args:
apply_nms: A `bool` of whether or not apply non maximum suppression.
If False, the decoded boxes and their scores are returned.
pre_nms_top_k: An `int` of the number of top scores proposals to be kept
before applying NMS.
pre_nms_score_threshold: A `float` of the score threshold to apply before
applying NMS. Proposals whose scores are below this threshold are
thrown away.
nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
max_num_detections: An `int` of the final number of total detections to
generate.
nms_version: A string of `batched`, `v1` or `v2` specifies NMS version.
use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
When soft_nms_sigma=0.0, we fall back to standard NMS.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'apply_nms': apply_nms,
'pre_nms_top_k': pre_nms_top_k,
'pre_nms_score_threshold': pre_nms_score_threshold,
'nms_iou_threshold': nms_iou_threshold,
'max_num_detections': max_num_detections,
'nms_version': nms_version,
'use_cpu_nms': use_cpu_nms,
'soft_nms_sigma': soft_nms_sigma,
}
super(DetectionGenerator, self).__init__(**kwargs)
def __call__(self,
raw_boxes: tf.Tensor,
raw_scores: tf.Tensor,
anchor_boxes: tf.Tensor,
image_shape: tf.Tensor,
regression_weights: Optional[List[float]] = None,
bbox_per_class: bool = True):
"""Generates final detections.
Args:
raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
representing the class-specific box coordinates relative to anchors.
raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
representing the class logits before applying score activiation.
anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
the corresponding anchor boxes w.r.t `box_outputs`.
image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
height and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
regression_weights: A list of four float numbers to scale coordinates.
bbox_per_class: A `bool`. If True, perform per-class box regression.
Returns:
If `apply_nms` = True, the return is a dictionary with keys:
`detection_boxes`: A `float` tf.Tensor of shape
[batch, max_num_detections, 4] representing top detected boxes in
[y1, x1, y2, x2].
`detection_scores`: A `float` `tf.Tensor` of shape
[batch, max_num_detections] representing sorted confidence scores for
detected boxes. The values are between [0, 1].
`detection_classes`: An `int` tf.Tensor of shape
[batch, max_num_detections] representing classes for detected boxes.
`num_detections`: An `int` tf.Tensor of shape [batch] only the first
`num_detections` boxes are valid detections
If `apply_nms` = False, the return is a dictionary with keys:
`decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
representing all the decoded boxes.
`decoded_box_scores`: A `float` tf.Tensor of shape
[batch, num_raw_boxes] representing socres of all the decoded boxes.
"""
box_scores = tf.nn.softmax(raw_scores, axis=-1)
# Removes the background class.
box_scores_shape = tf.shape(box_scores)
box_scores_shape_list = box_scores.get_shape().as_list()
batch_size = box_scores_shape[0]
num_locations = box_scores_shape_list[1]
num_classes = box_scores_shape_list[-1]
box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])
if bbox_per_class:
num_detections = num_locations * (num_classes - 1)
raw_boxes = tf.reshape(raw_boxes,
[batch_size, num_locations, num_classes, 4])
raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
# Box decoding.
decoded_boxes = box_ops.decode_boxes(
raw_boxes, anchor_boxes, weights=regression_weights)
# Box clipping
decoded_boxes = box_ops.clip_boxes(
decoded_boxes, tf.expand_dims(image_shape, axis=1))
if bbox_per_class:
decoded_boxes = tf.reshape(
decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
else:
decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)
if not self._config_dict['apply_nms']:
return {
'decoded_boxes': decoded_boxes,
'decoded_box_scores': box_scores,
}
# Optionally force the NMS be run on CPU.
if self._config_dict['use_cpu_nms']:
nms_context = tf.device('cpu:0')
else:
nms_context = contextlib.nullcontext()
with nms_context:
if self._config_dict['nms_version'] == 'batched':
(nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
_generate_detections_batched(
decoded_boxes, box_scores,
self._config_dict['pre_nms_score_threshold'],
self._config_dict['nms_iou_threshold'],
self._config_dict['max_num_detections']))
elif self._config_dict['nms_version'] == 'v1':
(nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = (
_generate_detections_v1(
decoded_boxes,
box_scores,
pre_nms_top_k=self._config_dict['pre_nms_top_k'],
pre_nms_score_threshold=self
._config_dict['pre_nms_score_threshold'],
nms_iou_threshold=self._config_dict['nms_iou_threshold'],
max_num_detections=self._config_dict['max_num_detections'],
soft_nms_sigma=self._config_dict['soft_nms_sigma']))
elif self._config_dict['nms_version'] == 'v2':
(nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
_generate_detections_v2(
decoded_boxes,
box_scores,
pre_nms_top_k=self._config_dict['pre_nms_top_k'],
pre_nms_score_threshold=self
._config_dict['pre_nms_score_threshold'],
nms_iou_threshold=self._config_dict['nms_iou_threshold'],
max_num_detections=self._config_dict['max_num_detections']))
else:
raise ValueError('NMS version {} not supported.'.format(
self._config_dict['nms_version']))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return {
'num_detections': valid_detections,
'detection_boxes': nmsed_boxes,
'detection_classes': nmsed_classes,
'detection_scores': nmsed_scores,
}
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelDetectionGenerator(tf.keras.layers.Layer):
"""Generates detected boxes with scores and classes for one-stage detector."""
def __init__(self,
apply_nms: bool = True,
pre_nms_top_k: int = 5000,
pre_nms_score_threshold: float = 0.05,
nms_iou_threshold: float = 0.5,
max_num_detections: int = 100,
nms_version: str = 'v1',
use_cpu_nms: bool = False,
soft_nms_sigma: Optional[float] = None,
tflite_post_processing_config: Optional[Dict[str, Any]] = None,
**kwargs):
"""Initializes a multi-level detection generator.
Args:
apply_nms: A `bool` of whether or not apply non maximum suppression. If
False, the decoded boxes and their scores are returned.
pre_nms_top_k: An `int` of the number of top scores proposals to be kept
before applying NMS.
pre_nms_score_threshold: A `float` of the score threshold to apply before
applying NMS. Proposals whose scores are below this threshold are thrown
away.
nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
max_num_detections: An `int` of the final number of total detections to
generate.
nms_version: A string of `batched`, `v1` or `v2` specifies NMS version
use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU.
soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
When soft_nms_sigma=0.0, we fall back to standard NMS.
tflite_post_processing_config: An optional dictionary containing
post-processing parameters used for TFLite custom NMS op.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'apply_nms': apply_nms,
'pre_nms_top_k': pre_nms_top_k,
'pre_nms_score_threshold': pre_nms_score_threshold,
'nms_iou_threshold': nms_iou_threshold,
'max_num_detections': max_num_detections,
'nms_version': nms_version,
'use_cpu_nms': use_cpu_nms,
'soft_nms_sigma': soft_nms_sigma
}
if tflite_post_processing_config is not None:
self._config_dict.update(
{'tflite_post_processing_config': tflite_post_processing_config})
super(MultilevelDetectionGenerator, self).__init__(**kwargs)
def _decode_multilevel_outputs(
self,
raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
"""Collects dict of multilevel boxes, scores, attributes into lists."""
boxes = []
scores = []
if raw_attributes:
attributes = {att_name: [] for att_name in raw_attributes.keys()}
else:
attributes = {}
levels = list(raw_boxes.keys())
min_level = int(min(levels))
max_level = int(max(levels))
for i in range(min_level, max_level + 1):
raw_boxes_i = raw_boxes[str(i)]
raw_scores_i = raw_scores[str(i)]
batch_size = tf.shape(raw_boxes_i)[0]
(_, feature_h_i, feature_w_i,
num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list()
num_locations = feature_h_i * feature_w_i
num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
num_classes = raw_scores_i.get_shape().as_list(
)[-1] // num_anchors_per_locations
# Applies score transformation and remove the implicit background class.
scores_i = tf.sigmoid(
tf.reshape(raw_scores_i, [
batch_size, num_locations * num_anchors_per_locations, num_classes
]))
scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])
# Box decoding.
# The anchor boxes are shared for all data in a batch.
# One stage detector only supports class agnostic box regression.
anchor_boxes_i = tf.reshape(
anchor_boxes[str(i)],
[batch_size, num_locations * num_anchors_per_locations, 4])
raw_boxes_i = tf.reshape(
raw_boxes_i,
[batch_size, num_locations * num_anchors_per_locations, 4])
boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)
# Box clipping.
boxes_i = box_ops.clip_boxes(
boxes_i, tf.expand_dims(image_shape, axis=1))
boxes.append(boxes_i)
scores.append(scores_i)
if raw_attributes:
for att_name, raw_att in raw_attributes.items():
attribute_size = raw_att[str(
i)].get_shape().as_list()[-1] // num_anchors_per_locations
att_i = tf.reshape(raw_att[str(i)], [
batch_size, num_locations * num_anchors_per_locations,
attribute_size
])
attributes[att_name].append(att_i)
boxes = tf.concat(boxes, axis=1)
boxes = tf.expand_dims(boxes, axis=2)
scores = tf.concat(scores, axis=1)
if raw_attributes:
for att_name in raw_attributes.keys():
attributes[att_name] = tf.concat(attributes[att_name], axis=1)
attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)
return boxes, scores, attributes
def __call__(self,
raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
"""Generates final detections.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tenors of shape `[batch, feature_h, feature_w,
num_anchors * 4]`.
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape `[batch, feature_h, feature_w,
num_anchors]`.
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor tenors of shape `[batch_size, K, 4]` representing
the corresponding anchor boxes w.r.t `box_outputs`.
image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
height and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
raw_attributes: If not None, a `dict` of (attribute_name,
attribute_prediction) pairs. `attribute_prediction` is a dict that
contains keys representing FPN levels and values representing tenors of
shape `[batch, feature_h, feature_w, num_anchors * attribute_size]`.
Returns:
If `apply_nms` = True, the return is a dictionary with keys:
`detection_boxes`: A `float` tf.Tensor of shape
[batch, max_num_detections, 4] representing top detected boxes in
[y1, x1, y2, x2].
`detection_scores`: A `float` tf.Tensor of shape
[batch, max_num_detections] representing sorted confidence scores for
detected boxes. The values are between [0, 1].
`detection_classes`: An `int` tf.Tensor of shape
[batch, max_num_detections] representing classes for detected boxes.
`num_detections`: An `int` tf.Tensor of shape [batch] only the first
`num_detections` boxes are valid detections
`detection_attributes`: A dict. Values of the dict is a `float`
tf.Tensor of shape [batch, max_num_detections, attribute_size]
representing attribute predictions for detected boxes.
If `apply_nms` = False, the return is a dictionary with keys:
`decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
representing all the decoded boxes.
`decoded_box_scores`: A `float` tf.Tensor of shape
[batch, num_raw_boxes] representing socres of all the decoded boxes.
`decoded_box_attributes`: A dict. Values in the dict is a
`float` tf.Tensor of shape [batch, num_raw_boxes, attribute_size]
representing attribute predictions of all the decoded boxes.
"""
if self._config_dict['apply_nms'] and self._config_dict[
'nms_version'] == 'tflite':
boxes, classes, scores, num_detections = _generate_detections_tflite(
raw_boxes, raw_scores, anchor_boxes,
self.get_config()['tflite_post_processing_config'])
return {
'num_detections': num_detections,
'detection_boxes': boxes,
'detection_classes': classes,
'detection_scores': scores
}
boxes, scores, attributes = self._decode_multilevel_outputs(
raw_boxes, raw_scores, anchor_boxes, image_shape, raw_attributes)
if not self._config_dict['apply_nms']:
return {
'decoded_boxes': boxes,
'decoded_box_scores': scores,
'decoded_box_attributes': attributes,
}
# Optionally force the NMS to run on CPU.
if self._config_dict['use_cpu_nms']:
nms_context = tf.device('cpu:0')
else:
nms_context = contextlib.nullcontext()
with nms_context:
if raw_attributes and (self._config_dict['nms_version'] != 'v1'):
raise ValueError(
'Attribute learning is only supported for NMSv1 but NMS {} is used.'
.format(self._config_dict['nms_version']))
if self._config_dict['nms_version'] == 'batched':
(nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
_generate_detections_batched(
boxes, scores, self._config_dict['pre_nms_score_threshold'],
self._config_dict['nms_iou_threshold'],
self._config_dict['max_num_detections']))
# Set `nmsed_attributes` to None for batched NMS.
nmsed_attributes = {}
elif self._config_dict['nms_version'] == 'v1':
(nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections,
nmsed_attributes) = (
_generate_detections_v1(
boxes,
scores,
attributes=attributes if raw_attributes else None,
pre_nms_top_k=self._config_dict['pre_nms_top_k'],
pre_nms_score_threshold=self
._config_dict['pre_nms_score_threshold'],
nms_iou_threshold=self._config_dict['nms_iou_threshold'],
max_num_detections=self._config_dict['max_num_detections'],
soft_nms_sigma=self._config_dict['soft_nms_sigma']))
elif self._config_dict['nms_version'] == 'v2':
(nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
_generate_detections_v2(
boxes,
scores,
pre_nms_top_k=self._config_dict['pre_nms_top_k'],
pre_nms_score_threshold=self
._config_dict['pre_nms_score_threshold'],
nms_iou_threshold=self._config_dict['nms_iou_threshold'],
max_num_detections=self._config_dict['max_num_detections']))
# Set `nmsed_attributes` to None for v2.
nmsed_attributes = {}
else:
raise ValueError('NMS version {} not supported.'.format(
self._config_dict['nms_version']))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return {
'num_detections': valid_detections,
'detection_boxes': nmsed_boxes,
'detection_classes': nmsed_classes,
'detection_scores': nmsed_scores,
'detection_attributes': nmsed_attributes,
}
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for detection_generator.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling.layers import detection_generator
from official.vision.ops import anchor
class SelectTopKScoresTest(tf.test.TestCase):
def testSelectTopKScores(self):
pre_nms_num_boxes = 2
scores_data = [[[0.2, 0.2], [0.1, 0.9], [0.5, 0.1], [0.3, 0.5]]]
scores_in = tf.constant(scores_data, dtype=tf.float32)
top_k_scores, top_k_indices = detection_generator._select_top_k_scores(
scores_in, pre_nms_num_detections=pre_nms_num_boxes)
expected_top_k_scores = np.array([[[0.5, 0.9], [0.3, 0.5]]],
dtype=np.float32)
expected_top_k_indices = [[[2, 1], [3, 3]]]
self.assertAllEqual(top_k_scores.numpy(), expected_top_k_scores)
self.assertAllEqual(top_k_indices.numpy(), expected_top_k_indices)
class DetectionGeneratorTest(
parameterized.TestCase, tf.test.TestCase):
@parameterized.product(
nms_version=['batched', 'v1', 'v2'],
use_cpu_nms=[True, False],
soft_nms_sigma=[None, 0.1])
def testDetectionsOutputShape(self, nms_version, use_cpu_nms, soft_nms_sigma):
max_num_detections = 10
num_classes = 4
pre_nms_top_k = 5000
pre_nms_score_threshold = 0.01
batch_size = 1
kwargs = {
'apply_nms': True,
'pre_nms_top_k': pre_nms_top_k,
'pre_nms_score_threshold': pre_nms_score_threshold,
'nms_iou_threshold': 0.5,
'max_num_detections': max_num_detections,
'nms_version': nms_version,
'use_cpu_nms': use_cpu_nms,
'soft_nms_sigma': soft_nms_sigma,
}
generator = detection_generator.DetectionGenerator(**kwargs)
cls_outputs_all = (
np.random.rand(84, num_classes) - 0.5) * 3 # random 84x3 outputs.
box_outputs_all = np.random.rand(84, 4 * num_classes) # random 84 boxes.
anchor_boxes_all = np.random.rand(84, 4) # random 84 boxes.
class_outputs = tf.reshape(
tf.convert_to_tensor(cls_outputs_all, dtype=tf.float32),
[1, 84, num_classes])
box_outputs = tf.reshape(
tf.convert_to_tensor(box_outputs_all, dtype=tf.float32),
[1, 84, 4 * num_classes])
anchor_boxes = tf.reshape(
tf.convert_to_tensor(anchor_boxes_all, dtype=tf.float32),
[1, 84, 4])
image_info = tf.constant(
[[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
dtype=tf.float32)
results = generator(
box_outputs, class_outputs, anchor_boxes, image_info[:, 1, :])
boxes = results['detection_boxes']
classes = results['detection_classes']
scores = results['detection_scores']
valid_detections = results['num_detections']
self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,))
self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,))
self.assertEqual(valid_detections.numpy().shape, (batch_size,))
def test_serialize_deserialize(self):
kwargs = {
'apply_nms': True,
'pre_nms_top_k': 1000,
'pre_nms_score_threshold': 0.1,
'nms_iou_threshold': 0.5,
'max_num_detections': 10,
'nms_version': 'v2',
'use_cpu_nms': False,
'soft_nms_sigma': None,
}
generator = detection_generator.DetectionGenerator(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(generator.get_config(), expected_config)
new_generator = (
detection_generator.DetectionGenerator.from_config(
generator.get_config()))
self.assertAllEqual(generator.get_config(), new_generator.get_config())
class MultilevelDetectionGeneratorTest(
parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
('batched', False, True, None, None),
('batched', False, False, None, None),
('v2', False, True, None, None),
('v2', False, False, None, None),
('v1', True, True, 0.0, None),
('v1', True, False, 0.1, None),
('v1', True, False, None, None),
('tflite', False, False, None, True),
('tflite', False, False, None, False),
)
def testDetectionsOutputShape(self, nms_version, has_att_heads, use_cpu_nms,
soft_nms_sigma, use_regular_nms):
min_level = 4
max_level = 6
num_scales = 2
max_num_detections = 10
aspect_ratios = [1.0, 2.0]
anchor_scale = 2.0
output_size = [64, 64]
num_classes = 4
pre_nms_top_k = 5000
pre_nms_score_threshold = 0.01
batch_size = 1
tflite_post_processing_config = {
'max_detections': max_num_detections,
'max_classes_per_detection': 1,
'use_regular_nms': use_regular_nms,
'nms_score_threshold': 0.01,
'nms_iou_threshold': 0.5
}
kwargs = {
'apply_nms': True,
'pre_nms_top_k': pre_nms_top_k,
'pre_nms_score_threshold': pre_nms_score_threshold,
'nms_iou_threshold': 0.5,
'max_num_detections': max_num_detections,
'nms_version': nms_version,
'use_cpu_nms': use_cpu_nms,
'soft_nms_sigma': soft_nms_sigma,
'tflite_post_processing_config': tflite_post_processing_config
}
input_anchor = anchor.build_anchor_generator(min_level, max_level,
num_scales, aspect_ratios,
anchor_scale)
anchor_boxes = input_anchor(output_size)
cls_outputs_all = (
np.random.rand(84, num_classes) - 0.5) * 3 # random 84x3 outputs.
box_outputs_all = np.random.rand(84, 4) # random 84 boxes.
class_outputs = {
'4':
tf.reshape(
tf.convert_to_tensor(cls_outputs_all[0:64], dtype=tf.float32),
[1, 8, 8, num_classes]),
'5':
tf.reshape(
tf.convert_to_tensor(cls_outputs_all[64:80], dtype=tf.float32),
[1, 4, 4, num_classes]),
'6':
tf.reshape(
tf.convert_to_tensor(cls_outputs_all[80:84], dtype=tf.float32),
[1, 2, 2, num_classes]),
}
box_outputs = {
'4': tf.reshape(tf.convert_to_tensor(
box_outputs_all[0:64], dtype=tf.float32), [1, 8, 8, 4]),
'5': tf.reshape(tf.convert_to_tensor(
box_outputs_all[64:80], dtype=tf.float32), [1, 4, 4, 4]),
'6': tf.reshape(tf.convert_to_tensor(
box_outputs_all[80:84], dtype=tf.float32), [1, 2, 2, 4]),
}
if has_att_heads:
att_outputs_all = np.random.rand(84, 1) # random attributes.
att_outputs = {
'depth': {
'4':
tf.reshape(
tf.convert_to_tensor(
att_outputs_all[0:64], dtype=tf.float32),
[1, 8, 8, 1]),
'5':
tf.reshape(
tf.convert_to_tensor(
att_outputs_all[64:80], dtype=tf.float32),
[1, 4, 4, 1]),
'6':
tf.reshape(
tf.convert_to_tensor(
att_outputs_all[80:84], dtype=tf.float32),
[1, 2, 2, 1]),
}
}
else:
att_outputs = None
image_info = tf.constant([[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
dtype=tf.float32)
generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
results = generator(box_outputs, class_outputs, anchor_boxes,
image_info[:, 1, :], att_outputs)
boxes = results['detection_boxes']
classes = results['detection_classes']
scores = results['detection_scores']
valid_detections = results['num_detections']
if nms_version == 'tflite':
# When nms_version is `tflite`, all output tensors are empty as the actual
# post-processing happens in the TFLite model.
self.assertEqual(boxes.numpy().shape, ())
self.assertEqual(scores.numpy().shape, ())
self.assertEqual(classes.numpy().shape, ())
self.assertEqual(valid_detections.numpy().shape, ())
else:
self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
self.assertEqual(scores.numpy().shape, (
batch_size,
max_num_detections,
))
self.assertEqual(classes.numpy().shape, (
batch_size,
max_num_detections,
))
self.assertEqual(valid_detections.numpy().shape, (batch_size,))
if has_att_heads:
for att in results['detection_attributes'].values():
self.assertEqual(att.numpy().shape,
(batch_size, max_num_detections, 1))
def test_serialize_deserialize(self):
tflite_post_processing_config = {
'max_detections': 100,
'max_classes_per_detection': 1,
'use_regular_nms': True,
'nms_score_threshold': 0.01,
'nms_iou_threshold': 0.5
}
kwargs = {
'apply_nms': True,
'pre_nms_top_k': 1000,
'pre_nms_score_threshold': 0.1,
'nms_iou_threshold': 0.5,
'max_num_detections': 10,
'nms_version': 'v2',
'use_cpu_nms': False,
'soft_nms_sigma': None,
'tflite_post_processing_config': tflite_post_processing_config
}
generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(generator.get_config(), expected_config)
new_generator = (
detection_generator.MultilevelDetectionGenerator.from_config(
generator.get_config()))
self.assertAllEqual(generator.get_config(), new_generator.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of mask sampler."""
# Import libraries
import tensorflow as tf
from official.vision.ops import spatial_transform_ops
def _sample_and_crop_foreground_masks(candidate_rois: tf.Tensor,
candidate_gt_boxes: tf.Tensor,
candidate_gt_classes: tf.Tensor,
candidate_gt_indices: tf.Tensor,
gt_masks: tf.Tensor,
num_sampled_masks: int = 128,
mask_target_size: int = 28):
"""Samples and creates cropped foreground masks for training.
Args:
candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is the
number of candidate RoIs to be considered for mask sampling. It includes
both positive and negative RoIs. The `num_mask_samples_per_image` positive
RoIs will be sampled to create mask training targets.
candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
the corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing the
corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i] and
gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is
the superset of candidate_gt_boxes.
gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
mask_width] containing all the groundtruth masks which sample masks are
drawn from.
num_sampled_masks: An `int` that specifies the number of masks to sample.
mask_target_size: An `int` that specifies the final cropped mask size after
sampling. The output masks are resized w.r.t the sampled RoIs.
Returns:
foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
RoI that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
classes corresponding to the sampled foreground masks.
cropoped_foreground_masks: A `tf.Tensor` of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the cropped
foreground masks used for training.
"""
_, fg_instance_indices = tf.nn.top_k(
tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32),
k=num_sampled_masks)
fg_instance_indices_shape = tf.shape(fg_instance_indices)
batch_indices = (
tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))
gather_nd_instance_indices = tf.stack(
[batch_indices, fg_instance_indices], axis=-1)
foreground_rois = tf.gather_nd(
candidate_rois, gather_nd_instance_indices)
foreground_boxes = tf.gather_nd(
candidate_gt_boxes, gather_nd_instance_indices)
foreground_classes = tf.gather_nd(
candidate_gt_classes, gather_nd_instance_indices)
foreground_gt_indices = tf.gather_nd(
candidate_gt_indices, gather_nd_instance_indices)
foreground_gt_indices = tf.where(
tf.equal(foreground_gt_indices, -1),
tf.zeros_like(foreground_gt_indices),
foreground_gt_indices)
foreground_gt_indices_shape = tf.shape(foreground_gt_indices)
batch_indices = (
tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) *
tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32))
gather_nd_gt_indices = tf.stack(
[batch_indices, foreground_gt_indices], axis=-1)
foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)
cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
foreground_masks, foreground_boxes, foreground_rois, mask_target_size,
sample_offset=0.5)
return foreground_rois, foreground_classes, cropped_foreground_masks
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskSampler(tf.keras.layers.Layer):
"""Samples and creates mask training targets."""
def __init__(self, mask_target_size: int, num_sampled_masks: int, **kwargs):
self._config_dict = {
'mask_target_size': mask_target_size,
'num_sampled_masks': num_sampled_masks,
}
super(MaskSampler, self).__init__(**kwargs)
def call(self, candidate_rois: tf.Tensor, candidate_gt_boxes: tf.Tensor,
candidate_gt_classes: tf.Tensor, candidate_gt_indices: tf.Tensor,
gt_masks: tf.Tensor):
"""Samples and creates mask targets for training.
Args:
candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is
the number of candidate RoIs to be considered for mask sampling. It
includes both positive and negative RoIs. The
`num_mask_samples_per_image` positive RoIs will be sampled to create
mask training targets.
candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing
the corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing
the corresponding groundtruth classes to the `candidate_rois`. 0 in the
tensor corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >=
N, is the superset of candidate_gt_boxes.
gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height,
mask_width] containing all the groundtruth masks which sample masks are
drawn from. after sampling. The output masks are resized w.r.t the
sampled RoIs.
Returns:
foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the
RoI that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the
classes corresponding to the sampled foreground masks.
cropoped_foreground_masks: A `tf.Tensor` of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the
cropped foreground masks used for training.
"""
foreground_rois, foreground_classes, cropped_foreground_masks = (
_sample_and_crop_foreground_masks(
candidate_rois,
candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices,
gt_masks,
self._config_dict['num_sampled_masks'],
self._config_dict['mask_target_size']))
return foreground_rois, foreground_classes, cropped_foreground_masks
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Text
# Import libraries
from absl import logging
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.modeling.layers import nn_layers
def _pad_strides(strides: int, axis: int) -> Tuple[int, int, int, int]:
"""Converts int to len 4 strides (`tf.nn.avg_pool` uses length 4)."""
if axis == 1:
return (1, 1, strides, strides)
else:
return (1, strides, strides, 1)
def _maybe_downsample(x: tf.Tensor, out_filter: int, strides: int,
axis: int) -> tf.Tensor:
"""Downsamples feature map and 0-pads tensor if in_filter != out_filter."""
data_format = 'NCHW' if axis == 1 else 'NHWC'
strides = _pad_strides(strides, axis=axis)
x = tf.nn.avg_pool(x, strides, strides, 'VALID', data_format=data_format)
in_filter = x.shape[axis]
if in_filter < out_filter:
# Pad on channel dimension with 0s: half on top half on bottom.
pad_size = [(out_filter - in_filter) // 2, (out_filter - in_filter) // 2]
if axis == 1:
x = tf.pad(x, [[0, 0], pad_size, [0, 0], [0, 0]])
else:
x = tf.pad(x, [[0, 0], [0, 0], [0, 0], pad_size])
return x + 0.
@tf.keras.utils.register_keras_serializable(package='Vision')
class ResidualBlock(tf.keras.layers.Layer):
"""A residual block."""
def __init__(self,
filters,
strides,
use_projection=False,
se_ratio=None,
resnetd_shortcut=False,
stochastic_depth_drop_rate=None,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_explicit_padding: bool = False,
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
bn_trainable=True,
**kwargs):
"""Initializes a residual block with BN after convolutions.
Args:
filters: An `int` number of filters for the first two convolutions. Note
that the third and final convolution will use 4 times as many filters.
strides: An `int` block stride. If greater than 1, this block will
ultimately downsample the input.
use_projection: A `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
resnetd_shortcut: A `bool` if True, apply the resnetd style modification
to the shortcut connection. Not implemented in residual blocks.
stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
the stochastic depth layer.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
Default to None.
activation: A `str` name of the activation function.
use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding
were used.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
bn_trainable: A `bool` that indicates whether batch norm layers should be
trainable. Default to True.
**kwargs: Additional keyword arguments to be passed.
"""
super(ResidualBlock, self).__init__(**kwargs)
self._filters = filters
self._strides = strides
self._use_projection = use_projection
self._se_ratio = se_ratio
self._resnetd_shortcut = resnetd_shortcut
self._use_explicit_padding = use_explicit_padding
self._use_sync_bn = use_sync_bn
self._activation = activation
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
self._bn_trainable = bn_trainable
def build(self, input_shape):
if self._use_projection:
self._shortcut = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=1,
strides=self._strides,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
trainable=self._bn_trainable)
conv1_padding = 'same'
# explicit padding here is added for centernet
if self._use_explicit_padding:
self._pad = tf.keras.layers.ZeroPadding2D(padding=(1, 1))
conv1_padding = 'valid'
self._conv1 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=self._strides,
padding=conv1_padding,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
trainable=self._bn_trainable)
self._conv2 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
trainable=self._bn_trainable)
if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
self._squeeze_excitation = nn_layers.SqueezeExcitation(
in_filters=self._filters,
out_filters=self._filters,
se_ratio=self._se_ratio,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
else:
self._squeeze_excitation = None
if self._stochastic_depth_drop_rate:
self._stochastic_depth = nn_layers.StochasticDepth(
self._stochastic_depth_drop_rate)
else:
self._stochastic_depth = None
super(ResidualBlock, self).build(input_shape)
def get_config(self):
config = {
'filters': self._filters,
'strides': self._strides,
'use_projection': self._use_projection,
'se_ratio': self._se_ratio,
'resnetd_shortcut': self._resnetd_shortcut,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_explicit_padding': self._use_explicit_padding,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon,
'bn_trainable': self._bn_trainable
}
base_config = super(ResidualBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, training=None):
shortcut = inputs
if self._use_projection:
shortcut = self._shortcut(shortcut)
shortcut = self._norm0(shortcut)
if self._use_explicit_padding:
inputs = self._pad(inputs)
x = self._conv1(inputs)
x = self._norm1(x)
x = self._activation_fn(x)
x = self._conv2(x)
x = self._norm2(x)
if self._squeeze_excitation:
x = self._squeeze_excitation(x)
if self._stochastic_depth:
x = self._stochastic_depth(x, training=training)
return self._activation_fn(x + shortcut)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckBlock(tf.keras.layers.Layer):
"""A standard bottleneck block."""
def __init__(self,
filters,
strides,
dilation_rate=1,
use_projection=False,
se_ratio=None,
resnetd_shortcut=False,
stochastic_depth_drop_rate=None,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
bn_trainable=True,
**kwargs):
"""Initializes a standard bottleneck block with BN after convolutions.
Args:
filters: An `int` number of filters for the first two convolutions. Note
that the third and final convolution will use 4 times as many filters.
strides: An `int` block stride. If greater than 1, this block will
ultimately downsample the input.
dilation_rate: An `int` dilation_rate of convolutions. Default to 1.
use_projection: A `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
resnetd_shortcut: A `bool`. If True, apply the resnetd style modification
to the shortcut connection.
stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
the stochastic depth layer.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
Default to None.
activation: A `str` name of the activation function.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
bn_trainable: A `bool` that indicates whether batch norm layers should be
trainable. Default to True.
**kwargs: Additional keyword arguments to be passed.
"""
super(BottleneckBlock, self).__init__(**kwargs)
self._filters = filters
self._strides = strides
self._dilation_rate = dilation_rate
self._use_projection = use_projection
self._se_ratio = se_ratio
self._resnetd_shortcut = resnetd_shortcut
self._use_sync_bn = use_sync_bn
self._activation = activation
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._bn_trainable = bn_trainable
def build(self, input_shape):
if self._use_projection:
if self._resnetd_shortcut:
self._shortcut0 = tf.keras.layers.AveragePooling2D(
pool_size=2, strides=self._strides, padding='same')
self._shortcut1 = tf.keras.layers.Conv2D(
filters=self._filters * 4,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
else:
self._shortcut = tf.keras.layers.Conv2D(
filters=self._filters * 4,
kernel_size=1,
strides=self._strides,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
trainable=self._bn_trainable)
self._conv1 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
trainable=self._bn_trainable)
self._activation1 = tf_utils.get_activation(
self._activation, use_keras_layer=True)
self._conv2 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=self._strides,
dilation_rate=self._dilation_rate,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
trainable=self._bn_trainable)
self._activation2 = tf_utils.get_activation(
self._activation, use_keras_layer=True)
self._conv3 = tf.keras.layers.Conv2D(
filters=self._filters * 4,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm3 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon,
trainable=self._bn_trainable)
self._activation3 = tf_utils.get_activation(
self._activation, use_keras_layer=True)
if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
self._squeeze_excitation = nn_layers.SqueezeExcitation(
in_filters=self._filters * 4,
out_filters=self._filters * 4,
se_ratio=self._se_ratio,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
else:
self._squeeze_excitation = None
if self._stochastic_depth_drop_rate:
self._stochastic_depth = nn_layers.StochasticDepth(
self._stochastic_depth_drop_rate)
else:
self._stochastic_depth = None
self._add = tf.keras.layers.Add()
super(BottleneckBlock, self).build(input_shape)
def get_config(self):
config = {
'filters': self._filters,
'strides': self._strides,
'dilation_rate': self._dilation_rate,
'use_projection': self._use_projection,
'se_ratio': self._se_ratio,
'resnetd_shortcut': self._resnetd_shortcut,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon,
'bn_trainable': self._bn_trainable
}
base_config = super(BottleneckBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, training=None):
shortcut = inputs
if self._use_projection:
if self._resnetd_shortcut:
shortcut = self._shortcut0(shortcut)
shortcut = self._shortcut1(shortcut)
else:
shortcut = self._shortcut(shortcut)
shortcut = self._norm0(shortcut)
x = self._conv1(inputs)
x = self._norm1(x)
x = self._activation1(x)
x = self._conv2(x)
x = self._norm2(x)
x = self._activation2(x)
x = self._conv3(x)
x = self._norm3(x)
if self._squeeze_excitation:
x = self._squeeze_excitation(x)
if self._stochastic_depth:
x = self._stochastic_depth(x, training=training)
x = self._add([x, shortcut])
return self._activation3(x)
@tf.keras.utils.register_keras_serializable(package='Vision')
class InvertedBottleneckBlock(tf.keras.layers.Layer):
"""An inverted bottleneck block."""
def __init__(self,
in_filters,
out_filters,
expand_ratio,
strides,
kernel_size=3,
se_ratio=None,
stochastic_depth_drop_rate=None,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
se_inner_activation='relu',
se_gating_activation='sigmoid',
se_round_down_protect=True,
expand_se_in_filters=False,
depthwise_activation=None,
use_sync_bn=False,
dilation_rate=1,
divisible_by=1,
regularize_depthwise=False,
use_depthwise=True,
use_residual=True,
norm_momentum=0.99,
norm_epsilon=0.001,
output_intermediate_endpoints=False,
**kwargs):
"""Initializes an inverted bottleneck block with BN after convolutions.
Args:
in_filters: An `int` number of filters of the input tensor.
out_filters: An `int` number of filters of the output tensor.
expand_ratio: An `int` of expand_ratio for an inverted bottleneck block.
strides: An `int` block stride. If greater than 1, this block will
ultimately downsample the input.
kernel_size: An `int` kernel_size of the depthwise conv layer.
se_ratio: A `float` or None. If not None, se ratio for the squeeze and
excitation layer.
stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
the stochastic depth layer.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
Default to None.
activation: A `str` name of the activation function.
se_inner_activation: A `str` name of squeeze-excitation inner activation.
se_gating_activation: A `str` name of squeeze-excitation gating
activation.
se_round_down_protect: A `bool` of whether round down more than 10%
will be allowed in SE layer.
expand_se_in_filters: A `bool` of whether or not to expand in_filter in
squeeze and excitation layer.
depthwise_activation: A `str` name of the activation function for
depthwise only.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
dilation_rate: An `int` that specifies the dilation rate to use for.
divisible_by: An `int` that ensures all inner dimensions are divisible by
this number.
dilated convolution: An `int` to specify the same value for all spatial
dimensions.
regularize_depthwise: A `bool` of whether or not apply regularization on
depthwise.
use_depthwise: A `bool` of whether to uses fused convolutions instead of
depthwise.
use_residual: A `bool` of whether to include residual connection between
input and output.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
output_intermediate_endpoints: A `bool` of whether or not output the
intermediate endpoints.
**kwargs: Additional keyword arguments to be passed.
"""
super(InvertedBottleneckBlock, self).__init__(**kwargs)
self._in_filters = in_filters
self._out_filters = out_filters
self._expand_ratio = expand_ratio
self._strides = strides
self._kernel_size = kernel_size
self._se_ratio = se_ratio
self._divisible_by = divisible_by
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._dilation_rate = dilation_rate
self._use_sync_bn = use_sync_bn
self._regularize_depthwise = regularize_depthwise
self._use_depthwise = use_depthwise
self._use_residual = use_residual
self._activation = activation
self._se_inner_activation = se_inner_activation
self._se_gating_activation = se_gating_activation
self._depthwise_activation = depthwise_activation
self._se_round_down_protect = se_round_down_protect
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._expand_se_in_filters = expand_se_in_filters
self._output_intermediate_endpoints = output_intermediate_endpoints
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
if not depthwise_activation:
self._depthwise_activation = activation
if regularize_depthwise:
self._depthsize_regularizer = kernel_regularizer
else:
self._depthsize_regularizer = None
def build(self, input_shape):
expand_filters = self._in_filters
if self._expand_ratio > 1:
# First 1x1 conv for channel expansion.
expand_filters = nn_layers.make_divisible(
self._in_filters * self._expand_ratio, self._divisible_by)
expand_kernel = 1 if self._use_depthwise else self._kernel_size
expand_stride = 1 if self._use_depthwise else self._strides
self._conv0 = tf.keras.layers.Conv2D(
filters=expand_filters,
kernel_size=expand_kernel,
strides=expand_stride,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._activation_layer = tf_utils.get_activation(
self._activation, use_keras_layer=True)
if self._use_depthwise:
# Depthwise conv.
self._conv1 = tf.keras.layers.DepthwiseConv2D(
kernel_size=(self._kernel_size, self._kernel_size),
strides=self._strides,
padding='same',
depth_multiplier=1,
dilation_rate=self._dilation_rate,
use_bias=False,
depthwise_initializer=self._kernel_initializer,
depthwise_regularizer=self._depthsize_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._depthwise_activation_layer = tf_utils.get_activation(
self._depthwise_activation, use_keras_layer=True)
# Squeeze and excitation.
if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
logging.info('Use Squeeze and excitation.')
in_filters = self._in_filters
if self._expand_se_in_filters:
in_filters = expand_filters
self._squeeze_excitation = nn_layers.SqueezeExcitation(
in_filters=in_filters,
out_filters=expand_filters,
se_ratio=self._se_ratio,
divisible_by=self._divisible_by,
round_down_protect=self._se_round_down_protect,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._se_inner_activation,
gating_activation=self._se_gating_activation)
else:
self._squeeze_excitation = None
# Last 1x1 conv.
self._conv2 = tf.keras.layers.Conv2D(
filters=self._out_filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
if self._stochastic_depth_drop_rate:
self._stochastic_depth = nn_layers.StochasticDepth(
self._stochastic_depth_drop_rate)
else:
self._stochastic_depth = None
self._add = tf.keras.layers.Add()
super(InvertedBottleneckBlock, self).build(input_shape)
def get_config(self):
config = {
'in_filters': self._in_filters,
'out_filters': self._out_filters,
'expand_ratio': self._expand_ratio,
'strides': self._strides,
'kernel_size': self._kernel_size,
'se_ratio': self._se_ratio,
'divisible_by': self._divisible_by,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'se_inner_activation': self._se_inner_activation,
'se_gating_activation': self._se_gating_activation,
'se_round_down_protect': self._se_round_down_protect,
'expand_se_in_filters': self._expand_se_in_filters,
'depthwise_activation': self._depthwise_activation,
'dilation_rate': self._dilation_rate,
'use_sync_bn': self._use_sync_bn,
'regularize_depthwise': self._regularize_depthwise,
'use_depthwise': self._use_depthwise,
'use_residual': self._use_residual,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(InvertedBottleneckBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, training=None):
endpoints = {}
shortcut = inputs
if self._expand_ratio > 1:
x = self._conv0(inputs)
x = self._norm0(x)
x = self._activation_layer(x)
else:
x = inputs
if self._use_depthwise:
x = self._conv1(x)
x = self._norm1(x)
x = self._depthwise_activation_layer(x)
if self._output_intermediate_endpoints:
endpoints['depthwise'] = x
if self._squeeze_excitation:
x = self._squeeze_excitation(x)
x = self._conv2(x)
x = self._norm2(x)
if (self._use_residual and self._in_filters == self._out_filters and
self._strides == 1):
if self._stochastic_depth:
x = self._stochastic_depth(x, training=training)
x = self._add([x, shortcut])
if self._output_intermediate_endpoints:
return x, endpoints
return x
@tf.keras.utils.register_keras_serializable(package='Vision')
class ResidualInner(tf.keras.layers.Layer):
"""Creates a single inner block of a residual.
This corresponds to `F`/`G` functions in the RevNet paper:
Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
The Reversible Residual Network: Backpropagation Without Storing Activations.
(https://arxiv.org/pdf/1707.04585.pdf)
"""
def __init__(
self,
filters: int,
strides: int,
kernel_initializer: Union[str, Callable[
..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
batch_norm_first: bool = True,
**kwargs):
"""Initializes a ResidualInner.
Args:
filters: An `int` of output filter size.
strides: An `int` of stride size for convolution for the residual block.
kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
instance for convolutional layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
activation: A `str` or `callable` instance of the activation function.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
batch_norm_first: A `bool` of whether to apply activation and batch norm
before conv.
**kwargs: Additional keyword arguments to be passed.
"""
super(ResidualInner, self).__init__(**kwargs)
self.strides = strides
self.filters = filters
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._kernel_regularizer = kernel_regularizer
self._activation = tf.keras.activations.get(activation)
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._batch_norm_first = batch_norm_first
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
def build(self, input_shape: tf.TensorShape):
if self._batch_norm_first:
self._batch_norm_0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2d_1 = tf.keras.layers.Conv2D(
filters=self.filters,
kernel_size=3,
strides=self.strides,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer)
self._batch_norm_1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2d_2 = tf.keras.layers.Conv2D(
filters=self.filters,
kernel_size=3,
strides=1,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer)
super(ResidualInner, self).build(input_shape)
def get_config(self) -> Dict[str, Any]:
config = {
'filters': self.filters,
'strides': self.strides,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon,
'batch_norm_first': self._batch_norm_first,
}
base_config = super(ResidualInner, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self,
inputs: tf.Tensor,
training: Optional[bool] = None) -> tf.Tensor:
x = inputs
if self._batch_norm_first:
x = self._batch_norm_0(x, training=training)
x = self._activation_fn(x)
x = self._conv2d_1(x)
x = self._batch_norm_1(x, training=training)
x = self._activation_fn(x)
x = self._conv2d_2(x)
return x
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckResidualInner(tf.keras.layers.Layer):
"""Creates a single inner block of a bottleneck.
This corresponds to `F`/`G` functions in the RevNet paper:
Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
The Reversible Residual Network: Backpropagation Without Storing Activations.
(https://arxiv.org/pdf/1707.04585.pdf)
"""
def __init__(
self,
filters: int,
strides: int,
kernel_initializer: Union[str, Callable[
..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
batch_norm_first: bool = True,
**kwargs):
"""Initializes a BottleneckResidualInner.
Args:
filters: An `int` number of filters for first 2 convolutions. Last Last,
and thus the number of output channels from the bottlneck block is
`4*filters`
strides: An `int` of stride size for convolution for the residual block.
kernel_initializer: A `str` or `tf.keras.initializers.Initializer`
instance for convolutional layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D.
activation: A `str` or `callable` instance of the activation function.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
batch_norm_first: A `bool` of whether to apply activation and batch norm
before conv.
**kwargs: Additional keyword arguments to be passed.
"""
super(BottleneckResidualInner, self).__init__(**kwargs)
self.strides = strides
self.filters = filters
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._kernel_regularizer = kernel_regularizer
self._activation = tf.keras.activations.get(activation)
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._batch_norm_first = batch_norm_first
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
def build(self, input_shape: tf.TensorShape):
if self._batch_norm_first:
self._batch_norm_0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2d_1 = tf.keras.layers.Conv2D(
filters=self.filters,
kernel_size=1,
strides=self.strides,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer)
self._batch_norm_1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2d_2 = tf.keras.layers.Conv2D(
filters=self.filters,
kernel_size=3,
strides=1,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer)
self._batch_norm_2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2d_3 = tf.keras.layers.Conv2D(
filters=self.filters * 4,
kernel_size=1,
strides=1,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer)
super(BottleneckResidualInner, self).build(input_shape)
def get_config(self) -> Dict[str, Any]:
config = {
'filters': self.filters,
'strides': self.strides,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon,
'batch_norm_first': self._batch_norm_first,
}
base_config = super(BottleneckResidualInner, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self,
inputs: tf.Tensor,
training: Optional[bool] = None) -> tf.Tensor:
x = inputs
if self._batch_norm_first:
x = self._batch_norm_0(x, training=training)
x = self._activation_fn(x)
x = self._conv2d_1(x)
x = self._batch_norm_1(x, training=training)
x = self._activation_fn(x)
x = self._conv2d_2(x)
x = self._batch_norm_2(x, training=training)
x = self._activation_fn(x)
x = self._conv2d_3(x)
return x
@tf.keras.utils.register_keras_serializable(package='Vision')
class ReversibleLayer(tf.keras.layers.Layer):
"""Creates a reversible layer.
Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary
layers that are stateless, which in this case are `ResidualInner` layers.
"""
def __init__(self,
f: tf.keras.layers.Layer,
g: tf.keras.layers.Layer,
manual_grads: bool = True,
**kwargs):
"""Initializes a ReversibleLayer.
Args:
f: A `tf.keras.layers.Layer` instance of `f` inner block referred to in
paper. Each reversible layer consists of two inner functions. For
example, in RevNet the reversible residual consists of two f/g inner
(bottleneck) residual functions. Where the input to the reversible layer
is x, the input gets partitioned in the channel dimension and the
forward pass follows (eq8): x = [x1; x2], z1 = x1 + f(x2), y2 = x2 +
g(z1), y1 = stop_gradient(z1).
g: A `tf.keras.layers.Layer` instance of `g` inner block referred to in
paper. Detailed explanation same as above as `f` arg.
manual_grads: A `bool` [Testing Only] of whether to manually take
gradients as in Algorithm 1 or defer to autograd.
**kwargs: Additional keyword arguments to be passed.
"""
super(ReversibleLayer, self).__init__(**kwargs)
self._f = f
self._g = g
self._manual_grads = manual_grads
if tf.keras.backend.image_data_format() == 'channels_last':
self._axis = -1
else:
self._axis = 1
def get_config(self) -> Dict[str, Any]:
config = {
'f': self._f,
'g': self._g,
'manual_grads': self._manual_grads,
}
base_config = super(ReversibleLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def _ckpt_non_trainable_vars(self):
self._f_non_trainable_vars = [
v.read_value() for v in self._f.non_trainable_variables
]
self._g_non_trainable_vars = [
v.read_value() for v in self._g.non_trainable_variables
]
def _load_ckpt_non_trainable_vars(self):
for v, v_chkpt in zip(self._f.non_trainable_variables,
self._f_non_trainable_vars):
v.assign(v_chkpt)
for v, v_chkpt in zip(self._g.non_trainable_variables,
self._g_non_trainable_vars):
v.assign(v_chkpt)
def call(self,
inputs: tf.Tensor,
training: Optional[bool] = None) -> tf.Tensor:
@tf.custom_gradient
def reversible(
x: tf.Tensor
) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor],
List[tf.Tensor]]]]:
"""Implements Algorithm 1 in the RevNet paper.
Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse.
The Reversible Residual Network: Backpropagation Without Storing
Activations.
(https://arxiv.org/pdf/1707.04585.pdf)
Args:
x: An input `tf.Tensor.
Returns:
y: The output [y1; y2] in Algorithm 1.
grad_fn: A callable function that computes the gradients.
"""
with tf.GradientTape() as fwdtape:
fwdtape.watch(x)
x1, x2 = tf.split(x, num_or_size_splits=2, axis=self._axis)
f_x2 = self._f(x2, training=training)
x1_down = _maybe_downsample(x1, f_x2.shape[self._axis], self._f.strides,
self._axis)
z1 = f_x2 + x1_down
g_z1 = self._g(z1, training=training)
x2_down = _maybe_downsample(x2, g_z1.shape[self._axis], self._f.strides,
self._axis)
y2 = x2_down + g_z1
# Equation 8: https://arxiv.org/pdf/1707.04585.pdf
# Decouple y1 and z1 so that their derivatives are different.
y1 = tf.identity(z1)
y = tf.concat([y1, y2], axis=self._axis)
irreversible = ((self._f.strides != 1 or self._g.strides != 1) or
(y.shape[self._axis] != inputs.shape[self._axis]))
# Checkpointing moving mean/variance for batch normalization layers
# as they shouldn't be updated during the custom gradient pass of f/g.
self._ckpt_non_trainable_vars()
def grad_fn(
dy: tf.Tensor,
variables: Optional[List[tf.Variable]] = None,
) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
"""Given dy calculate (dy/dx)|_{x_{input}} using f/g."""
if irreversible or not self._manual_grads:
grads_combined = fwdtape.gradient(
y, [x] + variables, output_gradients=dy)
dx = grads_combined[0]
grad_vars = grads_combined[1:]
else:
y1_nograd = tf.stop_gradient(y1)
y2_nograd = tf.stop_gradient(y2)
dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self._axis)
# Index mapping from self.f/g.trainable_variables to grad_fn
# input `variables` kwarg so that we can reorder dwf + dwg
# variable gradient list to match `variables` order.
f_var_refs = [v.ref() for v in self._f.trainable_variables]
g_var_refs = [v.ref() for v in self._g.trainable_variables]
fg_var_refs = f_var_refs + g_var_refs
self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]
# Algorithm 1 in paper (line # documented in-line)
z1 = y1_nograd # line 2
with tf.GradientTape() as gtape:
gtape.watch(z1)
g_z1 = self._g(z1, training=training)
x2 = y2_nograd - g_z1 # line 3
with tf.GradientTape() as ftape:
ftape.watch(x2)
f_x2 = self._f(x2, training=training)
x1 = z1 - f_x2 # pylint: disable=unused-variable # line 4
# Compute gradients
g_grads_combined = gtape.gradient(
g_z1, [z1] + self._g.trainable_variables, output_gradients=dy2)
dz1 = dy1 + g_grads_combined[0] # line 5
dwg = g_grads_combined[1:] # line 9
f_grads_combined = ftape.gradient(
f_x2, [x2] + self._f.trainable_variables, output_gradients=dz1)
dx2 = dy2 + f_grads_combined[0] # line 6
dwf = f_grads_combined[1:] # line 8
dx1 = dz1 # line 7
# Pack the input and variable gradients.
dx = tf.concat([dx1, dx2], axis=self._axis)
grad_vars = dwf + dwg
# Reorder gradients (trainable_variables to variables kwarg order)
grad_vars = [grad_vars[i] for i in self_to_var_index]
# Restore batch normalization moving mean/variance for correctness.
self._load_ckpt_non_trainable_vars()
return dx, grad_vars # grad_fn end
return y, grad_fn # reversible end
activations = reversible(inputs)
return activations
@tf.keras.utils.register_keras_serializable(package='Vision')
class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
"""Creates an depthwise separable convolution block with batch normalization."""
def __init__(
self,
filters: int,
kernel_size: int = 3,
strides: int = 1,
regularize_depthwise=False,
activation: Text = 'relu6',
kernel_initializer: Text = 'VarianceScaling',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
dilation_rate: int = 1,
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
**kwargs):
"""Initializes a convolution block with batch normalization.
Args:
filters: An `int` number of filters for the first two convolutions. Note
that the third and final convolution will use 4 times as many filters.
kernel_size: An `int` that specifies the height and width of the 2D
convolution window.
strides: An `int` of block stride. If greater than 1, this block will
ultimately downsample the input.
regularize_depthwise: A `bool`. If Ture, apply regularization on
depthwise.
activation: A `str` name of the activation function.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
dilation_rate: An `int` or tuple/list of 2 `int`, specifying the dilation
rate to use for dilated convolution. Can be a single integer to specify
the same value for all spatial dimensions.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
**kwargs: Additional keyword arguments to be passed.
"""
super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
self._filters = filters
self._kernel_size = kernel_size
self._strides = strides
self._activation = activation
self._regularize_depthwise = regularize_depthwise
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._dilation_rate = dilation_rate
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
if regularize_depthwise:
self._depthsize_regularizer = kernel_regularizer
else:
self._depthsize_regularizer = None
def get_config(self):
config = {
'filters': self._filters,
'strides': self._strides,
'regularize_depthwise': self._regularize_depthwise,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(DepthwiseSeparableConvBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self._dwconv0 = tf.keras.layers.DepthwiseConv2D(
kernel_size=self._kernel_size,
strides=self._strides,
padding='same',
depth_multiplier=1,
dilation_rate=self._dilation_rate,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._depthsize_regularizer,
use_bias=False)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv1 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
super(DepthwiseSeparableConvBlock, self).build(input_shape)
def call(self, inputs, training=None):
x = self._dwconv0(inputs)
x = self._norm0(x)
x = self._activation_fn(x)
x = self._conv1(x)
x = self._norm1(x)
return self._activation_fn(x)
@tf.keras.utils.register_keras_serializable(package='Vision')
class TuckerConvBlock(tf.keras.layers.Layer):
"""An Tucker block (generalized bottleneck)."""
def __init__(self,
in_filters,
out_filters,
input_compression_ratio,
output_compression_ratio,
strides,
kernel_size=3,
stochastic_depth_drop_rate=None,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
divisible_by=1,
use_residual=True,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""Initializes an inverted bottleneck block with BN after convolutions.
Args:
in_filters: An `int` number of filters of the input tensor.
out_filters: An `int` number of filters of the output tensor.
input_compression_ratio: An `float` of compression ratio for
input filters.
output_compression_ratio: An `float` of compression ratio for
output filters.
strides: An `int` block stride. If greater than 1, this block will
ultimately downsample the input.
kernel_size: An `int` kernel_size of the depthwise conv layer.
stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for
the stochastic depth layer.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
Default to None.
activation: A `str` name of the activation function.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
divisible_by: An `int` that ensures all inner dimensions are divisible by
this number.
use_residual: A `bool` of whether to include residual connection between
input and output.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
**kwargs: Additional keyword arguments to be passed.
"""
super(TuckerConvBlock, self).__init__(**kwargs)
self._in_filters = in_filters
self._out_filters = out_filters
self._input_compression_ratio = input_compression_ratio
self._output_compression_ratio = output_compression_ratio
self._strides = strides
self._kernel_size = kernel_size
self._divisible_by = divisible_by
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._use_sync_bn = use_sync_bn
self._use_residual = use_residual
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
def build(self, input_shape):
input_compressed_filters = nn_layers.make_divisible(
value=self._in_filters * self._input_compression_ratio,
divisor=self._divisible_by,
round_down_protect=False)
self._conv0 = tf.keras.layers.Conv2D(
filters=input_compressed_filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._activation_layer0 = tf_utils.get_activation(
self._activation, use_keras_layer=True)
output_compressed_filters = nn_layers.make_divisible(
value=self._out_filters * self._output_compression_ratio,
divisor=self._divisible_by,
round_down_protect=False)
self._conv1 = tf.keras.layers.Conv2D(
filters=output_compressed_filters,
kernel_size=self._kernel_size,
strides=self._strides,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._activation_layer1 = tf_utils.get_activation(
self._activation, use_keras_layer=True)
# Last 1x1 conv.
self._conv2 = tf.keras.layers.Conv2D(
filters=self._out_filters,
kernel_size=1,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
if self._stochastic_depth_drop_rate:
self._stochastic_depth = nn_layers.StochasticDepth(
self._stochastic_depth_drop_rate)
else:
self._stochastic_depth = None
self._add = tf.keras.layers.Add()
super(TuckerConvBlock, self).build(input_shape)
def get_config(self):
config = {
'in_filters': self._in_filters,
'out_filters': self._out_filters,
'input_compression_ratio': self._input_compression_ratio,
'output_compression_ratio': self._output_compression_ratio,
'strides': self._strides,
'kernel_size': self._kernel_size,
'divisible_by': self._divisible_by,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'use_residual': self._use_residual,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(TuckerConvBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, training=None):
shortcut = inputs
x = self._conv0(inputs)
x = self._norm0(x)
x = self._activation_layer0(x)
x = self._conv1(x)
x = self._norm1(x)
x = self._activation_layer1(x)
x = self._conv2(x)
x = self._norm2(x)
if (self._use_residual and
self._in_filters == self._out_filters and
self._strides == 1):
if self._stochastic_depth:
x = self._stochastic_depth(x, training=training)
x = self._add([x, shortcut])
return x
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for 3D networks."""
# Import libraries
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.modeling.layers import nn_layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SelfGating(tf.keras.layers.Layer):
"""Feature gating as used in S3D-G.
This implements the S3D-G network from:
Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, Kevin Murphy.
Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video
Classification.
(https://arxiv.org/pdf/1712.04851.pdf)
"""
def __init__(self, filters, **kwargs):
"""Initializes a self-gating layer.
Args:
filters: An `int` number of filters for the convolutional layer.
**kwargs: Additional keyword arguments to be passed.
"""
super(SelfGating, self).__init__(**kwargs)
self._filters = filters
def build(self, input_shape):
self._spatial_temporal_average = tf.keras.layers.GlobalAveragePooling3D()
# No BN and activation after conv.
self._transformer_w = tf.keras.layers.Conv3D(
filters=self._filters,
kernel_size=[1, 1, 1],
use_bias=True,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
mean=0.0, stddev=0.01))
super(SelfGating, self).build(input_shape)
def call(self, inputs):
x = self._spatial_temporal_average(inputs)
x = tf.expand_dims(x, 1)
x = tf.expand_dims(x, 2)
x = tf.expand_dims(x, 3)
x = self._transformer_w(x)
x = tf.nn.sigmoid(x)
return tf.math.multiply(x, inputs)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckBlock3D(tf.keras.layers.Layer):
"""Creates a 3D bottleneck block."""
def __init__(self,
filters,
temporal_kernel_size,
temporal_strides,
spatial_strides,
stochastic_depth_drop_rate=0.0,
se_ratio=None,
use_self_gating=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""Initializes a 3D bottleneck block with BN after convolutions.
Args:
filters: An `int` number of filters for the first two convolutions. Note
that the third and final convolution will use 4 times as many filters.
temporal_kernel_size: An `int` of kernel size for the temporal
convolutional layer.
temporal_strides: An `int` of ftemporal stride for the temporal
convolutional layer.
spatial_strides: An `int` of spatial stride for the spatial convolutional
layer.
stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for
the stochastic depth layer.
se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer.
use_self_gating: A `bool` of whether to apply self-gating module or not.
kernel_initializer: A `str` of kernel_initializer for convolutional
layers.
kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
Conv2D. Default to None.
bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
Default to None.
activation: A `str` name of the activation function.
use_sync_bn: A `bool`. If True, use synchronized batch normalization.
norm_momentum: A `float` of normalization momentum for the moving average.
norm_epsilon: A `float` added to variance to avoid dividing by zero.
**kwargs: Additional keyword arguments to be passed.
"""
super(BottleneckBlock3D, self).__init__(**kwargs)
self._filters = filters
self._temporal_kernel_size = temporal_kernel_size
self._spatial_strides = spatial_strides
self._temporal_strides = temporal_strides
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._use_self_gating = use_self_gating
self._se_ratio = se_ratio
self._use_sync_bn = use_sync_bn
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
def build(self, input_shape):
self._shortcut_maxpool = tf.keras.layers.MaxPool3D(
pool_size=[1, 1, 1],
strides=[
self._temporal_strides, self._spatial_strides, self._spatial_strides
])
self._shortcut_conv = tf.keras.layers.Conv3D(
filters=4 * self._filters,
kernel_size=1,
strides=[
self._temporal_strides, self._spatial_strides, self._spatial_strides
],
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._temporal_conv = tf.keras.layers.Conv3D(
filters=self._filters,
kernel_size=[self._temporal_kernel_size, 1, 1],
strides=[self._temporal_strides, 1, 1],
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._spatial_conv = tf.keras.layers.Conv3D(
filters=self._filters,
kernel_size=[1, 3, 3],
strides=[1, self._spatial_strides, self._spatial_strides],
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._expand_conv = tf.keras.layers.Conv3D(
filters=4 * self._filters,
kernel_size=[1, 1, 1],
strides=[1, 1, 1],
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm3 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
self._squeeze_excitation = nn_layers.SqueezeExcitation(
in_filters=self._filters * 4,
out_filters=self._filters * 4,
se_ratio=self._se_ratio,
use_3d_input=True,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
else:
self._squeeze_excitation = None
if self._stochastic_depth_drop_rate:
self._stochastic_depth = nn_layers.StochasticDepth(
self._stochastic_depth_drop_rate)
else:
self._stochastic_depth = None
if self._use_self_gating:
self._self_gating = SelfGating(filters=4 * self._filters)
else:
self._self_gating = None
super(BottleneckBlock3D, self).build(input_shape)
def get_config(self):
config = {
'filters': self._filters,
'temporal_kernel_size': self._temporal_kernel_size,
'temporal_strides': self._temporal_strides,
'spatial_strides': self._spatial_strides,
'use_self_gating': self._use_self_gating,
'se_ratio': self._se_ratio,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(BottleneckBlock3D, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, training=None):
in_filters = inputs.shape.as_list()[-1]
if in_filters == 4 * self._filters:
if self._temporal_strides == 1 and self._spatial_strides == 1:
shortcut = inputs
else:
shortcut = self._shortcut_maxpool(inputs)
else:
shortcut = self._shortcut_conv(inputs)
shortcut = self._norm0(shortcut)
x = self._temporal_conv(inputs)
x = self._norm1(x)
x = self._activation_fn(x)
x = self._spatial_conv(x)
x = self._norm2(x)
x = self._activation_fn(x)
x = self._expand_conv(x)
x = self._norm3(x)
# Apply self-gating, SE, stochastic depth.
if self._self_gating:
x = self._self_gating(x)
if self._squeeze_excitation:
x = self._squeeze_excitation(x)
if self._stochastic_depth:
x = self._stochastic_depth(x, training=training)
# Apply activation before additional modules.
x = self._activation_fn(x + shortcut)
return x
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for resnet."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.modeling.layers import nn_blocks_3d
class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(nn_blocks_3d.BottleneckBlock3D, 1, 1, 2, True, 0.2, 0.1),
(nn_blocks_3d.BottleneckBlock3D, 3, 2, 1, False, 0.0, 0.0),
)
def test_bottleneck_block_creation(self, block_fn, temporal_kernel_size,
temporal_strides, spatial_strides,
use_self_gating, se_ratio,
stochastic_depth):
temporal_size = 16
spatial_size = 128
filters = 256
inputs = tf.keras.Input(
shape=(temporal_size, spatial_size, spatial_size, filters * 4),
batch_size=1)
block = block_fn(
filters=filters,
temporal_kernel_size=temporal_kernel_size,
temporal_strides=temporal_strides,
spatial_strides=spatial_strides,
use_self_gating=use_self_gating,
se_ratio=se_ratio,
stochastic_depth_drop_rate=stochastic_depth)
features = block(inputs)
self.assertAllEqual([
1, temporal_size // temporal_strides, spatial_size // spatial_strides,
spatial_size // spatial_strides, filters * 4
], features.shape.as_list())
if __name__ == '__main__':
tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment