# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains definitions of segmentation head of the MOSAIC model."""
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union

import tensorflow as tf

from official.modeling import tf_utils
from official.projects.mosaic.modeling import mosaic_blocks


@tf.keras.utils.register_keras_serializable(package='Vision')
class MosaicDecoderHead(tf.keras.layers.Layer):
  """Creates a MOSAIC decoder in segmentation head.

  Reference:
    [MOSAIC: Mobile Segmentation via decoding Aggregated Information and
    encoded Context](https://arxiv.org/pdf/2112.11623.pdf)
  """

  def __init__(
      self,
      num_classes: int,
      decoder_input_levels: Optional[List[str]] = None,
      decoder_stage_merge_styles: Optional[List[str]] = None,
      decoder_filters: Optional[List[int]] = None,
      decoder_projected_filters: Optional[List[int]] = None,
      encoder_end_level: Optional[int] = 4,
      use_additional_classifier_layer: bool = False,
      classifier_kernel_size: int = 1,
      activation: str = 'relu',
      use_sync_bn: bool = False,
      batchnorm_momentum: float = 0.99,
      batchnorm_epsilon: float = 0.001,
      kernel_initializer: str = 'GlorotUniform',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      interpolation: str = 'bilinear',
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes a MOSAIC segmentation head.

    Args:
      num_classes: An `int` number of mask classification categories. The
        number of classes does not include background class.
      decoder_input_levels: A list of `str` specifying additional input levels
        from the backbone outputs for mask refinement in decoder.
      decoder_stage_merge_styles: A list of `str` specifying the merge style at
        each stage of the decoder, merge styles can be 'concat_merge' or
        'sum_merge'.
      decoder_filters: A list of integers specifying the number of channels
        used at each decoder stage. Note: this only has an effect if the
        decoder merge style is 'concat_merge'.
      decoder_projected_filters: A list of integers specifying the number of
        projected channels at the end of each decoder stage.
      encoder_end_level: An optional integer specifying the output level of the
        encoder stage, which is used if the input from the encoder to the
        decoder head is a dictionary.
      use_additional_classifier_layer: A `bool` specifying whether to use an
        additional classifier layer or not. It must be True if the final
        decoder projected filters does not match the `num_classes`.
      classifier_kernel_size: An `int` number to specify the kernel size of the
        classifier layer.
      activation: A `str` that indicates which activation is used, e.g. 'relu',
        'swish', etc.
      use_sync_bn: A `bool` that indicates whether to use synchronized batch
        normalization across different replicas.
      batchnorm_momentum: A `float` of normalization momentum for the moving
        average.
      batchnorm_epsilon: A `float` added to variance to avoid dividing by zero.
      kernel_initializer: Kernel initializer for conv layers. Defaults to
        `glorot_uniform`.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      interpolation: The interpolation method for upsampling. Defaults to
        `bilinear`.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(MosaicDecoderHead, self).__init__(**kwargs)
    # Assuming 'decoder_input_levels' are sorted in descending order and the
    # other settings are listed in the order according to
    # 'decoder_input_levels'. NOTE(review): the descending-order assumption is
    # not validated here — confirm callers pass levels high-to-low.
    if decoder_input_levels is None:
      decoder_input_levels = ['3', '2']
    if decoder_stage_merge_styles is None:
      decoder_stage_merge_styles = ['concat_merge', 'sum_merge']
    if decoder_filters is None:
      decoder_filters = [64, 64]
    if decoder_projected_filters is None:
      decoder_projected_filters = [32, 32]
    self._decoder_input_levels = decoder_input_levels
    self._decoder_stage_merge_styles = decoder_stage_merge_styles
    self._decoder_filters = decoder_filters
    self._decoder_projected_filters = decoder_projected_filters
    # All four per-stage settings must be parallel lists of the same length;
    # fail fast before building any merge stage.
    if (len(decoder_input_levels) != len(decoder_stage_merge_styles) or
        len(decoder_input_levels) != len(decoder_filters) or
        len(decoder_input_levels) != len(decoder_projected_filters)):
      raise ValueError('The number of Decoder inputs and settings must match.')
    # Build one merge block per decoder stage, in the same order as
    # 'decoder_input_levels'.
    self._merge_stages = []
    for (stage_merge_style, decoder_filter,
         decoder_projected_filter) in zip(decoder_stage_merge_styles,
                                          decoder_filters,
                                          decoder_projected_filters):
      if stage_merge_style == 'concat_merge':
        # NOTE(review): output_size=(0, 0) appears to be a placeholder;
        # presumably the actual upsampling size is resolved inside the merge
        # block at build/call time — confirm against mosaic_blocks.
        concat_merge_stage = mosaic_blocks.DecoderConcatMergeBlock(
            decoder_internal_depth=decoder_filter,
            decoder_projected_depth=decoder_projected_filter,
            output_size=(0, 0),
            use_sync_bn=use_sync_bn,
            batchnorm_momentum=batchnorm_momentum,
            batchnorm_epsilon=batchnorm_epsilon,
            activation=activation,
            kernel_initializer=kernel_initializer,
            kernel_regularizer=kernel_regularizer,
            interpolation=interpolation)
        self._merge_stages.append(concat_merge_stage)
      elif stage_merge_style == 'sum_merge':
        # Sum merge has no internal depth: the projected filter count alone
        # determines the stage's output channels.
        sum_merge_stage = mosaic_blocks.DecoderSumMergeBlock(
            decoder_projected_depth=decoder_projected_filter,
            output_size=(0, 0),
            use_sync_bn=use_sync_bn,
            batchnorm_momentum=batchnorm_momentum,
            batchnorm_epsilon=batchnorm_epsilon,
            activation=activation,
            kernel_initializer=kernel_initializer,
            kernel_regularizer=kernel_regularizer,
            interpolation=interpolation)
        self._merge_stages.append(sum_merge_stage)
      else:
        raise ValueError(
            'A stage merge style in MOSAIC Decoder can only be concat_merge '
            'or sum_merge.')
    # Concat merge or sum merge does not require an additional classifier
    # layer unless the final decoder projected filter does not match
    # num_classes.
    final_decoder_projected_filter = decoder_projected_filters[-1]
    if (final_decoder_projected_filter != num_classes and
        not use_additional_classifier_layer):
      raise ValueError('Additional classifier layer is needed if final decoder '
                       'projected filters does not match num_classes!')
    self._use_additional_classifier_layer = use_additional_classifier_layer
    if use_additional_classifier_layer:
      # This additional classification layer uses different kernel
      # initializers and bias compared to earlier blocks.
      self._pixelwise_classifier = tf.keras.layers.Conv2D(
          name='pixelwise_classifier',
          filters=num_classes,
          kernel_size=classifier_kernel_size,
          padding='same',
          bias_initializer=tf.zeros_initializer(),
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
          kernel_regularizer=kernel_regularizer,
          bias_regularizer=bias_regularizer,
          use_bias=True)
    self._activation_fn = tf_utils.get_activation(activation)
    # Keep all constructor arguments for get_config()/from_config()
    # round-tripping; key names must match the __init__ parameter names.
    self._config_dict = {
        'num_classes': num_classes,
        'decoder_input_levels': decoder_input_levels,
        'decoder_stage_merge_styles': decoder_stage_merge_styles,
        'decoder_filters': decoder_filters,
        'decoder_projected_filters': decoder_projected_filters,
        'encoder_end_level': encoder_end_level,
        'use_additional_classifier_layer': use_additional_classifier_layer,
        'classifier_kernel_size': classifier_kernel_size,
        'activation': activation,
        'use_sync_bn': use_sync_bn,
        'batchnorm_momentum': batchnorm_momentum,
        'batchnorm_epsilon': batchnorm_epsilon,
        'kernel_initializer': kernel_initializer,
        'kernel_regularizer': kernel_regularizer,
        'interpolation': interpolation,
        'bias_regularizer': bias_regularizer
    }

  def call(self,
           inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
                         Union[tf.Tensor, Mapping[str, tf.Tensor]]],
           training: Optional[bool] = None) -> tf.Tensor:
    """Forward pass of the segmentation head.

    It supports a tuple of 2 elements. Each element is a tensor or a tensor
    dictionary. The first one is the final (low-resolution) encoder endpoints,
    and the second one is higher-resolution backbone endpoints. When inputs are
    tensors, they are from a single level of feature maps. When inputs are
    dictionaries, they contain multiple levels of feature maps, where the key
    is the level/index of feature map. Note: 'level' denotes the number of
    2x downsampling, defined in backbone.

    Args:
      inputs: A tuple of 2 elements, each element can either be a tensor
        representing feature maps or 1 dictionary of tensors:
        - key: A `str` of the level of the multilevel features.
        - values: A `tf.Tensor` of the feature map tensors. The first is
          encoder endpoints, and the second is backbone endpoints.
      training: a `Boolean` indicating whether it is in `training` mode.

    Returns:
      segmentation mask prediction logits: A `tf.Tensor` representing the
        output logits before the final segmentation mask.
    """
    encoder_outputs = inputs[0]
    backbone_outputs = inputs[1]
    # When the encoder output is a multilevel dict, pick the configured end
    # level; otherwise use the tensor directly as the starting feature map.
    y = encoder_outputs[str(
        self._config_dict['encoder_end_level'])] if isinstance(
            encoder_outputs, dict) else encoder_outputs
    if isinstance(backbone_outputs, dict):
      # Refine with one backbone level per merge stage, in the order the
      # levels were configured.
      for level, merge_stage in zip(
          self._decoder_input_levels, self._merge_stages):
        x = backbone_outputs[str(level)]
        y = merge_stage([y, x], training=training)
    else:
      # Single-tensor backbone input: only the first merge stage is applied;
      # any further configured stages are skipped on this path.
      x = backbone_outputs
      y = self._merge_stages[0]([y, x], training=training)
    if self._use_additional_classifier_layer:
      y = self._pixelwise_classifier(y)
    y = self._activation_fn(y)
    return y

  def get_config(self) -> Dict[str, Any]:
    """Returns a config dictionary for initialization from serialization."""
    base_config = super().get_config()
    base_config.update(self._config_dict)
    return base_config

  @classmethod
  def from_config(cls, config: Dict[str, Any]):
    # Rebuild the layer from the config produced by get_config(); keys map
    # one-to-one onto __init__ keyword arguments.
    return cls(**config)