# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Build BASNet models."""
from typing import Mapping

import tensorflow as tf

from official.modeling import tf_utils
from official.projects.basnet.modeling import nn_blocks
from official.vision.beta.modeling.backbones import factory


# Specifications for BASNet encoder.
# Each element in the block configuration is in the following format:
# (num_filters, stride, block_repeats, maxpool)
BASNET_ENCODER_SPECS = [
    (64, 1, 3, 0),   # ResNet-34,
    (128, 2, 4, 0),  # ResNet-34,
    (256, 2, 6, 0),  # ResNet-34,
    (512, 2, 3, 1),  # ResNet-34,
    (512, 1, 3, 1),  # BASNet,
    (512, 1, 3, 0),  # BASNet,
]

# Specifications for BASNet decoder.
# Each element in the block configuration is in the following format:
# (conv1_nf, conv1_dr, convm_nf, convm_dr, conv2_nf, conv2_dr, scale_factor)
# nf : num_filters, dr : dilation_rate
BASNET_BRIDGE_SPECS = [
    (512, 2, 512, 2, 512, 2, 32),  # Sup0, Bridge
]

BASNET_DECODER_SPECS = [
    (512, 1, 512, 2, 512, 2, 32),  # Sup1, stage6d
    (512, 1, 512, 1, 512, 1, 16),  # Sup2, stage5d
    (512, 1, 512, 1, 256, 1, 8),   # Sup3, stage4d
    (256, 1, 256, 1, 128, 1, 4),   # Sup4, stage3d
    (128, 1, 128, 1, 64, 1, 2),    # Sup5, stage2d
    (64, 1, 64, 1, 64, 1, 1)       # Sup6, stage1d
]


@tf.keras.utils.register_keras_serializable(package='Vision')
class BASNetModel(tf.keras.Model):
  """A BASNet model.

  The boundary-aware network (BASNet) was proposed in:
  [1] Qin, Xuebin, et al. Basnet: Boundary-aware salient object detection.

  Input images are passed through the backbone first. The decoder network is
  then applied, and finally, the refinement module is applied to the output
  of the decoder network.
  """

  def __init__(self,
               backbone,
               decoder,
               refinement=None,
               **kwargs):
    """BASNet initialization function.

    Args:
      backbone: a backbone network. basnet_encoder.
      decoder: a decoder network. basnet_decoder.
      refinement: a module for salient map refinement.
      **kwargs: keyword arguments to be passed.
""" super(BASNetModel, self).__init__(**kwargs) self._config_dict = { 'backbone': backbone, 'decoder': decoder, 'refinement': refinement, } self.backbone = backbone self.decoder = decoder self.refinement = refinement def call(self, inputs, training=None): features = self.backbone(inputs) if self.decoder: features = self.decoder(features) levels = sorted(features.keys()) new_key = str(len(levels)) if self.refinement: features[new_key] = self.refinement(features[levels[-1]]) return features @property def checkpoint_items(self): """Returns a dictionary of items to be additionally checkpointed.""" items = dict(backbone=self.backbone) if self.decoder is not None: items.update(decoder=self.decoder) if self.refinement is not None: items.update(refinement=self.refinement) return items def get_config(self): return self._config_dict @classmethod def from_config(cls, config, custom_objects=None): return cls(**config) @tf.keras.utils.register_keras_serializable(package='Vision') class BASNetEncoder(tf.keras.Model): """BASNet encoder.""" def __init__( self, input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), activation='relu', use_sync_bn=False, use_bias=True, norm_momentum=0.99, norm_epsilon=0.001, kernel_initializer='VarianceScaling', kernel_regularizer=None, bias_regularizer=None, **kwargs): """BASNet encoder initialization function. Args: input_specs: `tf.keras.layers.InputSpec` specs of the input tensor. activation: `str` name of the activation function. use_sync_bn: if True, use synchronized batch normalization. use_bias: if True, use bias in conv2d. norm_momentum: `float` normalization omentum for the moving average. norm_epsilon: `float` small float added to variance to avoid dividing by zero. kernel_initializer: kernel_initializer for convolutional layers. kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. Default to None. bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d. Default to None. **kwargs: keyword arguments to be passed. """ self._input_specs = input_specs self._use_sync_bn = use_sync_bn self._use_bias = use_bias self._activation = activation self._norm_momentum = norm_momentum self._norm_epsilon = norm_epsilon if use_sync_bn: self._norm = tf.keras.layers.experimental.SyncBatchNormalization else: self._norm = tf.keras.layers.BatchNormalization self._kernel_initializer = kernel_initializer self._kernel_regularizer = kernel_regularizer self._bias_regularizer = bias_regularizer if tf.keras.backend.image_data_format() == 'channels_last': bn_axis = -1 else: bn_axis = 1 # Build BASNet Encoder. 
    inputs = tf.keras.Input(shape=input_specs.shape[1:])

    x = tf.keras.layers.Conv2D(
        filters=64,
        kernel_size=3,
        strides=1,
        use_bias=self._use_bias,
        padding='same',
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)(
            inputs)
    x = self._norm(
        axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)(
            x)
    x = tf_utils.get_activation(activation)(x)

    endpoints = {}
    for i, spec in enumerate(BASNET_ENCODER_SPECS):
      x = self._block_group(
          inputs=x,
          filters=spec[0],
          strides=spec[1],
          block_repeats=spec[2],
          name='block_group_l{}'.format(i + 2))
      # Endpoints are recorded before the optional max pooling.
      endpoints[str(i)] = x
      if spec[3]:
        x = tf.keras.layers.MaxPool2D(
            pool_size=2, strides=2, padding='same')(x)

    self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}

    super(BASNetEncoder, self).__init__(
        inputs=inputs, outputs=endpoints, **kwargs)

  def _block_group(self,
                   inputs,
                   filters,
                   strides,
                   block_repeats=1,
                   name='block_group'):
    """Creates one group of residual blocks for the BASNet encoder model.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first convolution of the layer.
      strides: `int` stride to use for the first convolution of the layer. If
        greater than 1, this layer will downsample the input.
      block_repeats: `int` number of blocks contained in the layer.
      name: `str` name for the block.

    Returns:
      The output `Tensor` of the block layer.
    """
    x = nn_blocks.ResBlock(
        filters=filters,
        strides=strides,
        use_projection=True,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        use_bias=self._use_bias,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon)(
            inputs)

    for _ in range(1, block_repeats):
      x = nn_blocks.ResBlock(
          filters=filters,
          strides=1,
          use_projection=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
          bias_regularizer=self._bias_regularizer,
          activation=self._activation,
          use_sync_bn=self._use_sync_bn,
          use_bias=self._use_bias,
          norm_momentum=self._norm_momentum,
          norm_epsilon=self._norm_epsilon)(
              x)

    return tf.identity(x, name=name)

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)

  @property
  def output_specs(self):
    """A dict of {level: TensorShape} pairs for the model output."""
    return self._output_specs


@factory.register_backbone_builder('basnet_encoder')
def build_basnet_encoder(
    input_specs: tf.keras.layers.InputSpec,
    model_config,
    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:  # pytype: disable=annotation-type-mismatch  # typed-keras
  """Builds BASNet Encoder backbone from a config."""
  backbone_type = model_config.backbone.type
  norm_activation_config = model_config.norm_activation
  assert backbone_type == 'basnet_encoder', (f'Inconsistent backbone type '
                                             f'{backbone_type}')

  return BASNetEncoder(
      input_specs=input_specs,
      activation=norm_activation_config.activation,
      use_sync_bn=norm_activation_config.use_sync_bn,
      use_bias=norm_activation_config.use_bias,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)


@tf.keras.utils.register_keras_serializable(package='Vision')
class BASNetDecoder(tf.keras.layers.Layer):
  """BASNet decoder."""

  def __init__(self,
               activation='relu',
               use_sync_bn=False,
               use_bias=True,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               **kwargs):
    """BASNet decoder initialization function.

    Args:
      activation: `str` name of the activation function.
      use_sync_bn: if True, use synchronized batch normalization.
      use_bias: if True, use bias in convolution.
      norm_momentum: `float` normalization momentum for the moving average.
      norm_epsilon: `float` small float added to variance to avoid dividing by
        zero.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
      **kwargs: keyword arguments to be passed.
    """
    super(BASNetDecoder, self).__init__(**kwargs)
    self._config_dict = {
        'activation': activation,
        'use_sync_bn': use_sync_bn,
        'use_bias': use_bias,
        'norm_momentum': norm_momentum,
        'norm_epsilon': norm_epsilon,
        'kernel_initializer': kernel_initializer,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
    }

    self._activation = tf_utils.get_activation(activation)
    self._concat = tf.keras.layers.Concatenate(axis=-1)
    self._sigmoid = tf.keras.layers.Activation(activation='sigmoid')

  def build(self, input_shape):
    """Creates the variables of the BASNet decoder."""
    conv_op = tf.keras.layers.Conv2D
    conv_kwargs = {
        'kernel_size': 3,
        'strides': 1,
        'use_bias': self._config_dict['use_bias'],
        'kernel_initializer': self._config_dict['kernel_initializer'],
        'kernel_regularizer': self._config_dict['kernel_regularizer'],
        'bias_regularizer': self._config_dict['bias_regularizer'],
    }

    self._out_convs = []
    self._out_usmps = []

    # Bridge layers.
    self._bdg_convs = []
    for spec in BASNET_BRIDGE_SPECS:
      blocks = []
      for j in range(3):
        blocks.append(nn_blocks.ConvBlock(
            filters=spec[2*j],
            dilation_rate=spec[2*j+1],
            activation='relu',
            use_sync_bn=self._config_dict['use_sync_bn'],
            norm_momentum=0.99,
            norm_epsilon=0.001,
            **conv_kwargs))
      self._bdg_convs.append(blocks)
      self._out_convs.append(conv_op(
          filters=1,
          padding='same',
          **conv_kwargs))
      self._out_usmps.append(tf.keras.layers.UpSampling2D(
          size=spec[6],
          interpolation='bilinear'
          ))

    # Decoder layers.
    self._dec_convs = []
    for spec in BASNET_DECODER_SPECS:
      blocks = []
      for j in range(3):
        blocks.append(nn_blocks.ConvBlock(
            filters=spec[2*j],
            dilation_rate=spec[2*j+1],
            activation='relu',
            use_sync_bn=self._config_dict['use_sync_bn'],
            norm_momentum=0.99,
            norm_epsilon=0.001,
            **conv_kwargs))
      self._dec_convs.append(blocks)
      self._out_convs.append(conv_op(
          filters=1,
          padding='same',
          **conv_kwargs))
      self._out_usmps.append(tf.keras.layers.UpSampling2D(
          size=spec[6],
          interpolation='bilinear'
          ))

  def call(self, backbone_output: Mapping[str, tf.Tensor]):
    """Forward pass of the BASNet decoder.

    Args:
      backbone_output: A `dict` of tensors
        - key: A `str` of the level of the multilevel features.
        - values: A `tf.Tensor` of the feature map tensors, whose shape is
            [batch, height_l, width_l, channels].

    Returns:
      sup: A `dict` of tensors
        - key: A `str` of the level of the multilevel features.
        - values: A `tf.Tensor` of the feature map tensors, whose shape is
            [batch, height_l, width_l, channels].
""" levels = sorted(backbone_output.keys(), reverse=True) sup = {} x = backbone_output[levels[0]] for blocks in self._bdg_convs: for block in blocks: x = block(x) sup['0'] = x for i, blocks in enumerate(self._dec_convs): x = self._concat([x, backbone_output[levels[i]]]) for block in blocks: x = block(x) sup[str(i+1)] = x x = tf.keras.layers.UpSampling2D( size=2, interpolation='bilinear' )(x) for i, (conv, usmp) in enumerate(zip(self._out_convs, self._out_usmps)): sup[str(i)] = self._sigmoid(usmp(conv(sup[str(i)]))) self._output_specs = { str(order): sup[str(order)].get_shape() for order in range(0, len(BASNET_DECODER_SPECS)) } return sup def get_config(self): return self._config_dict @classmethod def from_config(cls, config, custom_objects=None): return cls(**config) @property def output_specs(self): """A dict of {order: TensorShape} pairs for the model output.""" return self._output_specs