added `PanopticFPNFusion`

c18fc1bb · srihari-humbarwadi · d3998b07 · c18fc1bb · c18fc1bb · c18fc1bb
Unverified Commit c18fc1bb authored Oct 09, 2021 by srihari-humbarwadi
4 changed files
--- a/official/vision/beta/configs/semantic_segmentation.py
+++ b/official/vision/beta/configs/semantic_segmentation.py
@@ -65,11 +65,13 @@ class SegmentationHead(hyperparams.Config):
  use_depthwise_convolution: bool = False
  prediction_kernel_size: int = 1
  upsample_factor: int = 1
-  feature_fusion: Optional[str] = None  # None, deeplabv3plus, or pyramid_fusion
+  feature_fusion: Optional[str] = None  # None, deeplabv3plus, panoptic_fpn_fusion or pyramid_fusion
  # deeplabv3plus feature fusion params
  low_level: Union[int, str] = 2
  low_level_num_filters: int = 48
-
+  # panoptic_fpn_fusion params
+  decoder_min_level: Optional[Union[int, str]] = None
+  decoder_max_level: Optional[Union[int, str]] = None

 @dataclasses.dataclass
 class SemanticSegmentationModel(hyperparams.Config):

--- a/official/vision/beta/modeling/heads/segmentation_heads.py
+++ b/official/vision/beta/modeling/heads/segmentation_heads.py
@@ -21,7 +21,7 @@ from official.vision.beta.modeling.layers import nn_layers
 from official.vision.beta.ops import spatial_transform_ops


-@tf.keras.utils.register_keras_serializable(package='Vision')
+# @tf.keras.utils.register_keras_serializable(package='Vision')
 class SegmentationHead(tf.keras.layers.Layer):
  """Creates a segmentation head."""

@@ -35,8 +35,11 @@ class SegmentationHead(tf.keras.layers.Layer):
      prediction_kernel_size: int = 1,
      upsample_factor: int = 1,
      feature_fusion: Optional[str] = None,
+      decoder_min_level: Optional[int] = None,
+      decoder_max_level: Optional[int] = None,
      low_level: int = 2,
      low_level_num_filters: int = 48,
+      num_decoder_filters: int = 256,
      activation: str = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
@@ -60,15 +63,24 @@ class SegmentationHead(tf.keras.layers.Layer):
      prediction layer.
      upsample_factor: An `int` number to specify the upsampling factor to
        generate finer mask. Default 1 means no upsampling is applied.
-      feature_fusion: One of `deeplabv3plus`, `pyramid_fusion`, or None. If
-        `deeplabv3plus`, features from decoder_features[level] will be fused
-        with low level feature maps from backbone. If `pyramid_fusion`,
-        multiscale features will be resized and fused at the target level.
+      feature_fusion: One of `deeplabv3plus`, `pyramid_fusion`,
+        `panoptic_fpn_fusion`, or None. If `deeplabv3plus`, features from
+        decoder_features[level] will be fused with low level feature maps from
+        backbone. If `pyramid_fusion`, multiscale features will be resized and
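+        fused at the target level. If `panoptic_fpn_fusion`, decoder features
+        from `decoder_min_level` to `decoder_max_level` are fused at `level` by
+        `nn_layers.PanopticFPNFusion`.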
+      decoder_min_level: An `int` of minimum level from decoder to use in
+        feature fusion. It is only used when feature_fusion is set to
+        `panoptic_fpn_fusion`.
+      decoder_max_level: An `int` of maximum level from decoder to use in
+        feature fusion. It is only used when feature_fusion is set to
+        `panoptic_fpn_fusion`.
      low_level: An `int` of backbone level to be used for feature fusion. It is
        used when feature_fusion is set to `deeplabv3plus`.
      low_level_num_filters: An `int` of reduced number of filters for the low
        level features before fusing it with higher level features. It is only
        used when feature_fusion is set to `deeplabv3plus`.
+      num_decoder_filters: An `int` of number of filters in the decoder outputs.
+        It is only used when feature_fusion is set to `panoptic_fpn_fusion`.
      activation: A `str` that indicates which activation is used, e.g. 'relu',
        'swish', etc.
      use_sync_bn: A `bool` that indicates whether to use synchronized batch
@@ -91,14 +103,17 @@ class SegmentationHead(tf.keras.layers.Layer):
        'prediction_kernel_size': prediction_kernel_size,
        'upsample_factor': upsample_factor,
        'feature_fusion': feature_fusion,
+        'decoder_min_level': decoder_min_level,
+        'decoder_max_level': decoder_max_level,
        'low_level': low_level,
        'low_level_num_filters': low_level_num_filters,
+        'num_decoder_filters': num_decoder_filters,
        'activation': activation,
        'use_sync_bn': use_sync_bn,
        'norm_momentum': norm_momentum,
        'norm_epsilon': norm_epsilon,
        'kernel_regularizer': kernel_regularizer,
-        'bias_regularizer': bias_regularizer,
+        'bias_regularizer': bias_regularizer
    }
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
@@ -141,6 +156,17 @@ class SegmentationHead(tf.keras.layers.Layer):
      self._dlv3p_norm = bn_op(
          name='segmentation_head_deeplabv3p_fusion_norm', **bn_kwargs)

+    elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion':
+      self._panoptic_fpn_fusion = nn_layers.PanopticFPNFusion(
+          min_level=self._config_dict['decoder_min_level'],
+          max_level=self._config_dict['decoder_max_level'],
+          target_level=self._config_dict['level'],
+          num_filters=self._config_dict['num_filters'],
+          num_fpn_filters=self._config_dict['num_decoder_filters'],
+          activation=self._config_dict['activation'],
+          kernel_regularizer=self._config_dict['kernel_regularizer'],
+          bias_regularizer=self._config_dict['bias_regularizer'])
+
    # Segmentation head layers.
    self._convs = []
    self._norms = []
@@ -210,6 +236,8 @@ class SegmentationHead(tf.keras.layers.Layer):
    elif self._config_dict['feature_fusion'] == 'pyramid_fusion':
      x = nn_layers.pyramid_feature_fusion(decoder_output,
                                           self._config_dict['level'])
+    elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion':
+      x = self._panoptic_fpn_fusion(decoder_output)
    else:
      x = decoder_output[str(self._config_dict['level'])]


--- a/official/vision/beta/modeling/heads/segmentation_heads_test.py
+++ b/official/vision/beta/modeling/heads/segmentation_heads_test.py
@@ -22,24 +22,41 @@ import tensorflow as tf

 from official.vision.beta.modeling.heads import segmentation_heads

-
 class SegmentationHeadTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
-      (2, 'pyramid_fusion'),
-      (3, 'pyramid_fusion'),
-  )
-  def test_forward(self, level, feature_fusion):
-    head = segmentation_heads.SegmentationHead(
-        num_classes=10, level=level, feature_fusion=feature_fusion)
+      (2, 'pyramid_fusion', None, None),
+      (3, 'pyramid_fusion', None, None),
+      (2, 'panoptic_fpn_fusion', 2, 5),
+      (2, 'panoptic_fpn_fusion', 2, 6),
+      (3, 'panoptic_fpn_fusion', 3, 5),
+      (3, 'panoptic_fpn_fusion', 3, 6))
+  def test_forward(self, level, feature_fusion,
+                   decoder_min_level, decoder_max_level):
    backbone_features = {
        '3': np.random.rand(2, 128, 128, 16),
        '4': np.random.rand(2, 64, 64, 16),
+        '5': np.random.rand(2, 32, 32, 16),
    }
    decoder_features = {
-        '3': np.random.rand(2, 128, 128, 16),
-        '4': np.random.rand(2, 64, 64, 16),
+        '3': np.random.rand(2, 128, 128, 64),
+        '4': np.random.rand(2, 64, 64, 64),
+        '5': np.random.rand(2, 32, 32, 64),
+        '6': np.random.rand(2, 16, 16, 64),
    }
+
+    if feature_fusion == 'panoptic_fpn_fusion':
+      backbone_features['2'] = np.random.rand(2, 256, 256, 16)
+      decoder_features['2'] = np.random.rand(2, 256, 256, 64)
+
+    head = segmentation_heads.SegmentationHead(
+        num_classes=10,
+        level=level,
+        feature_fusion=feature_fusion,
+        decoder_min_level=decoder_min_level,
+        decoder_max_level=decoder_max_level,
+        num_decoder_filters=64)
+
    logits = head(backbone_features, decoder_features)

    if level in decoder_features:

--- a/official/vision/beta/modeling/layers/nn_layers.py
+++ b/official/vision/beta/modeling/layers/nn_layers.py
@@ -13,12 +13,14 @@
 # limitations under the License.

 """Contains common building blocks for neural networks."""
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union

 from absl import logging
 import tensorflow as tf
+import tensorflow_addons as tfa

 from official.modeling import tf_utils
+from official.vision.beta.ops import spatial_transform_ops


 # Type annotations.
@@ -308,6 +310,113 @@ def pyramid_feature_fusion(inputs, target_level):
  return tf.math.add_n(resampled_feats)


+class PanopticFPNFusion(tf.keras.Model):
+  """Creates a Panoptic FPN feature Fusion layer.
+
+  This implements feature fusion for semantic segmentation head from the paper:
+  Alexander Kirillov, Ross Girshick, Kaiming He and Piotr Dollar.
+  Panoptic Feature Pyramid Networks.
+  (https://arxiv.org/pdf/1901.02446.pdf)
+
+  The model is built with the Keras functional API. Its input is a dict of
+  feature maps keyed by `str(level)` for every level in
+  `[min_level, max_level]`. Each level goes through
+  `max(1, level - target_level)` stages of 3x3 Conv2D -> GroupNormalization
+  (32 groups) -> activation; every stage of a level other than `target_level`
+  is followed by `spatial_transform_ops.nearest_upsampling(x, scale=2)`. The
+  per-level results are summed with `tf.math.add_n` into the single output
+  tensor.
+  """
+
+  def __init__(
+      self,
+      min_level: int = 2,
+      max_level: int = 5,
+      target_level: int = 2,
+      num_filters: int = 128,
+      num_fpn_filters: int = 256,
+      activation: str = 'relu',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      **kwargs):
+    """Initializes panoptic FPN feature fusion layer.
+
+    Args:
+      min_level: An `int` of minimum level to use in feature fusion.
+      max_level: An `int` of maximum level to use in feature fusion.
+      target_level: An `int` of the target feature level for feature fusion.
+        It must not exceed `min_level`: levels are only ever upsampled, so a
+        level below the target could never be brought to the target size.
+      num_filters: An `int` number of filters in conv2d layers.
+        NOTE(review): GroupNormalization is created with groups=32, so this
+        presumably has to be a multiple of 32 -- confirm against tfa.
+      num_fpn_filters: An `int` number of filters in the FPN outputs, i.e. the
+        channel count declared for every input feature map.
+      activation: A `str` name of the activation function.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Default is None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
+      **kwargs: Additional keyword arguments to be passed to `tf.keras.Model`.
+
+    Raises:
+      ValueError: If `min_level` is greater than `max_level`, or if
+        `target_level` is greater than `min_level`.
+    """
+    # An empty level range would otherwise surface as an obscure failure in
+    # `tf.math.add_n` on an empty list.
+    if min_level > max_level:
+      raise ValueError('min_level should be less than or equal to max_level')
+    # Also covers the original `target_level > max_level` guard, because
+    # min_level <= max_level has just been established.
+    if target_level > min_level:
+      raise ValueError('target_level should be less than or equal to min_level')
+
+    self._config_dict = {
+        'min_level': min_level,
+        'max_level': max_level,
+        'target_level': target_level,
+        'num_filters': num_filters,
+        'num_fpn_filters': num_fpn_filters,
+        'activation': activation,
+        'kernel_regularizer': kernel_regularizer,
+        'bias_regularizer': bias_regularizer,
+    }
+    norm = tfa.layers.GroupNormalization
+    conv2d = tf.keras.layers.Conv2D
+    # One Activation layer instance is shared by every stage of every level.
+    activation_fn = tf.keras.layers.Activation(
+        tf_utils.get_activation(activation))
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      norm_axis = -1
+    else:
+      norm_axis = 1
+    inputs = self._build_inputs(num_fpn_filters, min_level, max_level)
+
+    upscaled_features = []
+    for level in range(min_level, max_level + 1):
+      # The target level still gets a single conv stage (without upsampling);
+      # every coarser level gets one stage per level of distance.
+      num_conv_layers = max(1, level - target_level)
+      x = inputs[str(level)]
+      for _ in range(num_conv_layers):
+        x = conv2d(
+            filters=num_filters,
+            kernel_size=3,
+            padding='same',
+            kernel_initializer=tf.keras.initializers.VarianceScaling(),
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer)(x)
+        x = norm(groups=32, axis=norm_axis)(x)
+        x = activation_fn(x)
+        if level != target_level:
+          x = spatial_transform_ops.nearest_upsampling(x, scale=2)
+      upscaled_features.append(x)
+
+    fused_features = tf.math.add_n(upscaled_features)
+    self._output_specs = {str(target_level): fused_features.get_shape()}
+
+    super(PanopticFPNFusion, self).__init__(
+        inputs=inputs, outputs=fused_features, **kwargs)
+
+  def _build_inputs(self, num_filters: int,
+                    min_level: int, max_level: int):
+    """Builds one `tf.keras.Input` per level, keyed by `str(level)`.
+
+    Each input is declared with shape `[None, None, num_filters]`.
+    """
+    return {
+        str(level): tf.keras.Input(shape=[None, None, num_filters])
+        for level in range(min_level, max_level + 1)
+    }
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns a copy of the constructor arguments."""
+    # Shallow copy so that callers mutating the result cannot alter the
+    # configuration stored on the model.
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Rebuilds the model by passing `config` as keyword arguments."""
+    return cls(**config)
+
+  @property
+  def output_specs(self) -> Mapping[str, tf.TensorShape]:
+    """A dict of {level: TensorShape} pairs for the model output."""
+    return self._output_specs
+
+
+
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class Scale(tf.keras.layers.Layer):
  """Scales the input by a trainable scalar weight.