[MobileNet] Add Mobilenet Backbone Implementation (#9303)

* factor out the make_divisible function and move round_filters to nn_layers * modify SqueezeExcitation to add two additional parameters: divisible_by and gating_activation * modify the InvertedBottleneckBlock to include three additional boolean flags: 1. use_depthwise, 2. use_residual, 3. regularize_depthwise; add control for depthwise activation and regularizer; remove expand_ratio from SqueezeExcitation * add Conv2DBNBlock definition * add mobilenet v2, v3 implementation * add mobilenet v1 * put mobilenet_base into class body * fix a type hint error * the InvertedBottleneckBlock is different for mobilenet and efficientnet; made the necessary changes to cope with both. * add target_backbone when calling InvertedBottleneckBlock * add relu6 and hard_sigmoid * add test for mobilenet * add mobilenet to factory * fix some typos; link the reference to the architectures * remove future import Co-authored-by: Shixin Luo <luoshixin@google.com>

[MobileNet] Add Mobilenet Backbone Implementation (#9303)
* factor out the make_divisible function and move round_filters to nn_layers * modify SqueezeExcitation to add two additional parameters: divisible_by and gating_activation * modify the InvertedBottleneckBlock to include three additional boolean flags: 1. use_depthwise, 2. use_residual, 3. regularize_depthwise; add control for depthwise activation and regularizer; remove expand_ratio from SqueezeExcitation * add Conv2DBNBlock definition * add mobilenet v2, v3 implementation * add mobilenet v1 * put mobilenet_base into class body * fix a type hint error * the InvertedBottleneckBlock is different for mobilenet and efficientnet; made the necessary changes to cope with both. * add target_backbone when calling InvertedBottleneckBlock * add relu6 and hard_sigmoid * add test for mobilenet * add mobilenet to factory * fix some typos; link the reference to the architectures * remove future import Co-authored-by: Shixin Luo <luoshixin@google.com>
e61588cd · Shixin · GitHub · 2f737e1e · e61588cd · e61588cd
Unverified Commit e61588cd authored Sep 28, 2020 by Shixin Committed by GitHub Sep 28, 2020
15 changed files
--- a/official/modeling/activations/__init__.py
+++ b/official/modeling/activations/__init__.py
@@ -17,3 +17,5 @@ from official.modeling.activations.gelu import gelu
 from official.modeling.activations.swish import hard_swish
 from official.modeling.activations.swish import identity
 from official.modeling.activations.swish import simple_swish
+from official.modeling.activations.relu import relu6
+from official.modeling.activations.sigmoid import hard_sigmoid
--- a/official/modeling/activations/relu.py
+++ b/official/modeling/activations/relu.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Customized Relu activation."""
+
+import tensorflow as tf
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+def relu6(features):
+  """Applies the ReLU6 activation to the given pre-activation values.
+
+  The input is first converted to a `Tensor` and then passed to
+  `tf.nn.relu6`.
+
+  Args:
+    features: A `Tensor`, or a value convertible to one, holding the
+      preactivation values.
+
+  Returns:
+    The activation value, as returned by `tf.nn.relu6`.
+  """
+  return tf.nn.relu6(tf.convert_to_tensor(features))
--- a/official/modeling/activations/relu_test.py
+++ b/official/modeling/activations/relu_test.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the customized Relu activation."""
+
+import tensorflow as tf
+
+from tensorflow.python.keras import \
+  keras_parameterized  # pylint: disable=g-direct-tensorflow-import
+from official.modeling import activations
+
+
+@keras_parameterized.run_all_keras_modes
+class CustomizedReluTest(keras_parameterized.TestCase):
+  """Checks `activations.relu6` against the `tf.nn.relu6` reference."""
+
+  def test_relu6(self):
+    # Mix of values below, at and above zero.
+    inputs = [[.25, 0, -.25], [-1, -2, 3]]
+    actual = activations.relu6(inputs)
+    expected = tf.nn.relu6(inputs)
+    self.assertAllClose(actual, expected)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/modeling/activations/sigmoid.py
+++ b/official/modeling/activations/sigmoid.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Customized Sigmoid activation."""
+
+import tensorflow as tf
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+def hard_sigmoid(features):
+  """Computes the hard sigmoid activation function.
+
+  The value returned is `relu6(features + 3) * 0.16667`, where 0.16667 is
+  a rounded stand-in for 1/6.
+
+  Args:
+    features: A `Tensor`, or a value convertible to one, holding the
+      preactivation values.
+
+  Returns:
+    The activation value.
+  """
+  shifted = tf.convert_to_tensor(features) + tf.constant(3.)
+  return tf.nn.relu6(shifted) * 0.16667
--- a/official/modeling/activations/sigmoid_test.py
+++ b/official/modeling/activations/sigmoid_test.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the customized Sigmoid activation."""
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.keras import \
+  keras_parameterized  # pylint: disable=g-direct-tensorflow-import
+from official.modeling import activations
+
+
+@keras_parameterized.run_all_keras_modes
+class CustomizedSigmoidTest(keras_parameterized.TestCase):
+  """Checks `activations.hard_sigmoid` against a local reference formula."""
+
+  def _hard_sigmoid_nn(self, x):
+    """Reference hard sigmoid built directly on `tf.nn.relu6`."""
+    shifted = np.float32(x) + 3.
+    return tf.nn.relu6(shifted) * 0.16667
+
+  def test_hard_sigmoid(self):
+    inputs = [[.25, 0, -.25], [-1, -2, 3]]
+    actual = activations.hard_sigmoid(inputs)
+    expected = self._hard_sigmoid_nn(inputs)
+    self.assertAllClose(actual, expected)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/modeling/tf_utils.py
+++ b/official/modeling/tf_utils.py
@@ -104,6 +104,8 @@ def get_activation(identifier):
        "gelu": activations.gelu,
        "simple_swish": activations.simple_swish,
        "hard_swish": activations.hard_swish,
+        "relu6": activations.relu6,
+        "hard_sigmoid": activations.hard_sigmoid,
        "identity": activations.identity,
    }
    identifier = str(identifier).lower()

--- a/official/vision/beta/configs/backbones.py
+++ b/official/vision/beta/configs/backbones.py
@@ -36,6 +36,14 @@ class EfficientNet(hyperparams.Config):
  se_ratio: float = 0.0


+@dataclasses.dataclass
+class MobileNet(hyperparams.Config):
+  """Mobilenet config.
+
+  The fields are forwarded by the backbone factory to the `MobileNet`
+  backbone constructor under the same names.
+  """
+  # Name of the MobileNet variant to build, e.g. 'MobileNetV2'.
+  model_id: str = 'MobileNetV2'
+  # Multiplier for the number of channels of the convolution ops.
+  width_multiplier: float = 1.0
+  # Drop rate passed to the backbone as `stochastic_depth_drop_rate`.
+  stochastic_depth_drop_rate: float = 0.0
+
+
 @dataclasses.dataclass
 class SpineNet(hyperparams.Config):
  """SpineNet config."""
@@ -59,9 +67,11 @@ class Backbone(hyperparams.OneOfConfig):
    revnet: revnet backbone config.
    efficientnet: efficientnet backbone config.
    spinenet: spinenet backbone config.
+    mobilenet: mobilenet backbone config.
  """
  type: Optional[str] = None
  resnet: ResNet = ResNet()
  revnet: RevNet = RevNet()
  efficientnet: EfficientNet = EfficientNet()
  spinenet: SpineNet = SpineNet()
+  mobilenet: MobileNet = MobileNet()
--- a/official/vision/beta/modeling/backbones/__init__.py
+++ b/official/vision/beta/modeling/backbones/__init__.py
@@ -20,3 +20,4 @@ from official.vision.beta.modeling.backbones.resnet import ResNet
 from official.vision.beta.modeling.backbones.resnet_3d import ResNet3D
 from official.vision.beta.modeling.backbones.revnet import RevNet
 from official.vision.beta.modeling.backbones.spinenet import SpineNet
+from official.vision.beta.modeling.backbones.mobilenet import MobileNet
--- a/official/vision/beta/modeling/backbones/efficientnet.py
+++ b/official/vision/beta/modeling/backbones/efficientnet.py
@@ -20,6 +20,7 @@ from absl import logging
 import tensorflow as tf
 from official.modeling import tf_utils
 from official.vision.beta.modeling.layers import nn_blocks
+from official.vision.beta.modeling.layers import nn_layers

 layers = tf.keras.layers

@@ -49,22 +50,6 @@ SCALING_MAP = {
 }


-def round_filters(filters, multiplier, divisor=8, min_depth=None, skip=False):
-  """Round number of filters based on depth multiplier."""
-  orig_f = filters
-  if skip or not multiplier:
-    return filters
-
-  filters *= multiplier
-  min_depth = min_depth or divisor
-  new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
-  # Make sure that round down does not go down by more than 10%.
-  if new_filters < 0.9 * filters:
-    new_filters += divisor
-  logging.info('round_filter input=%s output=%s', orig_f, new_filters)
-  return int(new_filters)
-
-
 def round_repeats(repeats, multiplier, skip=False):
  """Round number of filters based on depth multiplier."""
  if skip or not multiplier:
@@ -95,8 +80,8 @@ class BlockSpec(object):
    self.kernel_size = kernel_size
    self.strides = strides
    self.expand_ratio = expand_ratio
-    self.in_filters = round_filters(in_filters, width_scale)
-    self.out_filters = round_filters(out_filters, width_scale)
+    self.in_filters = nn_layers.round_filters(in_filters, width_scale)
+    self.out_filters = nn_layers.round_filters(out_filters, width_scale)
    self.is_output = is_output


@@ -165,7 +150,7 @@ class EfficientNet(tf.keras.Model):

    # Build stem.
    x = layers.Conv2D(
-        filters=round_filters(32, width_scale),
+        filters=nn_layers.round_filters(32, width_scale),
        kernel_size=3,
        strides=2,
        use_bias=False,
@@ -197,7 +182,7 @@ class EfficientNet(tf.keras.Model):

    # Build the final conv for classification.
    x = layers.Conv2D(
-        filters=round_filters(1280, width_scale),
+        filters=nn_layers.round_filters(1280, width_scale),
        kernel_size=1,
        strides=1,
        use_bias=False,

--- a/official/vision/beta/modeling/backbones/factory.py
+++ b/official/vision/beta/modeling/backbones/factory.py
@@ -87,6 +87,16 @@ def build_backbone(input_specs: tf.keras.layers.InputSpec,
        norm_momentum=norm_activation_config.norm_momentum,
        norm_epsilon=norm_activation_config.norm_epsilon,
        kernel_regularizer=l2_regularizer)
+  elif backbone_type == 'mobilenet':
+    backbone = backbones.MobileNet(
+        model_id=backbone_cfg.model_id,
+        width_multiplier=backbone_cfg.width_multiplier,
+        input_specs=input_specs,
+        stochastic_depth_drop_rate=backbone_cfg.stochastic_depth_drop_rate,
+        use_sync_bn=norm_activation_config.use_sync_bn,
+        norm_momentum=norm_activation_config.norm_momentum,
+        norm_epsilon=norm_activation_config.norm_epsilon,
+        kernel_regularizer=l2_regularizer)
  else:
    raise ValueError('Backbone {!r} not implement'.format(backbone_type))


--- a/official/vision/beta/modeling/backbones/factory_test.py
+++ b/official/vision/beta/modeling/backbones/factory_test.py
@@ -86,7 +86,41 @@ class FactoryTest(tf.test.TestCase, parameterized.TestCase):

    self.assertEqual(network_config, factory_network_config)

-  @combinations.generate(combinations.combine(model_id=['49'],))
+  @combinations.generate(
+      combinations.combine(
+          model_id=[
+              'MobileNetV1',
+              'MobileNetV2',
+              'MobileNetV3Large',
+              'MobileNetV3Small',
+              'MobileNetV3EdgeTPU',
+          ],
+          width_multiplier=[1.0, 0.75],
+      ))
+  def test_mobilenet_creation(self, model_id, width_multiplier):
+    """Checks the factory builds the same MobileNet as direct construction."""
+    norm_momentum = 0.99
+    norm_epsilon = 1e-5
+
+    # Reference: the backbone instantiated directly.
+    direct_network = backbones.MobileNet(
+        model_id=model_id,
+        width_multiplier=width_multiplier,
+        norm_momentum=norm_momentum,
+        norm_epsilon=norm_epsilon)
+
+    # Same settings expressed as configs and routed through the factory.
+    model_config = retinanet_cfg.RetinaNet(
+        backbone=backbones_cfg.Backbone(
+            type='mobilenet',
+            mobilenet=backbones_cfg.MobileNet(
+                model_id=model_id, width_multiplier=width_multiplier)),
+        norm_activation=common_cfg.NormActivation(
+            norm_momentum=norm_momentum, norm_epsilon=norm_epsilon))
+    built_network = factory.build_backbone(
+        input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
+        model_config=model_config)
+
+    self.assertEqual(direct_network.get_config(), built_network.get_config())
+
+  @combinations.generate(combinations.combine(model_id=['49'], ))
  def test_spinenet_creation(self, model_id):
    """Test creation of SpineNet models."""
    input_size = 128

--- a/official/vision/beta/modeling/backbones/mobilenet.py
+++ b/official/vision/beta/modeling/backbones/mobilenet.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains definitions of Mobilenet Networks."""
+
+from typing import Text, Optional, Dict
+
+# Import libraries
+import tensorflow as tf
+from official.vision.beta.modeling.layers import nn_blocks
+from official.vision.beta.modeling.layers import nn_layers
+
+layers = tf.keras.layers
+regularizers = tf.keras.regularizers
+
+
+class GlobalPoolingBlock(tf.keras.layers.Layer):
+  """Global average pooling that keeps the spatial axes as size-1 dims.
+
+  Maps a rank-4 input of shape [batch, height, width, channels] to an
+  output of shape [batch, 1, 1, channels], so that convolution blocks can
+  still be applied after the pooling step.
+
+  NOTE: the spatial axes are taken to be 1 and 2, i.e. channels-last
+  inputs, matching the default `input_specs` of the MobileNet backbone.
+  """
+
+  def __init__(self, **kwargs):
+    super(GlobalPoolingBlock, self).__init__(**kwargs)
+
+  def call(self, inputs, training=None):
+    """Averages `inputs` over its spatial axes.
+
+    Args:
+      inputs: `Tensor` of shape [batch, height, width, channels].
+      training: unused; accepted for Keras call-signature compatibility.
+
+    Returns:
+      `Tensor` of shape [batch, 1, 1, channels].
+    """
+    # A single reduction with keepdims=True replaces the previous
+    # GlobalAveragePooling2D + Reshape pair. Those two layers were
+    # re-instantiated on every call and required a statically known
+    # channel dimension.
+    return tf.reduce_mean(inputs, axis=[1, 2], keepdims=True)
+
+
+"""
+Architecture: https://arxiv.org/abs/1704.04861.
+
+"MobileNets: Efficient Convolutional Neural Networks for
+  Mobile Vision Applications"
+Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang,
+  Tobias Weyand, Marco Andreetto, Hartwig Adam
+"""
+# Block table for MobileNetV1. Each tuple in 'block_specs' is positional and
+# is matched against 'block_spec_schema' to build one block; 'filters' is the
+# base channel count before the width multiplier is applied.
+MNV1_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV1',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters'],
+    'block_specs': [
+        ('convbn', 3, 2, 32),
+        ('depsepconv', 3, 1, 64),
+        ('depsepconv', 3, 2, 128),
+        ('depsepconv', 3, 1, 128),
+        ('depsepconv', 3, 2, 256),
+        ('depsepconv', 3, 1, 256),
+        ('depsepconv', 3, 2, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 2, 1024),
+        ('depsepconv', 3, 1, 1024),
+    ]
+}
+
+"""
+Architecture: https://arxiv.org/abs/1801.04381
+
+"MobileNetV2: Inverted Residuals and Linear Bottlenecks"
+Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
+"""
+# Block table for MobileNetV2. Each tuple in 'block_specs' is positional and
+# is matched against 'block_spec_schema' to build one block; 'filters' is the
+# base channel count before the width multiplier is applied. Blank lines
+# separate groups of blocks that share the same number of filters.
+MNV2_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV2',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'expand_ratio'],
+    'block_specs': [
+        ('convbn', 3, 2, 32, None),
+
+        ('mbconv', 3, 1, 16, 1.),
+
+        ('mbconv', 3, 2, 24, 6.),
+        ('mbconv', 3, 1, 24, 6.),
+
+        ('mbconv', 3, 2, 32, 6.),
+        ('mbconv', 3, 1, 32, 6.),
+        ('mbconv', 3, 1, 32, 6.),
+
+        ('mbconv', 3, 2, 64, 6.),
+        ('mbconv', 3, 1, 64, 6.),
+        ('mbconv', 3, 1, 64, 6.),
+        ('mbconv', 3, 1, 64, 6.),
+
+        ('mbconv', 3, 1, 96, 6.),
+        ('mbconv', 3, 1, 96, 6.),
+        ('mbconv', 3, 1, 96, 6.),
+
+        ('mbconv', 3, 2, 160, 6.),
+        ('mbconv', 3, 1, 160, 6.),
+        ('mbconv', 3, 1, 160, 6.),
+
+        ('mbconv', 3, 1, 320, 6.),
+
+        # The final 1x1 projection uses stride 1, as in the paper. A stride
+        # of 2 here would take the overall output stride from 32 to 64 and
+        # throw away three quarters of the activations.
+        ('convbn', 1, 1, 1280, None),
+    ]
+}
+
+"""
+Architecture: https://arxiv.org/abs/1905.02244
+
+"Searching for MobileNetV3"
+Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, 
+Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. Le, Hartwig Adam
+"""
+# Block table for MobileNetV3-Large. Each tuple in 'block_specs' is positional
+# and is matched against 'block_spec_schema' to build one block; 'filters' is
+# the base channel count before the width multiplier is applied. The
+# 'gpooling' row carries no parameters, so all of its fields are None.
+MNV3Large_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV3Large',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'activation', 'se_ratio', 'expand_ratio',
+                          'use_normalization', 'use_bias'],
+    'block_specs': [
+        ('convbn', 3, 2, 16, 'hard_swish', None, None, True, False),
+
+        ('mbconv', 3, 1, 16, 'relu', None, 1., None, False),
+
+        ('mbconv', 3, 2, 24, 'relu', None, 4., None, False),
+        ('mbconv', 3, 1, 24, 'relu', None, 3., None, False),
+
+        ('mbconv', 5, 2, 40, 'relu', 1. / 4, 3., None, False),
+        ('mbconv', 5, 1, 40, 'relu', 1. / 4, 3., None, False),
+        ('mbconv', 5, 1, 40, 'relu', 1. / 4, 3., None, False),
+
+        ('mbconv', 3, 2, 80, 'hard_swish', None, 6., None, False),
+        ('mbconv', 3, 1, 80, 'hard_swish', None, 2.5, None, False),
+        ('mbconv', 3, 1, 80, 'hard_swish', None, 2.3, None, False),
+        ('mbconv', 3, 1, 80, 'hard_swish', None, 2.3, None, False),
+
+        ('mbconv', 3, 1, 112, 'hard_swish', 1. / 4, 6., None, False),
+        ('mbconv', 3, 1, 112, 'hard_swish', 1. / 4, 6., None, False),
+
+        ('mbconv', 5, 2, 160, 'hard_swish', 1. / 4, 6, None, False),
+        ('mbconv', 5, 1, 160, 'hard_swish', 1. / 4, 6, None, False),
+        ('mbconv', 5, 1, 160, 'hard_swish', 1. / 4, 6, None, False),
+
+        ('convbn', 1, 1, 960, 'hard_swish', None, None, True, False),
+        ('gpooling', None, None, None, None, None, None, None, None),
+        ('convbn', 1, 1, 1280, 'hard_swish', None, None, False, True),
+    ]
+}
+
+# Block table for MobileNetV3-Small. Each tuple in 'block_specs' is positional
+# and is matched against 'block_spec_schema' to build one block; 'filters' is
+# the base channel count before the width multiplier is applied. The
+# 'gpooling' row carries no parameters, so all of its fields are None.
+MNV3Small_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV3Small',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'activation', 'se_ratio', 'expand_ratio',
+                          'use_normalization', 'use_bias'],
+    'block_specs': [
+        ('convbn', 3, 2, 16, 'hard_swish', None, None, True, False),
+
+        ('mbconv', 3, 2, 16, 'relu', 1. / 4, 1, None, False),
+
+        ('mbconv', 3, 2, 24, 'relu', None, 72. / 16, None, False),
+        ('mbconv', 3, 1, 24, 'relu', None, 88. / 24, None, False),
+
+        ('mbconv', 5, 2, 40, 'hard_swish', 1. / 4, 4., None, False),
+        ('mbconv', 5, 1, 40, 'hard_swish', 1. / 4, 6., None, False),
+        ('mbconv', 5, 1, 40, 'hard_swish', 1. / 4, 6., None, False),
+
+        ('mbconv', 5, 1, 48, 'hard_swish', 1. / 4, 3., None, False),
+        ('mbconv', 5, 1, 48, 'hard_swish', 1. / 4, 3., None, False),
+
+        ('mbconv', 5, 2, 96, 'hard_swish', 1. / 4, 6., None, False),
+        ('mbconv', 5, 1, 96, 'hard_swish', 1. / 4, 6., None, False),
+        ('mbconv', 5, 1, 96, 'hard_swish', 1. / 4, 6., None, False),
+
+        ('convbn', 1, 1, 576, 'hard_swish', None, None, True, False),
+        ('gpooling', None, None, None, None, None, None, None, None),
+        ('convbn', 1, 1, 1024, 'hard_swish', None, None, False, True),
+    ]
+}
+
+"""
+The EdgeTPU version is taken from
+github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v3.py
+"""
+# Block table for the EdgeTPU variant of MobileNetV3. Each tuple in
+# 'block_specs' is positional and is matched against 'block_spec_schema' to
+# build one block. Unlike the Large/Small tables, the last two schema fields
+# here are 'use_residual' and 'use_depthwise'; they are None for 'convbn'
+# rows.
+MNV3EdgeTPU_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV3EdgeTPU',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'activation', 'se_ratio', 'expand_ratio',
+                          'use_residual', 'use_depthwise'],
+    'block_specs': [
+        ('convbn', 3, 2, 32, 'relu', None, None, None, None),
+
+        ('mbconv', 3, 1, 16, 'relu', None, 1., True, False),
+
+        ('mbconv', 3, 2, 32, 'relu', None, 8., True, False),
+        ('mbconv', 3, 1, 32, 'relu', None, 4., True, False),
+        ('mbconv', 3, 1, 32, 'relu', None, 4., True, False),
+        ('mbconv', 3, 1, 32, 'relu', None, 4., True, False),
+
+        ('mbconv', 3, 2, 48, 'relu', None, 8., True, False),
+        ('mbconv', 3, 1, 48, 'relu', None, 4., True, False),
+        ('mbconv', 3, 1, 48, 'relu', None, 4., True, False),
+        ('mbconv', 3, 1, 48, 'relu', None, 4., True, False),
+
+        ('mbconv', 3, 2, 96, 'relu', None, 8., True, True),
+        ('mbconv', 3, 1, 96, 'relu', None, 4., True, True),
+        ('mbconv', 3, 1, 96, 'relu', None, 4., True, True),
+        ('mbconv', 3, 1, 96, 'relu', None, 4., True, True),
+
+        ('mbconv', 3, 1, 96, 'relu', None, 8., False, True),
+        ('mbconv', 3, 1, 96, 'relu', None, 4., True, True),
+        ('mbconv', 3, 1, 96, 'relu', None, 4., True, True),
+        ('mbconv', 3, 1, 96, 'relu', None, 4., True, True),
+
+        ('mbconv', 5, 2, 160, 'relu', None, 8., True, True),
+        ('mbconv', 5, 1, 160, 'relu', None, 4., True, True),
+        ('mbconv', 5, 1, 160, 'relu', None, 4., True, True),
+        ('mbconv', 5, 1, 160, 'relu', None, 4., True, True),
+
+        ('mbconv', 3, 1, 192, 'relu', None, 8., True, True),
+
+        ('convbn', 1, 1, 1280, 'relu', None, None, None, None),
+    ]
+}
+
+# Maps each supported `model_id` string to its block-spec table. The MobileNet
+# constructor rejects any `model_id` that is not a key of this dict.
+SUPPORTED_SPECS_MAP = {
+    'MobileNetV1': MNV1_BLOCK_SPECS,
+    'MobileNetV2': MNV2_BLOCK_SPECS,
+    'MobileNetV3Large': MNV3Large_BLOCK_SPECS,
+    'MobileNetV3Small': MNV3Small_BLOCK_SPECS,
+    'MobileNetV3EdgeTPU': MNV3EdgeTPU_BLOCK_SPECS,
+}
+
+# Maps the 'block_fn' names used in the block-spec tables to the layer class
+# that implements each kind of block.
+BLOCK_FN_MAP = {
+    'convbn': nn_blocks.Conv2DBNBlock,
+    'depsepconv': nn_blocks.DepthwiseSeparableConvBlock,
+    'mbconv': nn_blocks.InvertedBottleneckBlock,
+    'gpooling': GlobalPoolingBlock,
+}
+
+
+class BlockSpec(object):
+  """A container class that specifies the block configuration for MobileNet."""
+
+  def __init__(self,
+               block_fn: Text = 'convbn',
+               kernel_size: int = 3,
+               strides: int = 1,
+               filters: int = 32,
+               use_bias: bool = False,
+               use_normalization: bool = True,
+               activation: Text = 'relu6',
+               expand_ratio: Optional[float] = 6.,
+               se_ratio: Optional[float] = None,
+               use_depthwise: bool = True,
+               use_residual: bool = True):
+    """Stores the given settings as attributes of the same name.
+
+    Args:
+      block_fn: `str` name of the block type, e.g. 'convbn'.
+      kernel_size: `int` kernel size of the block's convolution.
+      strides: `int` stride of the block.
+      filters: `int` number of output filters of the block.
+      use_bias: `bool` flag stored as `use_bias`.
+      use_normalization: `bool` flag stored as `use_normalization`.
+      activation: `str` name of the activation function.
+      expand_ratio: `float` expansion ratio; used by the inverted residual
+        block type.
+      se_ratio: `float` squeeze-and-excitation ratio; used by the inverted
+        residual block type with SE.
+      use_depthwise: `bool` flag stored as `use_depthwise`.
+      use_residual: `bool` flag stored as `use_residual`.
+    """
+    # Block type and convolution geometry.
+    self.block_fn = block_fn
+    self.kernel_size = kernel_size
+    self.strides = strides
+    self.filters = filters
+
+    # Bias, normalization and activation settings.
+    self.use_bias = use_bias
+    self.use_normalization = use_normalization
+    self.activation = activation
+
+    # Settings for the inverted residual block type.
+    self.expand_ratio = expand_ratio
+    self.se_ratio = se_ratio
+    self.use_depthwise = use_depthwise
+    self.use_residual = use_residual
+
+
+def block_spec_decoder(specs: Dict,
+                       width_multiplier: float,
+                       # set to 1 for mobilenetv1
+                       divisible_by: int = 8,
+                       finegrain_classification_mode: bool = True):
+  """Decodes the block-spec table of a MobileNet version.
+
+  Every row of `specs['block_specs']` is paired positionally with
+  `specs['block_spec_schema']` and turned into a `BlockSpec`. The filter
+  counts are then scaled by `width_multiplier` and rounded.
+
+  Args:
+    specs: `dict` specification of block specs of a mobilenet version. Must
+      contain the keys 'spec_name', 'block_spec_schema' and 'block_specs'.
+    width_multiplier: `float` multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    divisible_by: `int` ensures all inner dimensions are divisible by
+      this number.
+    finegrain_classification_mode: if True, the model
+      will keep the last layer large even for small multipliers. Following
+      https://arxiv.org/abs/1801.04381
+
+  Returns:
+    `List[BlockSpec]` defines structure of the base network.
+
+  Raises:
+    ValueError: if the spec has no blocks, or if any block row does not have
+      exactly one value per schema field.
+  """
+
+  spec_name = specs['spec_name']
+  block_spec_schema = specs['block_spec_schema']
+  block_specs = specs['block_specs']
+
+  if not block_specs:
+    raise ValueError('The block spec cannot be empty for {} !'.format(spec_name))
+
+  # Check every row, not only the first one: zip() below silently drops
+  # surplus values and leaves missing fields at their BlockSpec defaults, so
+  # a malformed row would otherwise go unnoticed.
+  for s in block_specs:
+    if len(s) != len(block_spec_schema):
+      raise ValueError('The block spec values {} do not match with '
+                       'the schema {}'.format(s, block_spec_schema))
+
+  decoded_specs = [
+      BlockSpec(**dict(zip(block_spec_schema, s))) for s in block_specs]
+
+  # This adjustment applies to V2 and V3. Dividing the last block's filters
+  # by the multiplier here is intended to offset the scaling applied below,
+  # so that the last layer stays large for multipliers smaller than 1.
+  if (spec_name != 'MobileNetV1'
+      and finegrain_classification_mode
+      and width_multiplier < 1.0):
+    decoded_specs[-1].filters /= width_multiplier
+
+  # Blocks without filters (e.g. global pooling rows, where it is None) are
+  # left untouched.
+  for ds in decoded_specs:
+    if ds.filters:
+      ds.filters = nn_layers.round_filters(filters=ds.filters,
+                                           multiplier=width_multiplier,
+                                           divisor=divisible_by,
+                                           min_depth=8)
+
+  return decoded_specs
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MobileNet(tf.keras.Model):
+  def __init__(self,
+               model_id: Text = 'MobileNetV2',
+               width_multiplier: float = 1.0,
+               input_specs: layers.InputSpec = layers.InputSpec(
+                   shape=[None, None, None, 3]),
+               # The followings are for hyper-parameter tuning
+               norm_momentum: float = 0.99,
+               norm_epsilon: float = 0.001,
+               kernel_initializer: Text = 'VarianceScaling',
+               kernel_regularizer: Optional[regularizers.Regularizer] = None,
+               bias_regularizer: Optional[regularizers.Regularizer] = None,
+               # The followings should be kept the same most of the times
+               output_stride: int = None,
+               min_depth: int = 8,
+               # divisible is not used in MobileNetV1
+               divisible_by: int = 8,
+               stochastic_depth_drop_rate: float = 0.0,
+               regularize_depthwise: bool = False,
+               use_sync_bn: bool = False,
+               # finegrain is not used in MobileNetV1
+               finegrain_classification_mode: bool = True,
+               **kwargs):
+    """
+
+    Args:
+      model_id: `str` version of MobileNet. The supported values
+        are 'MobileNetV1', 'MobileNetV2', 'MobileNetV3Large', 'MobileNetV3Small',
+        and 'MobileNetV3EdgeTPU'.
+      width_multiplier: `float` multiplier for the depth (number of channels)
+        for all convolution ops. The value must be greater than zero. Typical
+        usage will be to set this value in (0, 1) to reduce the number of
+        parameters or computation cost of the model.
+      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      kernel_initializer: `str` kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      output_stride: `int` specifies the requested ratio of input to
+        output spatial resolution. If not None, then we invoke atrous convolution
+        if necessary to prevent the network from reducing the spatial resolution
+        of activation maps. Allowed values are 8 (accurate fully convolutional
+        mode), 16 (fast fully convolutional mode), 32 (classification mode).
+      min_depth: `int` minimum depth (number of channels) for all conv ops.
+        Enforced when width_multiplier < 1, and not an active constraint when
+        width_multiplier >= 1.
+      divisible_by: `int` ensures all inner dimensions are divisible by
+        this number.
+      stochastic_depth_drop_rate: `float` drop rate for drop connect layer.
+      regularize_depthwise: if True, apply regularization on depthwise.
+      use_sync_bn: if True, use synchronized batch normalization.
+      finegrain_classification_mode: if True, the model
+        will keep the last layer large even for small multipliers. Following
+        https://arxiv.org/abs/1801.04381
+      **kwargs: keyword arguments to be passed.
+    """
+    if model_id not in SUPPORTED_SPECS_MAP:
+      raise ValueError('The MobileNet version {} '
+                       'is not supported'.format(model_id))
+
+    if width_multiplier <= 0:
+      raise ValueError('depth_multiplier is not greater than zero.')
+
+    if output_stride is not None:
+      if model_id == 'MobileNetV1':
+        if output_stride not in [8, 16, 32]:
+          raise ValueError('Only allowed output_stride values are 8, 16, 32.')
+      else:
+        if output_stride == 0 or (output_stride > 1 and output_stride % 2):
+          raise ValueError('Output stride must be None, 1 or a multiple of 2.')
+
+    self._model_id = model_id
+    self._input_specs = input_specs
+    self._width_multiplier = width_multiplier
+    self._min_depth = min_depth
+    self._output_stride = output_stride
+    self._divisible_by = divisible_by
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._regularize_depthwise = regularize_depthwise
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._finegrain_classification_mode = finegrain_classification_mode
+
+    inputs = tf.keras.Input(shape=input_specs.shape[1:])
+
+    block_specs = SUPPORTED_SPECS_MAP.get(model_id)
+    self._decoded_specs = block_spec_decoder(
+        specs=block_specs,
+        width_multiplier=self._width_multiplier,
+        divisible_by=self._get_divisible_by(),
+        finegrain_classification_mode=self._finegrain_classification_mode)
+
+    x, endpoints = self._mobilenet_base(inputs=inputs)
+
+    endpoints[max(endpoints.keys()) + 1] = x
+    self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
+
+    super(MobileNet, self).__init__(
+        inputs=inputs, outputs=endpoints, **kwargs)
+
+  def _get_divisible_by(self):
+    """Returns the channel divisor to use for the configured model.
+
+    'MobileNetV1' always uses 1; every other model id uses the
+    `divisible_by` value stored at construction time.
+    """
+    is_v1 = self._model_id == 'MobileNetV1'
+    return 1 if is_v1 else self._divisible_by
+
+  def _mobilenet_base(self,
+                      inputs: tf.Tensor
+                      ) -> (tf.Tensor, Dict[int, tf.Tensor]):
+    """Build the base MobileNet architecture.
+
+    Walks `self._decoded_specs` in order, instantiates one block per spec and
+    records the output of every block as an endpoint. Once the running stride
+    equals `self._output_stride`, further striding is replaced by stride 1
+    and the skipped stride is folded into the atrous (dilation) rate.
+
+    Args:
+      inputs: Input tensor of shape [batch_size, height, width, channels].
+
+    Returns:
+      A tuple of output Tensor and dictionary that collects endpoints. The
+      dictionary is keyed by the 1-based index of the block that produced the
+      tensor.
+
+    Raises:
+      ValueError: if `inputs` is not rank 4, or if a block spec names an
+        unknown `block_fn`.
+    """
+
+    input_shape = inputs.get_shape().as_list()
+    if len(input_shape) != 4:
+      raise ValueError('Expected rank 4 input, was: %d' % len(input_shape))
+
+    # The current_stride variable keeps track of the output stride of the
+    # activations, i.e., the running product of convolution strides up to the
+    # current network layer. This allows us to invoke atrous convolution
+    # whenever applying the next convolution would result in the activations
+    # having output stride larger than the target output_stride.
+    current_stride = 1
+
+    # The atrous convolution rate parameter.
+    rate = 1
+
+    net = inputs
+    endpoints = {}
+    endpoint_level = 1
+    for i, block_def in enumerate(self._decoded_specs):
+      block_name = 'block_group_{}_{}'.format(block_def.block_fn, i)
+      # A small catch for gpooling block with None strides
+      if not block_def.strides:
+        block_def.strides = 1
+      if (self._output_stride is not None
+          and current_stride == self._output_stride):
+        # If we have reached the target output_stride, then we need to employ
+        # atrous convolution with stride=1 and multiply the atrous rate by the
+        # current unit's stride for use in subsequent layers.
+        layer_stride = 1
+        layer_rate = rate
+        rate *= block_def.strides
+      else:
+        layer_stride = block_def.strides
+        layer_rate = 1
+        current_stride *= block_def.strides
+
+      if block_def.block_fn == 'convbn':
+        # NOTE(review): this branch uses the spec stride rather than
+        # `layer_stride`, so `output_stride` is not enforced here. Left as-is;
+        # confirm this is intended.
+        net = nn_blocks.Conv2DBNBlock(
+            filters=block_def.filters,
+            kernel_size=block_def.kernel_size,
+            strides=block_def.strides,
+            activation=block_def.activation,
+            use_bias=block_def.use_bias,
+            use_normalization=block_def.use_normalization,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer,
+            use_sync_bn=self._use_sync_bn,
+            norm_momentum=self._norm_momentum,
+            norm_epsilon=self._norm_epsilon
+        )(net)
+
+      elif block_def.block_fn == 'depsepconv':
+        # Use `layer_stride` (not the spec stride) so that the block stops
+        # downsampling once `output_stride` is reached; the skipped stride is
+        # carried by `layer_rate` instead.
+        net = nn_blocks.DepthwiseSeparableConvBlock(
+            filters=block_def.filters,
+            kernel_size=block_def.kernel_size,
+            strides=layer_stride,
+            activation=block_def.activation,
+            dilation_rate=layer_rate,
+            regularize_depthwise=self._regularize_depthwise,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            use_sync_bn=self._use_sync_bn,
+            norm_momentum=self._norm_momentum,
+            norm_epsilon=self._norm_epsilon,
+        )(net)
+
+      elif block_def.block_fn == 'mbconv':
+        # NOTE(review): the default is `rate`, which has already been
+        # multiplied by this block's stride above, not `layer_rate`. Confirm
+        # this is intended.
+        use_rate = rate
+        if layer_rate > 1 and block_def.kernel_size != 1:
+          # We will apply atrous rate in the following cases:
+          # 1) When kernel_size is not in params, the operation then uses
+          #   default kernel size 3x3.
+          # 2) When kernel_size is in params, and if the kernel_size is not
+          #   equal to (1, 1) (there is no need to apply atrous convolution to
+          #   any 1x1 convolution).
+          use_rate = layer_rate
+        in_filters = net.shape.as_list()[-1]
+        net = nn_blocks.InvertedBottleneckBlock(
+            in_filters=in_filters,
+            out_filters=block_def.filters,
+            kernel_size=block_def.kernel_size,
+            strides=layer_stride,
+            expand_ratio=block_def.expand_ratio,
+            se_ratio=block_def.se_ratio,
+            activation=block_def.activation,
+            use_depthwise=block_def.use_depthwise,
+            use_residual=block_def.use_residual,
+            dilation_rate=use_rate,
+            regularize_depthwise=self._regularize_depthwise,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer,
+            use_sync_bn=self._use_sync_bn,
+            norm_momentum=self._norm_momentum,
+            norm_epsilon=self._norm_epsilon,
+            stochastic_depth_drop_rate=self._stochastic_depth_drop_rate,
+            divisible_by=self._get_divisible_by(),
+            target_backbone='mobilenet'
+        )(net)
+
+      elif block_def.block_fn == 'gpooling':
+        net = GlobalPoolingBlock()(net)
+
+      else:
+        raise ValueError('Unknown block type {} for layer {}'.format(
+            block_def.block_fn, i))
+
+      # The endpoint stores the raw block output; the tensor carried to the
+      # next block is an identity op named after the block.
+      endpoints[endpoint_level] = net
+      endpoint_level += 1
+      net = tf.identity(net, name=block_name)
+    return net, endpoints
+
+  def get_config(self):
+    """Returns the constructor arguments stored on this instance as a dict.
+
+    The keys match the keyword names read back by `from_config`.
+    """
+    return dict(
+        model_id=self._model_id,
+        width_multiplier=self._width_multiplier,
+        min_depth=self._min_depth,
+        output_stride=self._output_stride,
+        divisible_by=self._divisible_by,
+        stochastic_depth_drop_rate=self._stochastic_depth_drop_rate,
+        regularize_depthwise=self._regularize_depthwise,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer,
+        use_sync_bn=self._use_sync_bn,
+        norm_momentum=self._norm_momentum,
+        norm_epsilon=self._norm_epsilon,
+        finegrain_classification_mode=self._finegrain_classification_mode,
+    )
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds an instance by passing `config` entries as keyword arguments.
+
+    Args:
+      config: mapping of constructor keyword names to values.
+      custom_objects: accepted for signature compatibility; not used here.
+
+    Returns:
+      A new instance of `cls`.
+    """
+    instance = cls(**config)
+    return instance
+
+  @property
+  def output_specs(self):
+    """Returns the {level: TensorShape} mapping stored for the model outputs."""
+    specs = self._output_specs
+    return specs
--- a/official/vision/beta/modeling/backbones/mobilenet_test.py
+++ b/official/vision/beta/modeling/backbones/mobilenet_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MobileNet."""
+
+# Import libraries
+from absl.testing import parameterized
+from itertools import product
+
+import tensorflow as tf
+
+from official.vision.beta.modeling.backbones import mobilenet
+
+
+class MobileNetTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for the MobileNet backbone (V1, V2, V3 Large/Small/EdgeTPU)."""
+
+  def _build_endpoints(self, model_id, width_multiplier, input_size):
+    """Builds a MobileNet and returns its endpoints for a square RGB input.
+
+    Args:
+      model_id: `str` MobileNet variant name.
+      width_multiplier: `float` multiplier passed to the backbone.
+      input_size: `int` height and width of the input image.
+
+    Returns:
+      The value returned by calling the network on a batch-size-1 input.
+    """
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    network = mobilenet.MobileNet(model_id=model_id,
+                                  width_multiplier=width_multiplier)
+
+    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
+    return network(inputs)
+
+  def _assert_endpoint_shapes(self, endpoints, input_size, expected):
+    """Asserts the shapes of selected endpoints.
+
+    Args:
+      endpoints: endpoints indexed by level, as returned by the network.
+      input_size: `int` height and width of the input image.
+      expected: iterable of (level, log2 of the downsampling factor, number
+        of filters) tuples.
+    """
+    for level, log2_stride, filters in expected:
+      # Integer division keeps the expected spatial size an `int`.
+      size = input_size // 2 ** log2_stride
+      self.assertAllEqual([1, size, size, filters],
+                          endpoints[level].shape.as_list())
+
+  def _assert_param_count(self, model_id, width_multiplier, expected_params):
+    """Asserts the parameter count, then runs the network on a 224x224 input.
+
+    Args:
+      model_id: `str` MobileNet variant name.
+      width_multiplier: `float` multiplier passed to the backbone.
+      expected_params: `int` expected result of `count_params()`.
+    """
+    input_size = 224
+    network = mobilenet.MobileNet(model_id=model_id,
+                                  width_multiplier=width_multiplier)
+    self.assertEqual(network.count_params(), expected_params)
+
+    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
+    _ = network(inputs)
+
+  @parameterized.parameters('MobileNetV1', 'MobileNetV2',
+                            'MobileNetV3Large', 'MobileNetV3Small',
+                            'MobileNetV3EdgeTPU')
+  def test_serialize_deserialize(self, model_id):
+    """Test that get_config/from_config round-trip the configuration."""
+    # Create a network object that sets all of its config options.
+    kwargs = dict(
+        model_id=model_id,
+        width_multiplier=1.0,
+        stochastic_depth_drop_rate=None,
+        use_sync_bn=False,
+        kernel_initializer='VarianceScaling',
+        kernel_regularizer=None,
+        bias_regularizer=None,
+        norm_momentum=0.99,
+        norm_epsilon=0.001,
+        output_stride=None,
+        min_depth=8,
+        divisible_by=8,
+        regularize_depthwise=False,
+        finegrain_classification_mode=True
+    )
+    network = mobilenet.MobileNet(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(network.get_config(), expected_config)
+
+    # Create another network object from the first object's config.
+    new_network = mobilenet.MobileNet.from_config(network.get_config())
+
+    # Validate that the config can be forced to JSON.
+    _ = new_network.to_json()
+
+    # If the serialization was successful, the new config should match the old.
+    self.assertAllEqual(network.get_config(), new_network.get_config())
+
+  @parameterized.parameters(
+      product((1, 3),
+              ('MobileNetV1', 'MobileNetV2',
+               'MobileNetV3Large', 'MobileNetV3Small',
+               'MobileNetV3EdgeTPU'))
+  )
+  def test_input_specs(self, input_dim, model_id):
+    """Test different input feature dimensions."""
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim])
+    network = mobilenet.MobileNet(model_id=model_id, input_specs=input_specs)
+
+    inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1)
+    _ = network(inputs)
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v1_creation(self, input_size):
+    """Test creation of MobileNetV1 models."""
+    endpoints = self._build_endpoints('MobileNetV1', 0.75, input_size)
+    self._assert_endpoint_shapes(
+        endpoints, input_size,
+        [(1, 1, 24), (2, 1, 48), (3, 2, 96), (4, 2, 96)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v2_creation(self, input_size):
+    """Test creation of MobileNetV2 models."""
+    endpoints = self._build_endpoints('MobileNetV2', 1.0, input_size)
+    self._assert_endpoint_shapes(
+        endpoints, input_size,
+        [(1, 1, 32), (2, 1, 16), (3, 2, 24), (4, 2, 24)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v3_small_creation(self, input_size):
+    """Test creation of MobileNetV3Small models."""
+    endpoints = self._build_endpoints('MobileNetV3Small', 0.75, input_size)
+    self._assert_endpoint_shapes(
+        endpoints, input_size,
+        [(1, 1, 16), (2, 2, 16), (3, 3, 24), (4, 3, 24)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v3_large_creation(self, input_size):
+    """Test creation of MobileNetV3Large models."""
+    endpoints = self._build_endpoints('MobileNetV3Large', 0.75, input_size)
+    self._assert_endpoint_shapes(
+        endpoints, input_size,
+        [(1, 1, 16), (2, 1, 16), (3, 2, 24), (4, 2, 24)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v3_edgetpu_creation(self, input_size):
+    """Test creation of MobileNetV3EdgeTPU models."""
+    endpoints = self._build_endpoints('MobileNetV3EdgeTPU', 0.75, input_size)
+    self._assert_endpoint_shapes(
+        endpoints, input_size,
+        [(1, 1, 24), (2, 1, 16), (3, 2, 24), (4, 2, 24)])
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v1_scaling(self, width_multiplier):
+    """Test the MobileNetV1 parameter count per width multiplier."""
+    mobilenet_v1_params = {
+        1.0: 3228864,
+        0.75: 1832976
+    }
+    self._assert_param_count('MobileNetV1', width_multiplier,
+                             mobilenet_v1_params[width_multiplier])
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v2_scaling(self, width_multiplier):
+    """Test the MobileNetV2 parameter count per width multiplier."""
+    mobilenet_v2_params = {
+        1.0: 2257984,
+        0.75: 1382064
+    }
+    self._assert_param_count('MobileNetV2', width_multiplier,
+                             mobilenet_v2_params[width_multiplier])
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v3_large_scaling(self, width_multiplier):
+    """Test the MobileNetV3Large parameter count per width multiplier."""
+    mobilenet_v3_large_params = {
+        1.0: 4226432,
+        0.75: 2731616
+    }
+    self._assert_param_count('MobileNetV3Large', width_multiplier,
+                             mobilenet_v3_large_params[width_multiplier])
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v3_small_scaling(self, width_multiplier):
+    """Test the MobileNetV3Small parameter count per width multiplier."""
+    mobilenet_v3_small_params = {
+        1.0: 1529968,
+        0.75: 1026552
+    }
+    self._assert_param_count('MobileNetV3Small', width_multiplier,
+                             mobilenet_v3_small_params[width_multiplier])
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v3_edgetpu_scaling(self, width_multiplier):
+    """Test the MobileNetV3EdgeTPU parameter count per width multiplier."""
+    mobilenet_v3_edgetpu_params = {
+        1.0: 2849312,
+        0.75: 1737288
+    }
+    self._assert_param_count('MobileNetV3EdgeTPU', width_multiplier,
+                             mobilenet_v3_edgetpu_params[width_multiplier])
--- a/official/vision/beta/modeling/layers/nn_blocks.py
+++ b/official/vision/beta/modeling/layers/nn_blocks.py
@@ -14,10 +14,10 @@
 # ==============================================================================
 """Contains common building blocks for neural networks."""

-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Text

 # Import libraries
-
+from absl import logging
 import tensorflow as tf

 from official.modeling import tf_utils
@@ -391,9 +391,18 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
+               se_inner_activation='relu',
+               se_gating_activation='sigmoid',
+               depthwise_activation=None,
               use_sync_bn=False,
+               dilation_rate=1,
+               divisible_by=1,
+               regularize_depthwise=False,
+               use_depthwise=True,
+               use_residual=True,
               norm_momentum=0.99,
               norm_epsilon=0.001,
+               target_backbone='efficientnet',
               **kwargs):
    """An inverted bottleneck block with BN after convolutions.

@@ -414,7 +423,16 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
        Default to None.
      activation: `str` name of the activation function.
+      depthwise_activation: `str` name of the activation function for depthwise only.
      use_sync_bn: if True, use synchronized batch normalization.
+      dilation_rate: `int` an integer specifying the dilation rate to use for
+        dilated convolution. Can be a single integer to specify the same value
+        for all spatial dimensions.
+      divisible_by: `int` ensures all inner dimensions are divisible by this number.
+      regularize_depthwise: `bool` whether or not to apply regularization on depthwise.
+      use_depthwise: `bool` if True, use a depthwise conv; otherwise use a fused conv.
+      use_residual: `bool` whether to include residual connection between input
+        and output.
      norm_momentum: `float` normalization omentum for the moving average.
      norm_epsilon: `float` small float added to variance to avoid dividing by
        zero.
@@ -428,15 +446,26 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
    self._strides = strides
    self._kernel_size = kernel_size
    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._dilation_rate = dilation_rate
    self._use_sync_bn = use_sync_bn
+    self._regularize_depthwise = regularize_depthwise
+    self._use_depthwise = use_depthwise
+    self._use_residual = use_residual
    self._activation = activation
+    self._se_inner_activation = se_inner_activation
+    self._se_gating_activation = se_gating_activation
+    self._depthwise_activation = depthwise_activation
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
+    self._target_backbone = target_backbone

+    if target_backbone == 'mobilenet':
+      self._se_gating_activation = 'hard_sigmoid'
    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
@@ -446,14 +475,32 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)
+    if not depthwise_activation:
+      self._depthwise_activation = activation
+    self._depthwise_activation_fn = tf_utils.get_activation(
+        self._depthwise_activation)
+    if regularize_depthwise:
+      self._depthsize_regularizer = kernel_regularizer
+    else:
+      self._depthsize_regularizer = None

  def build(self, input_shape):
-    if self._expand_ratio != 1:
+    expand_filters = self._in_filters
+    if self._expand_ratio > 1:
      # First 1x1 conv for channel expansion.
+      expand_filters = nn_layers.make_divisible(
+          self._in_filters * self._expand_ratio, self._divisible_by)
+      logging.info('expand_filter: {}, divisible_version {}'.format(
+          self._in_filters * self._expand_ratio, expand_filters
+      ))
+      expand_kernel = 1 if self._use_depthwise else self._kernel_size
+      expand_stride = 1 if self._use_depthwise else self._strides
+
      self._conv0 = tf.keras.layers.Conv2D(
-          filters=self._in_filters * self._expand_ratio,
-          kernel_size=1,
-          strides=1,
+          filters=expand_filters,
+          kernel_size=expand_kernel,
+          strides=expand_stride,
+          padding='same',
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
@@ -463,29 +510,39 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)

-    # Depthwise conv.
-    self._conv1 = tf.keras.layers.DepthwiseConv2D(
-        kernel_size=(self._kernel_size, self._kernel_size),
-        strides=self._strides,
-        padding='same',
-        use_bias=False,
-        depthwise_initializer=self._kernel_initializer,
-        depthwise_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer)
-    self._norm1 = self._norm(
-        axis=self._bn_axis,
-        momentum=self._norm_momentum,
-        epsilon=self._norm_epsilon)
+    if self._use_depthwise:
+      # Depthwise conv.
+      self._conv1 = tf.keras.layers.DepthwiseConv2D(
+          kernel_size=(self._kernel_size, self._kernel_size),
+          strides=self._strides,
+          padding='same',
+          depth_multiplier=1,
+          dilation_rate=self._dilation_rate,
+          use_bias=False,
+          depthwise_initializer=self._kernel_initializer,
+          depthwise_regularizer=self._depthsize_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm1 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)

    # Squeeze and excitation.
    if self._se_ratio is not None and self._se_ratio > 0 and self._se_ratio <= 1:
+      logging.info('Use Squeeze and excitation.')
+      in_filters = self._in_filters
+      if self._target_backbone == 'mobilenet':
+        in_filters = expand_filters
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
-          in_filters=self._in_filters,
+          in_filters=in_filters,
+          out_filters=expand_filters,
          se_ratio=self._se_ratio,
-          expand_ratio=self._expand_ratio,
+          divisible_by=self._divisible_by,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
-          bias_regularizer=self._bias_regularizer)
+          bias_regularizer=self._bias_regularizer,
+          activation=self._se_inner_activation,
+          gating_activation=self._se_gating_activation)
    else:
      self._squeeze_excitation = None

@@ -494,6 +551,7 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
        filters=self._out_filters,
        kernel_size=1,
        strides=1,
+        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
@@ -519,12 +577,20 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
        'strides': self._strides,
        'kernel_size': self._kernel_size,
        'se_ratio': self._se_ratio,
+        'divisible_by': self._divisible_by,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
+        'se_inner_activation': self._se_inner_activation,
+        'se_gating_activation': self._se_gating_activation,
+        'depthwise_activation': self._depthwise_activation,
+        'dilation_rate': self._dilation_rate,
        'use_sync_bn': self._use_sync_bn,
+        'regularize_depthwise': self._regularize_depthwise,
+        'use_depthwise': self._use_depthwise,
+        'use_residual': self._use_residual,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
@@ -533,16 +599,17 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):

  def call(self, inputs, training=None):
    shortcut = inputs
-    if self._expand_ratio != 1:
+    if self._expand_ratio > 1:
      x = self._conv0(inputs)
      x = self._norm0(x)
      x = self._activation_fn(x)
    else:
      x = inputs

-    x = self._conv1(x)
-    x = self._norm1(x)
-    x = self._activation_fn(x)
+    if self._use_depthwise:
+      x = self._conv1(x)
+      x = self._norm1(x)
+      x = self._depthwise_activation_fn(x)

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)
@@ -550,7 +617,9 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
    x = self._conv2(x)
    x = self._norm2(x)

-    if self._in_filters == self._out_filters and self._strides == 1:
+    if (self._use_residual and
+        self._in_filters == self._out_filters and
+        self._strides == 1):
      if self._stochastic_depth:
        x = self._stochastic_depth(x, training=training)
      x = tf.add(x, shortcut)
@@ -571,7 +640,7 @@ class ResidualInner(tf.keras.layers.Layer):
      filters: int,
      strides: int,
      kernel_initializer: Union[
-          str, Callable[..., tf.keras.initializers.Initializer]]
+        str, Callable[..., tf.keras.initializers.Initializer]]
      = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
@@ -694,7 +763,7 @@ class BottleneckResidualInner(tf.keras.layers.Layer):
      filters: int,
      strides: int,
      kernel_initializer: Union[
-          str, Callable[..., tf.keras.initializers.Initializer]]
+        str, Callable[..., tf.keras.initializers.Initializer]]
      = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
@@ -887,7 +956,7 @@ class ReversibleLayer(tf.keras.layers.Layer):

    @tf.custom_gradient
    def reversible(x: tf.Tensor) -> Tuple[
-        tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor], List[tf.Tensor]]]]:
+      tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor], List[tf.Tensor]]]]:
      """Implements Algorithm 1 in RevNet paper.

      Paper: https://arxiv.org/pdf/1707.04585.pdf
@@ -926,7 +995,7 @@ class ReversibleLayer(tf.keras.layers.Layer):

      def grad_fn(dy: tf.Tensor,
                  variables: Optional[List[tf.Variable]] = None,
-                 ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
+                  ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
        """Given dy calculate (dy/dx)|_{x_{input}} using f/g."""
        if irreversible or not self._manual_grads:
          grads_combined = fwdtape.gradient(
@@ -947,11 +1016,11 @@ class ReversibleLayer(tf.keras.layers.Layer):
          self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]

          # Algorithm 1 in paper (line # documented in-line)
-          z1 = y1_nograd                                         # line 2
+          z1 = y1_nograd  # line 2
          with tf.GradientTape() as gtape:
            gtape.watch(z1)
            g_z1 = self._g(z1, training=training)
-          x2 = y2_nograd - g_z1                                  # line 3
+          x2 = y2_nograd - g_z1  # line 3

          with tf.GradientTape() as ftape:
            ftape.watch(x2)
@@ -963,16 +1032,16 @@ class ReversibleLayer(tf.keras.layers.Layer):
              g_z1,
              [z1] + self._g.trainable_variables,
              output_gradients=dy2)
-          dz1 = dy1 + g_grads_combined[0]                        # line 5
-          dwg = g_grads_combined[1:]                             # line 9
+          dz1 = dy1 + g_grads_combined[0]  # line 5
+          dwg = g_grads_combined[1:]  # line 9

          f_grads_combined = ftape.gradient(
              f_x2,
              [x2] + self._f.trainable_variables,
              output_gradients=dz1)
-          dx2 = dy2 + f_grads_combined[0]                        # line 6
-          dwf = f_grads_combined[1:]                             # line 8
-          dx1 = dz1                                              # line 7
+          dx2 = dy2 + f_grads_combined[0]  # line 6
+          dwf = f_grads_combined[1:]  # line 8
+          dx1 = dz1  # line 7

          # Pack the input and variable gradients.
          dx = tf.concat([dx1, dx2], axis=self._axis)
@@ -989,3 +1058,241 @@ class ReversibleLayer(tf.keras.layers.Layer):

    activations = reversible(inputs)
    return activations
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
+  """A depthwise separable convolution block with batch normalization."""
+
+  def __init__(self,
+               filters: int,
+               kernel_size: int = 3,
+               strides: int = 1,
+               regularize_depthwise: bool = False,
+               activation: Text = 'relu6',
+               kernel_initializer: Text = 'VarianceScaling',
+               kernel_regularizer: Optional[
+                 tf.keras.regularizers.Regularizer] = None,
+               dilation_rate: int = 1,
+               use_sync_bn: bool = False,
+               norm_momentum: float = 0.99,
+               norm_epsilon: float = 0.001,
+               **kwargs):
+    """Initializes a depthwise separable convolution block.
+
+    The block applies a depthwise convolution followed by a pointwise (1x1)
+    convolution, each followed by batch normalization and the activation.
+
+    Args:
+      filters: `int` number of output filters of the pointwise convolution.
+      kernel_size: `int` an integer specifying the height and width of the
+        depthwise convolution window.
+      strides: `int` stride of the depthwise convolution. If greater than 1,
+        this block will downsample the input.
+      regularize_depthwise: if True, apply `kernel_regularizer` to the
+        depthwise convolution as well.
+      activation: `str` name of the activation function.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      dilation_rate: `int` dilation rate of the depthwise convolution, used
+        for all spatial dimensions.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
+    self._filters = filters
+    self._kernel_size = kernel_size
+    self._strides = strides
+    self._activation = activation
+    self._regularize_depthwise = regularize_depthwise
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._dilation_rate = dilation_rate
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+    if regularize_depthwise:
+      self._depthsize_regularizer = kernel_regularizer
+    else:
+      self._depthsize_regularizer = None
+
+  def get_config(self):
+    """Returns the constructor arguments merged over the base layer config."""
+    config = {
+        'filters': self._filters,
+        'kernel_size': self._kernel_size,
+        'strides': self._strides,
+        'regularize_depthwise': self._regularize_depthwise,
+        'activation': self._activation,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'dilation_rate': self._dilation_rate,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    return {**super(DepthwiseSeparableConvBlock, self).get_config(), **config}
+
+  def build(self, input_shape):
+    """Creates the depthwise conv, pointwise conv and their norm layers."""
+    self._dwconv0 = tf.keras.layers.DepthwiseConv2D(
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        padding='same',
+        depth_multiplier=1,
+        dilation_rate=self._dilation_rate,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._depthsize_regularizer,
+        use_bias=False)
+    self._norm0 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    super(DepthwiseSeparableConvBlock, self).build(input_shape)
+
+  def call(self, inputs, training=None):
+    """Applies depthwise conv -> norm -> act, then 1x1 conv -> norm -> act."""
+    x = self._dwconv0(inputs)
+    x = self._norm0(x)
+    x = self._activation_fn(x)
+    x = self._conv1(x)
+    x = self._norm1(x)
+    return self._activation_fn(x)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class Conv2DBNBlock(tf.keras.layers.Layer):
+  """A convolution block with batch normalization."""
+
+  def __init__(self,
+               filters: int,
+               kernel_size: int = 3,
+               strides: int = 1,
+               use_bias: bool = False,
+               activation: Text = 'relu6',
+               kernel_initializer: Text = 'VarianceScaling',
+               kernel_regularizer: Optional[
+                 tf.keras.regularizers.Regularizer] = None,
+               bias_regularizer: Optional[
+                 tf.keras.regularizers.Regularizer] = None,
+               use_normalization: bool = True,
+               use_sync_bn: bool = False,
+               norm_momentum: float = 0.99,
+               norm_epsilon: float = 0.001,
+               **kwargs):
+    """Initializes a convolution block with optional batch normalization.
+
+    Args:
+      filters: `int` number of output filters (channels) produced by the
+        convolution.
+      kernel_size: `int` an integer specifying the height and width of the
+        2D convolution window.
+      strides: `int` stride of the convolution. If greater than 1, this block
+        will downsample the input.
+      use_bias: if True, use bias in the convolution layer.
+      activation: `str` name of the activation function.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      use_normalization: if True, use batch normalization.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(Conv2DBNBlock, self).__init__(**kwargs)
+    self._filters = filters
+    self._kernel_size = kernel_size
+    self._strides = strides
+    self._activation = activation
+    self._use_bias = use_bias
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._use_normalization = use_normalization
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def get_config(self):
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'kernel_size': self._kernel_size,
+        'use_bias': self._use_bias,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'use_normalization': self._use_normalization,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(Conv2DBNBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def build(self, input_shape):
+    """Creates the convolution and, if enabled, its normalization layer."""
+    self._conv0 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        padding='same',
+        use_bias=self._use_bias,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    if self._use_normalization:
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+    super(Conv2DBNBlock, self).build(input_shape)
+
+  def call(self, inputs, training=None):
+    x = self._conv0(inputs)
+    if self._use_normalization:
+      x = self._norm0(x)
+    return self._activation_fn(x)
\ No newline at end of file
--- a/official/vision/beta/modeling/layers/nn_layers.py
+++ b/official/vision/beta/modeling/layers/nn_layers.py
@@ -15,45 +15,95 @@
 """Contains common building blocks for neural networks."""

 # Import libraries
+from absl import logging
+from typing import Optional
+
 import tensorflow as tf

 from official.modeling import tf_utils


+def make_divisible(value: float,
+                   divisor: int,
+                   min_value: Optional[float] = None
+                   ) -> int:
+  """Rounds `value` to the nearest multiple of `divisor`, at least `min_value`.
+
+  Args:
+    value: `float` original value.
+    divisor: `int` the divisor the rounded value is snapped to.
+    min_value: `float` minimum value threshold; defaults to `divisor`.
+
+  Returns:
+    The adjusted value as an `int`; bumped by `divisor` if rounding lost >10%.
+  """
+  if min_value is None:
+    min_value = divisor
+  new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+  # Make sure that round down does not go down by more than 10%.
+  if new_value < 0.9 * value:
+    new_value += divisor
+  return int(new_value)  # A float `min_value` can win max(); honor `-> int`.
+
+
+def round_filters(filters: int,
+                  multiplier: float,
+                  divisor: int = 8,
+                  min_depth: Optional[int] = None,
+                  skip: bool = False):
+  """Scales `filters` by `multiplier` and snaps it via `make_divisible`."""
+  # Rounding is bypassed on request or when the multiplier is falsy (None/0).
+  rounding_disabled = skip or not multiplier
+  if rounding_disabled:
+    return filters
+
+  scaled_filters = filters * multiplier
+  rounded = make_divisible(
+      value=scaled_filters, divisor=divisor, min_value=min_depth)
+  logging.info('round_filter input=%s output=%s', filters, rounded)
+  return int(rounded)
+
+
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class SqueezeExcitation(tf.keras.layers.Layer):
  """Squeeze and excitation layer."""

  def __init__(self,
               in_filters,
+               out_filters,
               se_ratio,
-               expand_ratio,
+               divisible_by=1,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
+               gating_activation='sigmoid',
               **kwargs):
    """Implementation for squeeze and excitation.

    Args:
      in_filters: `int` number of filters of the input tensor.
+      out_filters: `int` number of filters of the output tensor.
      se_ratio: `float` or None. If not None, se ratio for the squeeze and
        excitation layer.
-      expand_ratio: `int` expand_ratio for a MBConv block.
+      divisible_by: `int` ensures all inner dimensions are divisible by this number.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
        Default to None.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
        Default to None.
      activation: `str` name of the activation function.
+      gating_activation: `str` name of the activation function for final gating function.
      **kwargs: keyword arguments to be passed.
    """
    super(SqueezeExcitation, self).__init__(**kwargs)

    self._in_filters = in_filters
+    self._out_filters = out_filters
    self._se_ratio = se_ratio
-    self._expand_ratio = expand_ratio
+    self._divisible_by = divisible_by
    self._activation = activation
+    self._gating_activation = gating_activation
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
@@ -62,9 +112,12 @@ class SqueezeExcitation(tf.keras.layers.Layer):
    else:
      self._spatial_axis = [2, 3]
    self._activation_fn = tf_utils.get_activation(activation)
+    self._gating_activation_fn = tf_utils.get_activation(gating_activation)

  def build(self, input_shape):
-    num_reduced_filters = max(1, int(self._in_filters * self._se_ratio))
+    num_reduced_filters = make_divisible(
+        max(1, int(self._in_filters * self._se_ratio)),
+        divisor=self._divisible_by)

    self._se_reduce = tf.keras.layers.Conv2D(
        filters=num_reduced_filters,
@@ -77,7 +130,7 @@ class SqueezeExcitation(tf.keras.layers.Layer):
        bias_regularizer=self._bias_regularizer)

    self._se_expand = tf.keras.layers.Conv2D(
-        filters=self._in_filters * self._expand_ratio,
+        filters=self._out_filters,
        kernel_size=1,
        strides=1,
        padding='same',
@@ -91,22 +144,24 @@ class SqueezeExcitation(tf.keras.layers.Layer):
  def get_config(self):
    config = {
        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
        'se_ratio': self._se_ratio,
-        'expand_ratio': self._expand_ratio,
+        'divisible_by': self._divisible_by,
        'strides': self._strides,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
+        'gating_activation': self._gating_activation,
    }
    base_config = super(SqueezeExcitation, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
-    x = self._se_expand(self._activation_fn(self._se_reduce(x)))
-
-    return tf.sigmoid(x) * inputs
+    x = self._activation_fn(self._se_reduce(x))
+    x = self._gating_activation_fn(self._se_expand(x))
+    return x * inputs


 @tf.keras.utils.register_keras_serializable(package='Vision')