Merge pull request #9317 from tensorflow:mobilenet-beta

PiperOrigin-RevId: 336771016

Merge pull request #9317 from tensorflow:mobilenet-beta
PiperOrigin-RevId: 336771016
c894eafe · A. Unique TensorFlower · fcb43c38 · d68c7b96 · c894eafe · c894eafe
Commit c894eafe authored Oct 12, 2020 by A. Unique TensorFlower
19 changed files
--- a/official/modeling/activations/__init__.py
+++ b/official/modeling/activations/__init__.py
@@ -14,6 +14,8 @@
 # ==============================================================================
 """Activations package definition."""
 from official.modeling.activations.gelu import gelu
+from official.modeling.activations.relu import relu6
+from official.modeling.activations.sigmoid import hard_sigmoid
 from official.modeling.activations.swish import hard_swish
 from official.modeling.activations.swish import identity
 from official.modeling.activations.swish import simple_swish
--- a/official/modeling/activations/relu.py
+++ b/official/modeling/activations/relu.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Customized Relu activation."""
+
+import tensorflow as tf
+
+
+# NOTE(review): registered under package='Text', the same package used for
+# hard_sigmoid in sigmoid.py; confirm this namespace is intended here.
+@tf.keras.utils.register_keras_serializable(package='Text')
+def relu6(features):
+  """Computes the Relu6 activation function.
+
+  Converts `features` to a tensor and applies `tf.nn.relu6` to it.
+
+  Args:
+    features: A `Tensor` (or a value convertible to one) representing
+      preactivation values.
+
+  Returns:
+    The activation value, as returned by `tf.nn.relu6`.
+  """
+  features = tf.convert_to_tensor(features)
+  return tf.nn.relu6(features)
--- a/official/modeling/activations/relu_test.py
+++ b/official/modeling/activations/relu_test.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the customized Relu activation."""
+
+import tensorflow as tf
+
+from tensorflow.python.keras import \
+  keras_parameterized  # pylint: disable=g-direct-tensorflow-import
+from official.modeling import activations
+
+
+@keras_parameterized.run_all_keras_modes
+class CustomizedReluTest(keras_parameterized.TestCase):
+  """Checks `activations.relu6` against `tf.nn.relu6`."""
+
+  def test_relu6(self):
+    """The wrapper must agree with `tf.nn.relu6` on a mixed-sign input."""
+    # Covers negative, zero and positive entries. No entry exceeds 6, so the
+    # upper clamp is not exercised by this input.
+    features = [[.25, 0, -.25], [-1, -2, 3]]
+    customized_relu6_data = activations.relu6(features)
+    relu6_data = tf.nn.relu6(features)
+    self.assertAllClose(customized_relu6_data, relu6_data)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/modeling/activations/sigmoid.py
+++ b/official/modeling/activations/sigmoid.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Customized Sigmoid activation."""
+
+import tensorflow as tf
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+def hard_sigmoid(features):
+  """Computes the hard sigmoid activation function.
+
+  Evaluates the piecewise-linear expression `relu6(features + 3) * 0.16667`.
+
+  Args:
+    features: A `Tensor` (or a value convertible to one) representing
+      preactivation values.
+
+  Returns:
+    The activation value.
+  """
+  features = tf.convert_to_tensor(features)
+  # NOTE(review): 0.16667 is a rounded 1/6, so the saturated output is
+  # 6 * 0.16667 = 1.00002 rather than exactly 1. sigmoid_test.py hard-codes
+  # the same constant, so the two must be changed together.
+  return tf.nn.relu6(features + tf.constant(3.)) * 0.16667
--- a/official/modeling/activations/sigmoid_test.py
+++ b/official/modeling/activations/sigmoid_test.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the customized Sigmoid activation."""
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.keras import \
+  keras_parameterized  # pylint: disable=g-direct-tensorflow-import
+from official.modeling import activations
+
+
+@keras_parameterized.run_all_keras_modes
+class CustomizedSigmoidTest(keras_parameterized.TestCase):
+  """Checks `activations.hard_sigmoid` against a local reference formula."""
+
+  def _hard_sigmoid_nn(self, x):
+    """Reference value: `relu6(x + 3) * 0.16667` on float32-cast input."""
+    x = np.float32(x)
+    # Uses the same rounded 1/6 constant as the implementation under test.
+    return tf.nn.relu6(x + 3.) * 0.16667
+
+  def test_hard_sigmoid(self):
+    """The activation must agree with the reference on a mixed-sign input."""
+    features = [[.25, 0, -.25], [-1, -2, 3]]
+    customized_hard_sigmoid_data = activations.hard_sigmoid(features)
+    sigmoid_data = self._hard_sigmoid_nn(features)
+    self.assertAllClose(customized_hard_sigmoid_data, sigmoid_data)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/modeling/tf_utils.py
+++ b/official/modeling/tf_utils.py
@@ -104,6 +104,8 @@ def get_activation(identifier):
        "gelu": activations.gelu,
        "simple_swish": activations.simple_swish,
        "hard_swish": activations.hard_swish,
+        "relu6": activations.relu6,
+        "hard_sigmoid": activations.hard_sigmoid,
        "identity": activations.identity,
    }
    identifier = str(identifier).lower()

--- a/official/vision/beta/configs/backbones.py
+++ b/official/vision/beta/configs/backbones.py
@@ -36,6 +36,14 @@ class EfficientNet(hyperparams.Config):
  se_ratio: float = 0.0


+@dataclasses.dataclass
+class MobileNet(hyperparams.Config):
+  """MobileNet backbone config, used when `Backbone.type` is 'mobilenet'."""
+  model_id: str = 'MobileNetV2'
+  filter_size_scale: float = 1.0
+  stochastic_depth_drop_rate: float = 0.0
+
+
 @dataclasses.dataclass
 class SpineNet(hyperparams.Config):
  """SpineNet config."""
@@ -60,9 +68,11 @@ class Backbone(hyperparams.OneOfConfig):
    revnet: revnet backbone config.
    efficientnet: efficientnet backbone config.
    spinenet: spinenet backbone config.
+    mobilenet: mobilenet backbone config.
  """
  type: Optional[str] = None
  resnet: ResNet = ResNet()
  revnet: RevNet = RevNet()
  efficientnet: EfficientNet = EfficientNet()
  spinenet: SpineNet = SpineNet()
+  mobilenet: MobileNet = MobileNet()
--- a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_gpu.yaml
+++ b/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_gpu.yaml
+# MobileNetV2_1.0 ImageNet classification. 71.0% top-1 and 90.0% top-5 accuracy.
+runtime:
+  distribution_strategy: 'mirrored'
+  mixed_precision_dtype: 'float16'
+  loss_scale: 'dynamic'
+task:
+  model:
+    num_classes: 1001
+    input_size: [224, 224, 3]
+    backbone:
+      type: 'mobilenet'
+      mobilenet:
+        model_id: 'MobileNetV2'
+        filter_size_scale: 1.0
+    dropout_rate: 0.2
+  losses:
+    l2_weight_decay: 0.00001
+    one_hot: true
+    label_smoothing: 0.1
+  train_data:
+    input_path: 'imagenet-2012-tfrecord/train*'
+    is_training: true
+    global_batch_size: 1024  # 128 * 8
+    dtype: 'float16'
+  validation_data:
+    input_path: 'imagenet-2012-tfrecord/valid*'
+    # Validation input is not a training input; matches the TPU config.
+    is_training: false
+    global_batch_size: 1024  # 128 * 8
+    dtype: 'float16'
+    drop_remainder: false
+trainer:
+  train_steps: 625500  # 500 epochs
+  validation_steps: 49
+  validation_interval: 1251
+  steps_per_loop: 1251  # NUM_EXAMPLES (1281167) // global_batch_size
+  summary_interval: 1251
+  checkpoint_interval: 1251
+  optimizer_config:
+    learning_rate:
+      type: 'exponential'
+      exponential:
+        initial_learning_rate: 0.064  # 0.008 * batch_size / 128
+        decay_steps: 3127  # 2.5 * steps_per_epoch
+        decay_rate: 0.96
+        staircase: true
+    warmup:
+      type: 'linear'
+      linear:
+        warmup_steps: 6255
--- a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_tpu.yaml
+++ b/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_tpu.yaml
+# MobileNetV2_1.0 ImageNet classification. 72.72% top-1 and 91.05% top-5 accuracy.
+runtime:
+  distribution_strategy: 'tpu'
+  mixed_precision_dtype: 'bfloat16'
+task:
+  model:
+    num_classes: 1001
+    input_size: [224, 224, 3]
+    backbone:
+      type: 'mobilenet'
+      mobilenet:
+        model_id: 'MobileNetV2'
+        filter_size_scale: 1.0
+    dropout_rate: 0.2
+  losses:
+    l2_weight_decay: 0.00001
+    one_hot: true
+    label_smoothing: 0.1
+  train_data:
+    input_path: 'imagenet-2012-tfrecord/train*'
+    is_training: true
+    global_batch_size: 4096
+    dtype: 'bfloat16'
+  validation_data:
+    input_path: 'imagenet-2012-tfrecord/valid*'
+    is_training: false
+    global_batch_size: 4096
+    dtype: 'bfloat16'
+    drop_remainder: false
+trainer:
+  train_steps: 156000  # 500 epochs
+  validation_steps: 13
+  validation_interval: 312
+  steps_per_loop: 312  # NUM_EXAMPLES (1281167) // global_batch_size
+  summary_interval: 312
+  checkpoint_interval: 312
+  optimizer_config:
+    learning_rate:
+      type: 'exponential'
+      exponential:
+        initial_learning_rate: 0.256  # 0.008 * batch_size / 128
+        decay_steps: 780  # 2.5 * steps_per_epoch
+        decay_rate: 0.96
+        staircase: true
+    warmup:
+      type: 'linear'
+      linear:
+        warmup_steps: 1560
--- a/official/vision/beta/configs/image_classification.py
+++ b/official/vision/beta/configs/image_classification.py
@@ -128,10 +128,10 @@ def image_classification_imagenet() -> cfg.ExperimentConfig:
                          80 * steps_per_epoch
                      ],
                      'values': [
-                          0.1 *  train_batch_size / 256,
-                          0.01 *  train_batch_size / 256,
-                          0.001 *  train_batch_size / 256,
-                          0.0001 *  train_batch_size / 256,
+                          0.1 * train_batch_size / 256,
+                          0.01 * train_batch_size / 256,
+                          0.001 * train_batch_size / 256,
+                          0.0001 * train_batch_size / 256,
                      ]
                  }
              },
@@ -215,3 +215,75 @@ def image_classification_imagenet_revnet() -> cfg.ExperimentConfig:
      ])

  return config
+
+
+@exp_factory.register_config_factory('mobilenet_imagenet')
+def image_classification_imagenet_mobilenet() -> cfg.ExperimentConfig:
+  """Image classification on imagenet with mobilenet.
+
+  Registered with the experiment factory under the name 'mobilenet_imagenet'.
+
+  Returns:
+    A `cfg.ExperimentConfig` using a MobileNetV2 backbone with
+    filter_size_scale 1.0, an RMSprop optimizer, a staircase exponential
+    learning-rate decay and a linear warmup.
+  """
+  train_batch_size = 4096
+  eval_batch_size = 4096
+  # Floor division: a trailing partial batch is not counted as a step.
+  steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size
+  config = cfg.ExperimentConfig(
+      task=ImageClassificationTask(
+          model=ImageClassificationModel(
+              num_classes=1001,
+              dropout_rate=0.2,
+              input_size=[224, 224, 3],
+              backbone=backbones.Backbone(
+                  type='mobilenet',
+                  mobilenet=backbones.MobileNet(
+                      model_id='MobileNetV2', filter_size_scale=1.0)),
+              norm_activation=common.NormActivation(
+                  norm_momentum=0.997, norm_epsilon=1e-3)),
+          losses=Losses(l2_weight_decay=1e-5, label_smoothing=0.1),
+          train_data=DataConfig(
+              input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'),
+              is_training=True,
+              global_batch_size=train_batch_size),
+          validation_data=DataConfig(
+              input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'),
+              is_training=False,
+              global_batch_size=eval_batch_size)),
+      trainer=cfg.TrainerConfig(
+          # Loop, summary, checkpoint and validation all run once per epoch.
+          steps_per_loop=steps_per_epoch,
+          summary_interval=steps_per_epoch,
+          checkpoint_interval=steps_per_epoch,
+          # 500 epochs in total.
+          train_steps=500 * steps_per_epoch,
+          validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size,
+          validation_interval=steps_per_epoch,
+          optimizer_config=optimization.OptimizationConfig({
+              'optimizer': {
+                  'type': 'rmsprop',
+                  'rmsprop': {
+                      'rho': 0.9,
+                      'momentum': 0.9,
+                      'epsilon': 0.002,
+                  }
+              },
+              'learning_rate': {
+                  'type': 'exponential',
+                  'exponential': {
+                      # Scales 0.008 by the whole number of 128-example
+                      # groups in the batch (integer division).
+                      'initial_learning_rate':
+                          0.008 * (train_batch_size // 128),
+                      # One decay step every 2.5 epochs, truncated to int.
+                      'decay_steps':
+                          int(2.5 * steps_per_epoch),
+                      'decay_rate':
+                          0.98,
+                      'staircase':
+                          True
+                  }
+              },
+              'warmup': {
+                  'type': 'linear',
+                  'linear': {
+                      # Warm up over the first 5 epochs, starting from 0.
+                      'warmup_steps': 5 * steps_per_epoch,
+                      'warmup_learning_rate': 0
+                  }
+              },
+          })),
+      # NOTE(review): presumably these expressions are checked by the config
+      # framework so that `is_training` stays set on both datasets; confirm.
+      restrictions=[
+          'task.train_data.is_training != None',
+          'task.validation_data.is_training != None'
+      ])
+
+  return config
--- a/official/vision/beta/configs/image_classification_test.py
+++ b/official/vision/beta/configs/image_classification_test.py
@@ -28,7 +28,8 @@ from official.vision.beta.configs import image_classification as exp_cfg
 class ImageClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('resnet_imagenet',),
-                            ('revnet_imagenet',))
+                            ('revnet_imagenet',),
+                            ('mobilenet_imagenet',))
  def test_image_classification_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)

--- a/official/vision/beta/modeling/backbones/__init__.py
+++ b/official/vision/beta/modeling/backbones/__init__.py
@@ -16,6 +16,7 @@
 """Backbones package definition."""

 from official.vision.beta.modeling.backbones.efficientnet import EfficientNet
+from official.vision.beta.modeling.backbones.mobilenet import MobileNet
 from official.vision.beta.modeling.backbones.resnet import ResNet
 from official.vision.beta.modeling.backbones.resnet_3d import ResNet3D
 from official.vision.beta.modeling.backbones.revnet import RevNet

--- a/official/vision/beta/modeling/backbones/efficientnet.py
+++ b/official/vision/beta/modeling/backbones/efficientnet.py
@@ -16,11 +16,11 @@

 import math
 # Import libraries
-from absl import logging
 import tensorflow as tf
 from official.modeling import tf_utils
 from official.vision.beta.modeling.backbones import factory
 from official.vision.beta.modeling.layers import nn_blocks
+from official.vision.beta.modeling.layers import nn_layers

 layers = tf.keras.layers

@@ -50,22 +50,6 @@ SCALING_MAP = {
 }


-def round_filters(filters, multiplier, divisor=8, min_depth=None, skip=False):
-  """Round number of filters based on depth multiplier."""
-  orig_f = filters
-  if skip or not multiplier:
-    return filters
-
-  filters *= multiplier
-  min_depth = min_depth or divisor
-  new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
-  # Make sure that round down does not go down by more than 10%.
-  if new_filters < 0.9 * filters:
-    new_filters += divisor
-  logging.info('round_filter input=%s output=%s', orig_f, new_filters)
-  return int(new_filters)
-
-
 def round_repeats(repeats, multiplier, skip=False):
  """Round number of filters based on depth multiplier."""
  if skip or not multiplier:
@@ -96,8 +80,8 @@ class BlockSpec(object):
    self.kernel_size = kernel_size
    self.strides = strides
    self.expand_ratio = expand_ratio
-    self.in_filters = round_filters(in_filters, width_scale)
-    self.out_filters = round_filters(out_filters, width_scale)
+    self.in_filters = nn_layers.round_filters(in_filters, width_scale)
+    self.out_filters = nn_layers.round_filters(out_filters, width_scale)
    self.is_output = is_output


@@ -166,7 +150,7 @@ class EfficientNet(tf.keras.Model):

    # Build stem.
    x = layers.Conv2D(
-        filters=round_filters(32, width_scale),
+        filters=nn_layers.round_filters(32, width_scale),
        kernel_size=3,
        strides=2,
        use_bias=False,
@@ -198,7 +182,7 @@ class EfficientNet(tf.keras.Model):

    # Build the final conv for classification.
    x = layers.Conv2D(
-        filters=round_filters(1280, width_scale),
+        filters=nn_layers.round_filters(1280, width_scale),
        kernel_size=1,
        strides=1,
        use_bias=False,

--- a/official/vision/beta/modeling/backbones/factory_test.py
+++ b/official/vision/beta/modeling/backbones/factory_test.py
@@ -86,6 +86,40 @@ class FactoryTest(tf.test.TestCase, parameterized.TestCase):

    self.assertEqual(network_config, factory_network_config)

+  @combinations.generate(
+      combinations.combine(
+          model_id=['MobileNetV1', 'MobileNetV2',
+                    'MobileNetV3Large', 'MobileNetV3Small',
+                    'MobileNetV3EdgeTPU'],
+          filter_size_scale=[1.0, 0.75],
+      ))
+  def test_mobilenet_creation(self, model_id, filter_size_scale):
+    """Test creation of Mobilenet models."""
+
+    # Reference network, built directly from the backbone class.
+    network = backbones.MobileNet(
+        model_id=model_id,
+        filter_size_scale=filter_size_scale,
+        norm_momentum=0.99,
+        norm_epsilon=1e-5)
+
+    # The same settings expressed as configs, to be built by the factory.
+    backbone_config = backbones_cfg.Backbone(
+        type='mobilenet',
+        mobilenet=backbones_cfg.MobileNet(
+            model_id=model_id, filter_size_scale=filter_size_scale))
+    norm_activation_config = common_cfg.NormActivation(
+        norm_momentum=0.99, norm_epsilon=1e-5)
+    model_config = retinanet_cfg.RetinaNet(
+        backbone=backbone_config, norm_activation=norm_activation_config)
+
+    factory_network = factory.build_backbone(
+        input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
+        model_config=model_config)
+
+    # Both construction paths must report the same `get_config()` result.
+    network_config = network.get_config()
+    factory_network_config = factory_network.get_config()
+
+    self.assertEqual(network_config, factory_network_config)
+
  @combinations.generate(combinations.combine(model_id=['49'],))
  def test_spinenet_creation(self, model_id):
    """Test creation of SpineNet models."""

--- a/official/vision/beta/modeling/backbones/mobilenet.py
+++ b/official/vision/beta/modeling/backbones/mobilenet.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains definitions of Mobilenet Networks."""
+
+from typing import Text, Optional, Dict, Any, Tuple
+
+# Import libraries
+import dataclasses
+import tensorflow as tf
+from official.modeling import hyperparams
+from official.modeling import tf_utils
+from official.vision.beta.modeling.backbones import factory
+from official.vision.beta.modeling.layers import nn_blocks
+from official.vision.beta.modeling.layers import nn_layers
+
+layers = tf.keras.layers
+regularizers = tf.keras.regularizers
+
+
+#  pylint: disable=pointless-string-statement
+
+
+class Conv2DBNBlock(tf.keras.layers.Layer):
+  """A convolution block with batch normalization.
+
+  Applies one `Conv2D`, then an optional batch normalization, then the
+  configured activation.
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      kernel_size: int = 3,
+      strides: int = 1,
+      use_bias: bool = False,
+      activation: Text = 'relu6',
+      kernel_initializer: Text = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      use_normalization: bool = True,
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      **kwargs):
+    """A convolution block with batch normalization.
+
+    Args:
+      filters: `int` number of filters of the block's single convolution.
+      kernel_size: `int` an integer specifying the height and width of the
+        2D convolution window.
+      strides: `int` block stride. If greater than 1, this block will ultimately
+        downsample the input.
+      use_bias: if True, use bias in the convolution layer.
+      activation: `str` name of the activation function.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+                          Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+                        Default to None.
+      use_normalization: if True, use batch normalization.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(Conv2DBNBlock, self).__init__(**kwargs)
+    self._filters = filters
+    self._kernel_size = kernel_size
+    self._strides = strides
+    self._activation = activation
+    self._use_bias = use_bias
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._use_normalization = use_normalization
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    # Only the normalization class is chosen here; it is instantiated in
+    # `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the active image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def get_config(self):
+    """Returns the constructor arguments merged into the base layer config."""
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'kernel_size': self._kernel_size,
+        'use_bias': self._use_bias,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'use_normalization': self._use_normalization,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(Conv2DBNBlock, self).get_config()
+    # Entries from `config` override same-named entries of the base config.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def build(self, input_shape):
+    """Creates the convolution and, if enabled, the normalization layer."""
+    self._conv0 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        padding='same',
+        use_bias=self._use_bias,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    if self._use_normalization:
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    super(Conv2DBNBlock, self).build(input_shape)
+
+  def call(self, inputs, training=None):
+    """Runs convolution, optional normalization, then the activation.
+
+    Args:
+      inputs: input tensor fed to the convolution.
+      training: accepted for the Keras call signature; this method does not
+        pass it to the sub-layers explicitly.
+
+    Returns:
+      The activated output tensor.
+    """
+    x = self._conv0(inputs)
+    if self._use_normalization:
+      x = self._norm0(x)
+    return self._activation_fn(x)
+
+"""
+Architecture: https://arxiv.org/abs/1704.04861.
+
+"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision
+Applications" Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko,
+Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam
+"""
+MNV1_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV1',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters'],
+    'block_specs': [
+        ('convbn', 3, 2, 32),
+        ('depsepconv', 3, 1, 64),
+        ('depsepconv', 3, 2, 128),
+        ('depsepconv', 3, 1, 128),
+        ('depsepconv', 3, 2, 256),
+        ('depsepconv', 3, 1, 256),
+        ('depsepconv', 3, 2, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 1, 512),
+        ('depsepconv', 3, 2, 1024),
+        ('depsepconv', 3, 1, 1024),
+    ]
+}
+
+"""
+Architecture: https://arxiv.org/abs/1801.04381
+
+"MobileNetV2: Inverted Residuals and Linear Bottlenecks"
+Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
+"""
+MNV2_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV2',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'expand_ratio'],
+    'block_specs': [
+        ('convbn', 3, 2, 32, None),
+        ('invertedbottleneck', 3, 1, 16, 1.),
+        ('invertedbottleneck', 3, 2, 24, 6.),
+        ('invertedbottleneck', 3, 1, 24, 6.),
+        ('invertedbottleneck', 3, 2, 32, 6.),
+        ('invertedbottleneck', 3, 1, 32, 6.),
+        ('invertedbottleneck', 3, 1, 32, 6.),
+        ('invertedbottleneck', 3, 2, 64, 6.),
+        ('invertedbottleneck', 3, 1, 64, 6.),
+        ('invertedbottleneck', 3, 1, 64, 6.),
+        ('invertedbottleneck', 3, 1, 64, 6.),
+        ('invertedbottleneck', 3, 1, 96, 6.),
+        ('invertedbottleneck', 3, 1, 96, 6.),
+        ('invertedbottleneck', 3, 1, 96, 6.),
+        ('invertedbottleneck', 3, 2, 160, 6.),
+        ('invertedbottleneck', 3, 1, 160, 6.),
+        ('invertedbottleneck', 3, 1, 160, 6.),
+        ('invertedbottleneck', 3, 1, 320, 6.),
+        ('convbn', 1, 1, 1280, None),
+    ]
+}
+
+"""
+Architecture: https://arxiv.org/abs/1905.02244
+
+"Searching for MobileNetV3"
+Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan,
+Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. Le, Hartwig Adam
+"""
+MNV3Large_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV3Large',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'activation', 'se_ratio', 'expand_ratio',
+                          'use_normalization', 'use_bias'],
+    'block_specs': [
+        ('convbn', 3, 2, 16, 'hard_swish', None, None, True, False),
+        ('invertedbottleneck', 3, 1, 16, 'relu', None, 1., None, False),
+        ('invertedbottleneck', 3, 2, 24, 'relu', None, 4., None, False),
+        ('invertedbottleneck', 3, 1, 24, 'relu', None, 3., None, False),
+        ('invertedbottleneck', 5, 2, 40, 'relu', 0.25, 3., None, False),
+        ('invertedbottleneck', 5, 1, 40, 'relu', 0.25, 3., None, False),
+        ('invertedbottleneck', 5, 1, 40, 'relu', 0.25, 3., None, False),
+        ('invertedbottleneck', 3, 2, 80, 'hard_swish', None, 6., None, False),
+        ('invertedbottleneck', 3, 1, 80, 'hard_swish', None, 2.5, None, False),
+        ('invertedbottleneck', 3, 1, 80, 'hard_swish', None, 2.3, None, False),
+        ('invertedbottleneck', 3, 1, 80, 'hard_swish', None, 2.3, None, False),
+        ('invertedbottleneck', 3, 1, 112, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 3, 1, 112, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 5, 2, 160, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 5, 1, 160, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 5, 1, 160, 'hard_swish', 0.25, 6., None, False),
+        ('convbn', 1, 1, 960, 'hard_swish', None, None, True, False),
+        ('gpooling', None, None, None, None, None, None, None, None),
+        ('convbn', 1, 1, 1280, 'hard_swish', None, None, False, True),
+    ]
+}
+
+MNV3Small_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV3Small',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'activation', 'se_ratio', 'expand_ratio',
+                          'use_normalization', 'use_bias'],
+    'block_specs': [
+        ('convbn', 3, 2, 16, 'hard_swish', None, None, True, False),
+        ('invertedbottleneck', 3, 2, 16, 'relu', 0.25, 1, None, False),
+        ('invertedbottleneck', 3, 2, 24, 'relu', None, 72. / 16, None, False),
+        ('invertedbottleneck', 3, 1, 24, 'relu', None, 88. / 24, None, False),
+        ('invertedbottleneck', 5, 2, 40, 'hard_swish', 0.25, 4., None, False),
+        ('invertedbottleneck', 5, 1, 40, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 5, 1, 40, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 5, 1, 48, 'hard_swish', 0.25, 3., None, False),
+        ('invertedbottleneck', 5, 1, 48, 'hard_swish', 0.25, 3., None, False),
+        ('invertedbottleneck', 5, 2, 96, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 5, 1, 96, 'hard_swish', 0.25, 6., None, False),
+        ('invertedbottleneck', 5, 1, 96, 'hard_swish', 0.25, 6., None, False),
+        ('convbn', 1, 1, 576, 'hard_swish', None, None, True, False),
+        ('gpooling', None, None, None, None, None, None, None, None),
+        ('convbn', 1, 1, 1024, 'hard_swish', None, None, False, True),
+    ]
+}
+
+"""
+The EdgeTPU version is taken from
+github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v3.py
+"""
+# Block table for the 'MobileNetV3EdgeTPU' variant. Each tuple in
+# 'block_specs' supplies one value per field named in 'block_spec_schema', in
+# the same order; `block_spec_decoder` zips the two together to build a
+# `BlockSpec`. The two 'convbn' rows leave 'se_ratio', 'expand_ratio',
+# 'use_residual' and 'use_depthwise' as None; the 'convbn' branch of
+# `MobileNet._mobilenet_base` does not read those fields.
+MNV3EdgeTPU_BLOCK_SPECS = {
+    'spec_name': 'MobileNetV3EdgeTPU',
+    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
+                          'activation', 'se_ratio', 'expand_ratio',
+                          'use_residual', 'use_depthwise'],
+    'block_specs': [
+        ('convbn', 3, 2, 32, 'relu', None, None, None, None),
+        ('invertedbottleneck', 3, 1, 16, 'relu', None, 1., True, False),
+        ('invertedbottleneck', 3, 2, 32, 'relu', None, 8., True, False),
+        ('invertedbottleneck', 3, 1, 32, 'relu', None, 4., True, False),
+        ('invertedbottleneck', 3, 1, 32, 'relu', None, 4., True, False),
+        ('invertedbottleneck', 3, 1, 32, 'relu', None, 4., True, False),
+        ('invertedbottleneck', 3, 2, 48, 'relu', None, 8., True, False),
+        ('invertedbottleneck', 3, 1, 48, 'relu', None, 4., True, False),
+        ('invertedbottleneck', 3, 1, 48, 'relu', None, 4., True, False),
+        ('invertedbottleneck', 3, 1, 48, 'relu', None, 4., True, False),
+        ('invertedbottleneck', 3, 2, 96, 'relu', None, 8., True, True),
+        ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 3, 1, 96, 'relu', None, 8., False, True),
+        ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 5, 2, 160, 'relu', None, 8., True, True),
+        ('invertedbottleneck', 5, 1, 160, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 5, 1, 160, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 5, 1, 160, 'relu', None, 4., True, True),
+        ('invertedbottleneck', 3, 1, 192, 'relu', None, 8., True, True),
+        ('convbn', 1, 1, 1280, 'relu', None, None, None, None),
+    ]
+}
+
+# Maps every accepted `model_id` string to its block-spec table.
+# `MobileNet.__init__` raises ValueError for ids that are not keys of this map.
+SUPPORTED_SPECS_MAP = {
+    'MobileNetV1': MNV1_BLOCK_SPECS,
+    'MobileNetV2': MNV2_BLOCK_SPECS,
+    'MobileNetV3Large': MNV3Large_BLOCK_SPECS,
+    'MobileNetV3Small': MNV3Small_BLOCK_SPECS,
+    'MobileNetV3EdgeTPU': MNV3EdgeTPU_BLOCK_SPECS,
+}
+
+
+@dataclasses.dataclass
+class BlockSpec(hyperparams.Config):
+  """A container class that specifies the block configuration for MobileNet.
+
+  One instance describes one row of a `block_specs` table. Fields that the
+  table's `block_spec_schema` does not name keep the defaults declared here.
+  """
+
+  # Block type. `MobileNet._mobilenet_base` handles 'convbn', 'depsepconv',
+  # 'invertedbottleneck' and 'gpooling'; anything else raises ValueError.
+  block_fn: Text = 'convbn'
+  kernel_size: int = 3
+  strides: int = 1
+  # Output channels before scaling; `block_spec_decoder` rescales this value
+  # and skips rows where it is None (e.g. 'gpooling').
+  filters: int = 32
+  # `use_bias` and `use_normalization` are only read for 'convbn' blocks.
+  use_bias: bool = False
+  use_normalization: bool = True
+  activation: Text = 'relu6'
+  # The four fields below are only read for 'invertedbottleneck' blocks.
+  # Expansion ratio forwarded to the inverted bottleneck block.
+  expand_ratio: Optional[float] = 6.
+  # Squeeze-and-excitation ratio; None is forwarded unchanged.
+  se_ratio: Optional[float] = None
+  use_depthwise: bool = True
+  use_residual: bool = True
+
+
+def block_spec_decoder(specs: Dict[Any, Any],
+                       filter_size_scale: float,
+                       # set to 1 for mobilenetv1
+                       divisible_by: int = 8,
+                       finegrain_classification_mode: bool = True):
+  """Decodes a block-spec table into a list of `BlockSpec` objects.
+
+  Args:
+    specs: `dict` specification of block specs of a mobilenet version. Must
+      provide the keys 'spec_name', 'block_spec_schema' and 'block_specs'.
+    filter_size_scale: `float` multiplier for the filter size
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    divisible_by: `int` ensures all inner dimensions are divisible by
+      this number.
+    finegrain_classification_mode: if True, the model
+      will keep the last layer large even for small multipliers. Following
+      https://arxiv.org/abs/1801.04381
+
+  Returns:
+    `List[BlockSpec]` defines structure of the base network.
+
+  Raises:
+    ValueError: if the table has no rows, or if any row does not have exactly
+      one value per schema field.
+  """
+
+  spec_name = specs['spec_name']
+  block_spec_schema = specs['block_spec_schema']
+  block_specs = specs['block_specs']
+
+  if not block_specs:
+    raise ValueError(
+        'The block spec cannot be empty for {} !'.format(spec_name))
+
+  # Validate every row, not only the first one: zip() below would otherwise
+  # silently drop the surplus values (or leave fields at their defaults) for
+  # a malformed row further down the table.
+  for s in block_specs:
+    if len(s) != len(block_spec_schema):
+      raise ValueError('The block spec values {} do not match with '
+                       'the schema {}'.format(s, block_spec_schema))
+
+  decoded_specs = [
+      BlockSpec(**dict(zip(block_spec_schema, s))) for s in block_specs
+  ]
+
+  # This adjustment applies to V2 and V3.
+  # Pre-divide the last block's filters by the multiplier so that the scaling
+  # applied in the loop below leaves it (presumably) at its unscaled width.
+  if (spec_name != 'MobileNetV1'
+      and finegrain_classification_mode
+      and filter_size_scale < 1.0):
+    decoded_specs[-1].filters /= filter_size_scale
+
+  for ds in decoded_specs:
+    # Rows without a filter count (e.g. 'gpooling') are left untouched.
+    if ds.filters:
+      ds.filters = nn_layers.round_filters(filters=ds.filters,
+                                           multiplier=filter_size_scale,
+                                           divisor=divisible_by,
+                                           min_depth=8)
+
+  return decoded_specs
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MobileNet(tf.keras.Model):
+  """Class to build MobileNet family model.
+
+  The network is assembled as a Keras functional model. Its output is a dict
+  of endpoint tensors keyed by integer level: one entry per block, starting
+  at 1, plus one extra entry holding the final network output.
+  """
+
+  def __init__(self,
+               model_id: Text = 'MobileNetV2',
+               filter_size_scale: float = 1.0,
+               input_specs: layers.InputSpec = layers.InputSpec(
+                   shape=[None, None, None, 3]),
+               # The followings are for hyper-parameter tuning
+               norm_momentum: float = 0.99,
+               norm_epsilon: float = 0.001,
+               kernel_initializer: Text = 'VarianceScaling',
+               kernel_regularizer: Optional[regularizers.Regularizer] = None,
+               bias_regularizer: Optional[regularizers.Regularizer] = None,
+               # The followings should be kept the same most of the times
+               output_stride: Optional[int] = None,
+               min_depth: int = 8,
+               # divisible is not used in MobileNetV1
+               divisible_by: int = 8,
+               stochastic_depth_drop_rate: float = 0.0,
+               regularize_depthwise: bool = False,
+               use_sync_bn: bool = False,
+               # finegrain is not used in MobileNetV1
+               finegrain_classification_mode: bool = True,
+               **kwargs):
+    """MobileNet initializer.
+
+    Args:
+      model_id: `str` version of MobileNet. The supported values are
+       'MobileNetV1', 'MobileNetV2', 'MobileNetV3Large', 'MobileNetV3Small',
+        and 'MobileNetV3EdgeTPU'.
+      filter_size_scale: `float` multiplier for the filters (number of channels)
+        for all convolution ops. The value must be greater than zero. Typical
+        usage will be to set this value in (0, 1) to reduce the number of
+        parameters or computation cost of the model.
+      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      kernel_initializer: `str` kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      output_stride: `int` specifies the requested ratio of input to output
+        spatial resolution. If not None, then we invoke atrous convolution
+        if necessary to prevent the network from reducing the spatial resolution
+        of activation maps. Allowed values are 8 (accurate fully convolutional
+        mode), 16 (fast fully convolutional mode), 32 (classification mode).
+      min_depth: `int` minimum depth (number of channels) for all conv ops.
+        Enforced when filter_size_scale < 1, and not an active constraint when
+        filter_size_scale >= 1. NOTE(review): this value is stored and
+        returned by `get_config`, but it is not forwarded to
+        `block_spec_decoder` here - confirm whether that is intended.
+      divisible_by: `int` ensures all inner dimensions are divisible by
+        this number.
+      stochastic_depth_drop_rate: `float` drop rate for drop connect layer.
+      regularize_depthwise: if True, apply regularization on depthwise.
+      use_sync_bn: if True, use synchronized batch normalization.
+      finegrain_classification_mode: if True, the model
+        will keep the last layer large even for small multipliers. Following
+        https://arxiv.org/abs/1801.04381
+      **kwargs: keyword arguments to be passed.
+
+    Raises:
+      ValueError: if `model_id` is not supported, `filter_size_scale` is not
+        positive, or `output_stride` has a disallowed value.
+    """
+    if model_id not in SUPPORTED_SPECS_MAP:
+      raise ValueError('The MobileNet version {} '
+                       'is not supported'.format(model_id))
+
+    if filter_size_scale <= 0:
+      raise ValueError('filter_size_scale is not greater than zero.')
+
+    if output_stride is not None:
+      if model_id == 'MobileNetV1':
+        if output_stride not in [8, 16, 32]:
+          raise ValueError('Only allowed output_stride values are 8, 16, 32.')
+      else:
+        if output_stride == 0 or (output_stride > 1 and output_stride % 2):
+          raise ValueError('Output stride must be None, 1 or a multiple of 2.')
+
+    self._model_id = model_id
+    self._input_specs = input_specs
+    self._filter_size_scale = filter_size_scale
+    self._min_depth = min_depth
+    self._output_stride = output_stride
+    self._divisible_by = divisible_by
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._regularize_depthwise = regularize_depthwise
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._finegrain_classification_mode = finegrain_classification_mode
+
+    # Drop the batch dimension: tf.keras.Input adds it back.
+    inputs = tf.keras.Input(shape=input_specs.shape[1:])
+
+    # `model_id` was validated above, so the lookup cannot miss.
+    block_specs = SUPPORTED_SPECS_MAP[model_id]
+    self._decoded_specs = block_spec_decoder(
+        specs=block_specs,
+        filter_size_scale=self._filter_size_scale,
+        divisible_by=self._get_divisible_by(),
+        finegrain_classification_mode=self._finegrain_classification_mode)
+
+    x, endpoints = self._mobilenet_base(inputs=inputs)
+
+    # Expose the final output as one more endpoint after the per-block ones.
+    endpoints[max(endpoints.keys()) + 1] = x
+    self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
+
+    super(MobileNet, self).__init__(
+        inputs=inputs, outputs=endpoints, **kwargs)
+
+  def _get_divisible_by(self):
+    """Returns the channel divisor: 1 for MobileNetV1, else `divisible_by`."""
+    if self._model_id == 'MobileNetV1':
+      return 1
+    else:
+      return self._divisible_by
+
+  def _mobilenet_base(self,
+                      inputs: tf.Tensor
+                      ) -> Tuple[tf.Tensor, Dict[int, tf.Tensor]]:
+    """Build the base MobileNet architecture.
+
+    Args:
+      inputs: Input tensor of shape [batch_size, height, width, channels].
+
+    Returns:
+      A tuple of output Tensor and dictionary that collects endpoints.
+
+    Raises:
+      ValueError: if `inputs` is not rank 4 or a decoded spec names an unknown
+        block type.
+    """
+
+    input_shape = inputs.get_shape().as_list()
+    if len(input_shape) != 4:
+      raise ValueError('Expected rank 4 input, was: %d' % len(input_shape))
+
+    # The current_stride variable keeps track of the output stride of the
+    # activations, i.e., the running product of convolution strides up to the
+    # current network layer. This allows us to invoke atrous convolution
+    # whenever applying the next convolution would result in the activations
+    # having output stride larger than the target output_stride.
+    current_stride = 1
+
+    # The atrous convolution rate parameter.
+    rate = 1
+
+    net = inputs
+    endpoints = {}
+    endpoint_level = 1
+    for i, block_def in enumerate(self._decoded_specs):
+      block_name = 'block_group_{}_{}'.format(block_def.block_fn, i)
+      # A small catch for gpooling block with None strides
+      if not block_def.strides:
+        block_def.strides = 1
+      if (self._output_stride is not None
+          and current_stride == self._output_stride):
+        # If we have reached the target output_stride, then we need to employ
+        # atrous convolution with stride=1 and multiply the atrous rate by the
+        # current unit's stride for use in subsequent layers.
+        layer_stride = 1
+        layer_rate = rate
+        rate *= block_def.strides
+      else:
+        layer_stride = block_def.strides
+        layer_rate = 1
+        current_stride *= block_def.strides
+
+      if block_def.block_fn == 'convbn':
+
+        # NOTE(review): this branch applies the spec's own stride rather than
+        # `layer_stride`; the two only differ once `output_stride` has been
+        # reached - confirm this is intended.
+        net = Conv2DBNBlock(
+            filters=block_def.filters,
+            kernel_size=block_def.kernel_size,
+            strides=block_def.strides,
+            activation=block_def.activation,
+            use_bias=block_def.use_bias,
+            use_normalization=block_def.use_normalization,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer,
+            use_sync_bn=self._use_sync_bn,
+            norm_momentum=self._norm_momentum,
+            norm_epsilon=self._norm_epsilon
+        )(net)
+
+      elif block_def.block_fn == 'depsepconv':
+        # Use `layer_stride` (1 once output_stride is reached) so the block is
+        # never asked to stride and dilate at the same time.
+        net = nn_blocks.DepthwiseSeparableConvBlock(
+            filters=block_def.filters,
+            kernel_size=block_def.kernel_size,
+            strides=layer_stride,
+            activation=block_def.activation,
+            dilation_rate=layer_rate,
+            regularize_depthwise=self._regularize_depthwise,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            use_sync_bn=self._use_sync_bn,
+            norm_momentum=self._norm_momentum,
+            norm_epsilon=self._norm_epsilon,
+        )(net)
+
+      elif block_def.block_fn == 'invertedbottleneck':
+        use_rate = rate
+        if layer_rate > 1 and block_def.kernel_size != 1:
+          # We will apply atrous rate in the following cases:
+          # 1) When kernel_size is not in params, the operation then uses
+          #   default kernel size 3x3.
+          # 2) When kernel_size is in params, and if the kernel_size is not
+          #   equal to (1, 1) (there is no need to apply atrous convolution to
+          #   any 1x1 convolution).
+          use_rate = layer_rate
+        in_filters = net.shape.as_list()[-1]
+        net = nn_blocks.InvertedBottleneckBlock(
+            in_filters=in_filters,
+            out_filters=block_def.filters,
+            kernel_size=block_def.kernel_size,
+            strides=layer_stride,
+            expand_ratio=block_def.expand_ratio,
+            se_ratio=block_def.se_ratio,
+            expand_se_in_filters=True,
+            se_gating_activation='hard_sigmoid',
+            activation=block_def.activation,
+            use_depthwise=block_def.use_depthwise,
+            use_residual=block_def.use_residual,
+            dilation_rate=use_rate,
+            regularize_depthwise=self._regularize_depthwise,
+            kernel_initializer=self._kernel_initializer,
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer,
+            use_sync_bn=self._use_sync_bn,
+            norm_momentum=self._norm_momentum,
+            norm_epsilon=self._norm_epsilon,
+            stochastic_depth_drop_rate=self._stochastic_depth_drop_rate,
+            divisible_by=self._get_divisible_by()
+        )(net)
+
+      elif block_def.block_fn == 'gpooling':
+        # Pool to [batch, channels], then restore a 1x1 spatial map so the
+        # following convolution still receives a rank-4 tensor.
+        net = layers.GlobalAveragePooling2D()(net)
+        net = layers.Reshape((1, 1, net.shape[1]))(net)
+
+      else:
+        raise ValueError('Unknown block type {} for layer {}'.format(
+            block_def.block_fn, i))
+
+      endpoints[endpoint_level] = net
+      endpoint_level += 1
+      net = tf.identity(net, name=block_name)
+    return net, endpoints
+
+  def get_config(self):
+    """Returns the constructor arguments (except `input_specs`) as a dict."""
+    config_dict = {
+        'model_id': self._model_id,
+        'filter_size_scale': self._filter_size_scale,
+        'min_depth': self._min_depth,
+        'output_stride': self._output_stride,
+        'divisible_by': self._divisible_by,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'regularize_depthwise': self._regularize_depthwise,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'finegrain_classification_mode': self._finegrain_classification_mode,
+    }
+    return config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Rebuilds the model from a `get_config` dict.
+
+    `custom_objects` is accepted but not used.
+    """
+    return cls(**config)
+
+  @property
+  def output_specs(self):
+    """A dict of {level: TensorShape} pairs for the model output."""
+    return self._output_specs
+
+
+@factory.register_backbone_builder('mobilenet')
+def build_mobilenet(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config,
+    l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
+  """Creates a `MobileNet` backbone from a model config.
+
+  Args:
+    input_specs: `tf.keras.layers.InputSpec` forwarded to the backbone.
+    model_config: config object; its `backbone` and `norm_activation` entries
+      are read here.
+    l2_regularizer: optional regularizer used as the backbone's
+      `kernel_regularizer`.
+
+  Returns:
+    The constructed `MobileNet` model.
+  """
+  backbone_type = model_config.backbone.type
+  mobilenet_cfg = model_config.backbone.get()
+  norm_cfg = model_config.norm_activation
+  assert backbone_type == 'mobilenet', (f'Inconsistent backbone type '
+                                        f'{backbone_type}')
+
+  # Collect the constructor arguments first, then build the backbone.
+  backbone_kwargs = dict(
+      model_id=mobilenet_cfg.model_id,
+      filter_size_scale=mobilenet_cfg.filter_size_scale,
+      input_specs=input_specs,
+      stochastic_depth_drop_rate=mobilenet_cfg.stochastic_depth_drop_rate,
+      use_sync_bn=norm_cfg.use_sync_bn,
+      norm_momentum=norm_cfg.norm_momentum,
+      norm_epsilon=norm_cfg.norm_epsilon,
+      kernel_regularizer=l2_regularizer)
+  return MobileNet(**backbone_kwargs)
--- a/official/vision/beta/modeling/backbones/mobilenet_test.py
+++ b/official/vision/beta/modeling/backbones/mobilenet_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MobileNet."""
+
+import itertools
+# Import libraries
+
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.beta.modeling.backbones import mobilenet
+
+
+class MobileNetTest(parameterized.TestCase, tf.test.TestCase):
+  """Unit tests for the MobileNet backbone."""
+
+  def _check_leading_endpoints(self, model_id, filter_size_scale, input_size,
+                               expected):
+    """Builds a backbone and checks the shapes of its first endpoints.
+
+    Args:
+      model_id: MobileNet variant to build.
+      filter_size_scale: filter multiplier passed to the backbone.
+      input_size: height and width of the square, 3-channel input.
+      expected: sequence of `(log2_stride, channels)` pairs, one per endpoint,
+        starting at endpoint level 1.
+    """
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    network = mobilenet.MobileNet(model_id=model_id,
+                                  filter_size_scale=filter_size_scale)
+
+    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
+    endpoints = network(inputs)
+
+    for level, (log2_stride, channels) in enumerate(expected, start=1):
+      side = input_size / 2 ** log2_stride
+      self.assertAllEqual([1, side, side, channels],
+                          endpoints[level].shape.as_list())
+
+  def _check_param_count(self, model_id, filter_size_scale, params_by_scale):
+    """Checks the parameter count, then runs the backbone on a 224x224 input.
+
+    Args:
+      model_id: MobileNet variant to build.
+      filter_size_scale: filter multiplier passed to the backbone.
+      params_by_scale: dict mapping a filter multiplier to the expected
+        parameter count.
+    """
+    input_size = 224
+    network = mobilenet.MobileNet(model_id=model_id,
+                                  filter_size_scale=filter_size_scale)
+    self.assertEqual(network.count_params(),
+                     params_by_scale[filter_size_scale])
+
+    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
+    _ = network(inputs)
+
+  @parameterized.parameters('MobileNetV1', 'MobileNetV2',
+                            'MobileNetV3Large', 'MobileNetV3Small',
+                            'MobileNetV3EdgeTPU')
+  def test_serialize_deserialize(self, model_id):
+    """Round-trips a fully specified backbone through its config."""
+    # Set every config option explicitly.
+    kwargs = {
+        'model_id': model_id,
+        'filter_size_scale': 1.0,
+        'stochastic_depth_drop_rate': None,
+        'use_sync_bn': False,
+        'kernel_initializer': 'VarianceScaling',
+        'kernel_regularizer': None,
+        'bias_regularizer': None,
+        'norm_momentum': 0.99,
+        'norm_epsilon': 0.001,
+        'output_stride': None,
+        'min_depth': 8,
+        'divisible_by': 8,
+        'regularize_depthwise': False,
+        'finegrain_classification_mode': True,
+    }
+    network = mobilenet.MobileNet(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(network.get_config(), expected_config)
+
+    # Rebuild a second network from the first one's config.
+    new_network = mobilenet.MobileNet.from_config(network.get_config())
+
+    # The rebuilt network must be convertible to JSON.
+    _ = new_network.to_json()
+
+    # A successful round trip leaves the config unchanged.
+    self.assertAllEqual(network.get_config(), new_network.get_config())
+
+  @parameterized.parameters(
+      itertools.product((1, 3),
+                        ('MobileNetV1', 'MobileNetV2', 'MobileNetV3Large',
+                         'MobileNetV3Small', 'MobileNetV3EdgeTPU')))
+  def test_input_specs(self, input_dim, model_id):
+    """Test different input feature dimensions."""
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim])
+    network = mobilenet.MobileNet(model_id=model_id, input_specs=specs)
+
+    inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1)
+    _ = network(inputs)
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v1_creation(self, input_size):
+    """Test creation of the MobileNetV1 model."""
+    self._check_leading_endpoints(
+        'MobileNetV1', 0.75, input_size,
+        [(1, 24), (1, 48), (2, 96), (2, 96)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v2_creation(self, input_size):
+    """Test creation of the MobileNetV2 model."""
+    self._check_leading_endpoints(
+        'MobileNetV2', 1.0, input_size,
+        [(1, 32), (1, 16), (2, 24), (2, 24)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v3_small_creation(self, input_size):
+    """Test creation of the MobileNetV3Small model."""
+    self._check_leading_endpoints(
+        'MobileNetV3Small', 0.75, input_size,
+        [(1, 16), (2, 16), (3, 24), (3, 24)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v3_large_creation(self, input_size):
+    """Test creation of the MobileNetV3Large model."""
+    self._check_leading_endpoints(
+        'MobileNetV3Large', 0.75, input_size,
+        [(1, 16), (1, 16), (2, 24), (2, 24)])
+
+  @parameterized.parameters(32, 224)
+  def test_mobilenet_v3_edgetpu_creation(self, input_size):
+    """Test creation of the MobileNetV3EdgeTPU model."""
+    self._check_leading_endpoints(
+        'MobileNetV3EdgeTPU', 0.75, input_size,
+        [(1, 24), (1, 16), (2, 24), (2, 24)])
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v1_scaling(self, filter_size_scale):
+    """Checks the MobileNetV1 parameter count for each filter multiplier."""
+    self._check_param_count(
+        'MobileNetV1', filter_size_scale,
+        {1.0: 3228864, 0.75: 1832976})
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v2_scaling(self, filter_size_scale):
+    """Checks the MobileNetV2 parameter count for each filter multiplier."""
+    self._check_param_count(
+        'MobileNetV2', filter_size_scale,
+        {1.0: 2257984, 0.75: 1382064})
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v3_large_scaling(self, filter_size_scale):
+    """Checks the MobileNetV3Large parameter count for each multiplier."""
+    self._check_param_count(
+        'MobileNetV3Large', filter_size_scale,
+        {1.0: 4226432, 0.75: 2731616})
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v3_small_scaling(self, filter_size_scale):
+    """Checks the MobileNetV3Small parameter count for each multiplier."""
+    self._check_param_count(
+        'MobileNetV3Small', filter_size_scale,
+        {1.0: 1529968, 0.75: 1026552})
+
+  @parameterized.parameters(1.0, 0.75)
+  def test_mobilenet_v3_edgetpu_scaling(self, filter_size_scale):
+    """Checks the MobileNetV3EdgeTPU parameter count for each multiplier."""
+    self._check_param_count(
+        'MobileNetV3EdgeTPU', filter_size_scale,
+        {1.0: 2849312, 0.75: 1737288})
--- a/official/vision/beta/modeling/classification_model_test.py
+++ b/official/vision/beta/modeling/classification_model_test.py
@@ -77,6 +77,52 @@ class ClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
    logits = model(inputs)
    self.assertAllEqual([2, num_classes], logits.numpy().shape)

+  @combinations.generate(
+      combinations.combine(
+          mobilenet_model_id=[
+              'MobileNetV1',
+              'MobileNetV2',
+              'MobileNetV3Large',
+              'MobileNetV3Small',
+              'MobileNetV3EdgeTPU'
+          ],
+          filter_size_scale=[1.0, 0.75],
+      ))
+  def test_mobilenet_network_creation(self, mobilenet_model_id,
+                                      filter_size_scale):
+    """Builds a MobileNet classifier; checks its size and logits shape."""
+    # Expected parameter count of the full classifier (backbone plus head),
+    # keyed by (model id, filter multiplier).
+    expected_param_counts = {
+        ('MobileNetV1', 1.0): 4254889,
+        ('MobileNetV1', 0.75): 2602745,
+        ('MobileNetV2', 1.0): 3540265,
+        ('MobileNetV2', 0.75): 2664345,
+        ('MobileNetV3Large', 1.0): 5508713,
+        ('MobileNetV3Large', 0.75): 4013897,
+        ('MobileNetV3Small', 1.0): 2555993,
+        ('MobileNetV3Small', 0.75): 2052577,
+        ('MobileNetV3EdgeTPU', 1.0): 4131593,
+        ('MobileNetV3EdgeTPU', 0.75): 3019569,
+    }
+    num_classes = 1001
+
+    images = np.random.rand(2, 224, 224, 3)
+
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    mobilenet_backbone = backbones.MobileNet(
+        model_id=mobilenet_model_id, filter_size_scale=filter_size_scale)
+    classifier = classification_model.ClassificationModel(
+        backbone=mobilenet_backbone,
+        num_classes=num_classes,
+        dropout_rate=0.2,
+    )
+
+    case_key = (mobilenet_model_id, filter_size_scale)
+    self.assertEqual(classifier.count_params(),
+                     expected_param_counts[case_key])
+
+    logits = classifier(images)
+    self.assertAllEqual([2, num_classes], logits.numpy().shape)
+
  @combinations.generate(
      combinations.combine(
          strategy=[
@@ -129,7 +175,7 @@ class ClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
      _ = model(inputs)

  def test_serialize_deserialize(self):
-    """Validate the classification network can be serialized and deserialized."""
+    """Validate the classification net can be serialized and deserialized."""

    tf.keras.backend.set_image_data_format('channels_last')
    backbone = backbones.ResNet(model_id=50)

--- a/official/vision/beta/modeling/layers/nn_blocks.py
+++ b/official/vision/beta/modeling/layers/nn_blocks.py
@@ -14,10 +14,10 @@
 # ==============================================================================
 """Contains common building blocks for neural networks."""

-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Text

 # Import libraries
-
+from absl import logging
 import tensorflow as tf

 from official.modeling import tf_utils
@@ -391,7 +391,16 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
+               se_inner_activation='relu',
+               se_gating_activation='sigmoid',
+               expand_se_in_filters=False,
+               depthwise_activation=None,
               use_sync_bn=False,
+               dilation_rate=1,
+               divisible_by=1,
+               regularize_depthwise=False,
+               use_depthwise=True,
+               use_residual=True,
               norm_momentum=0.99,
               norm_epsilon=0.001,
               **kwargs):
@@ -414,7 +423,24 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
        Default to None.
      activation: `str` name of the activation function.
+      se_inner_activation: Squeeze excitation inner activation.
+      se_gating_activation: Squeeze excitation gating activation.
+      expand_se_in_filters: Whether or not to expand in_filter in squeeze and
+        excitation layer.
+      depthwise_activation: `str` name of the activation function for depthwise
+        only.
      use_sync_bn: if True, use synchronized batch normalization.
+      dilation_rate: `int` an integer specifying the dilation rate to use for
+        dilated convolution. Can be a single integer to specify the same value
+        for all spatial dimensions.
+      divisible_by: `int` ensures all inner dimensions are divisible by this
+        number.
+      regularize_depthwise: `bool` whether or not apply regularization on
+        depthwise.
+      use_depthwise: `bool` whether to use a depthwise convolution; if False,
+        a fused convolution is used instead.
+      use_residual: `bool` whether to include residual connection between
+        input and output.
      norm_momentum: `float` normalization omentum for the moving average.
      norm_epsilon: `float` small float added to variance to avoid dividing by
        zero.
@@ -428,14 +454,23 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
    self._strides = strides
    self._kernel_size = kernel_size
    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._dilation_rate = dilation_rate
    self._use_sync_bn = use_sync_bn
+    self._regularize_depthwise = regularize_depthwise
+    self._use_depthwise = use_depthwise
+    self._use_residual = use_residual
    self._activation = activation
+    self._se_inner_activation = se_inner_activation
+    self._se_gating_activation = se_gating_activation
+    self._depthwise_activation = depthwise_activation
    self._kernel_initializer = kernel_initializer
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
+    self._expand_se_in_filters = expand_se_in_filters

    if use_sync_bn:
      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
@@ -446,14 +481,30 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
    else:
      self._bn_axis = 1
    self._activation_fn = tf_utils.get_activation(activation)
+    if not depthwise_activation:
+      self._depthwise_activation = activation
+    self._depthwise_activation_fn = tf_utils.get_activation(
+        self._depthwise_activation)
+    if regularize_depthwise:
+      self._depthsize_regularizer = kernel_regularizer
+    else:
+      self._depthsize_regularizer = None

  def build(self, input_shape):
-    if self._expand_ratio != 1:
+    expand_filters = self._in_filters
+    if self._expand_ratio > 1:
      # First 1x1 conv for channel expansion.
+      expand_filters = nn_layers.make_divisible(
+          self._in_filters * self._expand_ratio, self._divisible_by)
+
+      expand_kernel = 1 if self._use_depthwise else self._kernel_size
+      expand_stride = 1 if self._use_depthwise else self._strides
+
      self._conv0 = tf.keras.layers.Conv2D(
-          filters=self._in_filters * self._expand_ratio,
-          kernel_size=1,
-          strides=1,
+          filters=expand_filters,
+          kernel_size=expand_kernel,
+          strides=expand_stride,
+          padding='same',
          use_bias=False,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
@@ -463,29 +514,39 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
          momentum=self._norm_momentum,
          epsilon=self._norm_epsilon)

-    # Depthwise conv.
-    self._conv1 = tf.keras.layers.DepthwiseConv2D(
-        kernel_size=(self._kernel_size, self._kernel_size),
-        strides=self._strides,
-        padding='same',
-        use_bias=False,
-        depthwise_initializer=self._kernel_initializer,
-        depthwise_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer)
-    self._norm1 = self._norm(
-        axis=self._bn_axis,
-        momentum=self._norm_momentum,
-        epsilon=self._norm_epsilon)
+    if self._use_depthwise:
+      # Depthwise conv.
+      self._conv1 = tf.keras.layers.DepthwiseConv2D(
+          kernel_size=(self._kernel_size, self._kernel_size),
+          strides=self._strides,
+          padding='same',
+          depth_multiplier=1,
+          dilation_rate=self._dilation_rate,
+          use_bias=False,
+          depthwise_initializer=self._kernel_initializer,
+          depthwise_regularizer=self._depthsize_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm1 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)

    # Squeeze and excitation.
-    if self._se_ratio is not None and self._se_ratio > 0 and self._se_ratio <= 1:
+    if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1:
+      logging.info('Use Squeeze and excitation.')
+      in_filters = self._in_filters
+      if self._expand_se_in_filters:
+        in_filters = expand_filters
      self._squeeze_excitation = nn_layers.SqueezeExcitation(
-          in_filters=self._in_filters,
+          in_filters=in_filters,
+          out_filters=expand_filters,
          se_ratio=self._se_ratio,
-          expand_ratio=self._expand_ratio,
+          divisible_by=self._divisible_by,
          kernel_initializer=self._kernel_initializer,
          kernel_regularizer=self._kernel_regularizer,
-          bias_regularizer=self._bias_regularizer)
+          bias_regularizer=self._bias_regularizer,
+          activation=self._se_inner_activation,
+          gating_activation=self._se_gating_activation)
    else:
      self._squeeze_excitation = None

@@ -494,6 +555,7 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
        filters=self._out_filters,
        kernel_size=1,
        strides=1,
+        padding='same',
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
@@ -519,12 +581,21 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
        'strides': self._strides,
        'kernel_size': self._kernel_size,
        'se_ratio': self._se_ratio,
+        'divisible_by': self._divisible_by,
        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
+        'se_inner_activation': self._se_inner_activation,
+        'se_gating_activation': self._se_gating_activation,
+        'expand_se_in_filters': self._expand_se_in_filters,
+        'depthwise_activation': self._depthwise_activation,
+        'dilation_rate': self._dilation_rate,
        'use_sync_bn': self._use_sync_bn,
+        'regularize_depthwise': self._regularize_depthwise,
+        'use_depthwise': self._use_depthwise,
+        'use_residual': self._use_residual,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon
    }
@@ -533,16 +604,17 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):

  def call(self, inputs, training=None):
    shortcut = inputs
-    if self._expand_ratio != 1:
+    if self._expand_ratio > 1:
      x = self._conv0(inputs)
      x = self._norm0(x)
      x = self._activation_fn(x)
    else:
      x = inputs

-    x = self._conv1(x)
-    x = self._norm1(x)
-    x = self._activation_fn(x)
+    if self._use_depthwise:
+      x = self._conv1(x)
+      x = self._norm1(x)
+      x = self._depthwise_activation_fn(x)

    if self._squeeze_excitation:
      x = self._squeeze_excitation(x)
@@ -550,7 +622,9 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
    x = self._conv2(x)
    x = self._norm2(x)

-    if self._in_filters == self._out_filters and self._strides == 1:
+    if (self._use_residual and
+        self._in_filters == self._out_filters and
+        self._strides == 1):
      if self._stochastic_depth:
        x = self._stochastic_depth(x, training=training)
      x = tf.add(x, shortcut)
@@ -570,9 +644,8 @@ class ResidualInner(tf.keras.layers.Layer):
      self,
      filters: int,
      strides: int,
-      kernel_initializer: Union[
-          str, Callable[..., tf.keras.initializers.Initializer]]
-      = 'VarianceScaling',
+      kernel_initializer: Union[str, Callable[
+          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
      use_sync_bn: bool = False,
@@ -693,9 +766,8 @@ class BottleneckResidualInner(tf.keras.layers.Layer):
      self,
      filters: int,
      strides: int,
-      kernel_initializer: Union[
-          str, Callable[..., tf.keras.initializers.Initializer]]
-      = 'VarianceScaling',
+      kernel_initializer: Union[str, Callable[
+          ..., tf.keras.initializers.Initializer]] = 'VarianceScaling',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
      use_sync_bn: bool = False,
@@ -886,8 +958,10 @@ class ReversibleLayer(tf.keras.layers.Layer):
      self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:

    @tf.custom_gradient
-    def reversible(x: tf.Tensor) -> Tuple[
-        tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor], List[tf.Tensor]]]]:
+    def reversible(
+        x: tf.Tensor
+    ) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor],
+                                                List[tf.Tensor]]]]:
      """Implements Algorithm 1 in RevNet paper.

      Paper: https://arxiv.org/pdf/1707.04585.pdf
@@ -926,7 +1000,7 @@ class ReversibleLayer(tf.keras.layers.Layer):

      def grad_fn(dy: tf.Tensor,
                  variables: Optional[List[tf.Variable]] = None,
-                 ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
+                  ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
        """Given dy calculate (dy/dx)|_{x_{input}} using f/g."""
        if irreversible or not self._manual_grads:
          grads_combined = fwdtape.gradient(
@@ -947,11 +1021,11 @@ class ReversibleLayer(tf.keras.layers.Layer):
          self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]

          # Algorithm 1 in paper (line # documented in-line)
-          z1 = y1_nograd                                         # line 2
+          z1 = y1_nograd  # line 2
          with tf.GradientTape() as gtape:
            gtape.watch(z1)
            g_z1 = self._g(z1, training=training)
-          x2 = y2_nograd - g_z1                                  # line 3
+          x2 = y2_nograd - g_z1  # line 3

          with tf.GradientTape() as ftape:
            ftape.watch(x2)
@@ -963,16 +1037,16 @@ class ReversibleLayer(tf.keras.layers.Layer):
              g_z1,
              [z1] + self._g.trainable_variables,
              output_gradients=dy2)
-          dz1 = dy1 + g_grads_combined[0]                        # line 5
-          dwg = g_grads_combined[1:]                             # line 9
+          dz1 = dy1 + g_grads_combined[0]  # line 5
+          dwg = g_grads_combined[1:]  # line 9

          f_grads_combined = ftape.gradient(
              f_x2,
              [x2] + self._f.trainable_variables,
              output_gradients=dz1)
-          dx2 = dy2 + f_grads_combined[0]                        # line 6
-          dwf = f_grads_combined[1:]                             # line 8
-          dx1 = dz1                                              # line 7
+          dx2 = dy2 + f_grads_combined[0]  # line 6
+          dwf = f_grads_combined[1:]  # line 8
+          dx1 = dz1  # line 7

          # Pack the input and variable gradients.
          dx = tf.concat([dx1, dx2], axis=self._axis)
@@ -989,3 +1063,130 @@ class ReversibleLayer(tf.keras.layers.Layer):

    activations = reversible(inputs)
    return activations
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DepthwiseSeparableConvBlock(tf.keras.layers.Layer):
+  """A depthwise separable convolution block with batch normalization."""
+
+  def __init__(
+      self,
+      filters: int,
+      kernel_size: int = 3,
+      strides: int = 1,
+      regularize_depthwise: bool = False,
+      activation: Text = 'relu6',
+      kernel_initializer: Text = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      dilation_rate: int = 1,
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      **kwargs):
+    """Depthwise conv followed by a 1x1 conv, each with norm and activation.
+
+    Args:
+      filters: `int` number of output filters of the pointwise (1x1)
+        convolution.
+      kernel_size: `int` an integer specifying the height and width of the
+        depthwise convolution window.
+      strides: `int` stride of the depthwise convolution. If greater than 1,
+        this block downsamples the input.
+      regularize_depthwise: if True, apply regularization on depthwise.
+      activation: `str` name of the activation function.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      dilation_rate: an integer or tuple/list of 2 integers, specifying
+        the dilation rate to use for dilated convolution.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(DepthwiseSeparableConvBlock, self).__init__(**kwargs)
+    self._filters = filters
+    self._kernel_size = kernel_size
+    self._strides = strides
+    self._activation = activation
+    self._regularize_depthwise = regularize_depthwise
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._dilation_rate = dilation_rate
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+    self._norm = (tf.keras.layers.experimental.SyncBatchNormalization
+                  if use_sync_bn else tf.keras.layers.BatchNormalization)
+    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
+    self._bn_axis = -1 if channels_last else 1
+    self._activation_fn = tf_utils.get_activation(activation)
+    # The depthwise kernel is only regularized when explicitly requested.
+    self._depthsize_regularizer = (
+        kernel_regularizer if regularize_depthwise else None)
+
+  def get_config(self):
+    """Returns the constructor arguments needed to re-create this block."""
+    config = {
+        'filters': self._filters,
+        'kernel_size': self._kernel_size,
+        'strides': self._strides,
+        'regularize_depthwise': self._regularize_depthwise,
+        'activation': self._activation,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'dilation_rate': self._dilation_rate,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(DepthwiseSeparableConvBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def build(self, input_shape):
+    """Creates the depthwise and pointwise convolutions and their norms."""
+    # DepthwiseConv2D takes `depthwise_*` arguments; `kernel_*` ones would not
+    # reach the depthwise kernel, leaving it un-regularized.
+    self._dwconv0 = tf.keras.layers.DepthwiseConv2D(
+        kernel_size=self._kernel_size,
+        strides=self._strides,
+        padding='same',
+        depth_multiplier=1,
+        dilation_rate=self._dilation_rate,
+        depthwise_initializer=self._kernel_initializer,
+        depthwise_regularizer=self._depthsize_regularizer,
+        use_bias=False)
+    self._norm0 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Pointwise (1x1) convolution that produces `filters` output channels.
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    super(DepthwiseSeparableConvBlock, self).build(input_shape)
+
+  def call(self, inputs, training=None):
+    """Applies depthwise conv -> norm -> act, then 1x1 conv -> norm -> act."""
+    x = self._dwconv0(inputs)
+    x = self._norm0(x)
+    x = self._activation_fn(x)
+
+    x = self._conv1(x)
+    x = self._norm1(x)
+    return self._activation_fn(x)
--- a/official/vision/beta/modeling/layers/nn_layers.py
+++ b/official/vision/beta/modeling/layers/nn_layers.py
@@ -14,46 +14,99 @@
 # ==============================================================================
 """Contains common building blocks for neural networks."""

+from typing import Optional
+
 # Import libraries
+
+from absl import logging
 import tensorflow as tf

 from official.modeling import tf_utils


+def make_divisible(value: float,
+                   divisor: int,
+                   min_value: Optional[float] = None
+                   ) -> int:
+  """Rounds `value` to the nearest multiple of `divisor`, with a lower bound.
+
+  Args:
+    value: `float` original value, e.g. a scaled number of channels.
+    divisor: `int` the divisor the rounded value should be a multiple of.
+    min_value: `float` lower bound for the result. Defaults to `divisor`.
+
+  Returns:
+    The adjusted value as an `int`.
+  """
+  if min_value is None:
+    min_value = divisor
+  new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+  # Make sure that round down does not go down by more than 10%.
+  if new_value < 0.9 * value:
+    new_value += divisor
+  return int(new_value)  # `max` yields a float when `min_value` is a float.
+
+
+def round_filters(filters: int,
+                  multiplier: float,
+                  divisor: int = 8,
+                  min_depth: Optional[int] = None,
+                  skip: bool = False):
+  """Scales `filters` by `multiplier` and rounds it with `make_divisible`.
+
+  Returns `filters` untouched when `skip` is set or `multiplier` is falsy;
+  otherwise `min_depth` is forwarded to `make_divisible` as its `min_value`.
+  """
+  if not skip and multiplier:
+    scaled = make_divisible(
+        value=filters * multiplier, divisor=divisor, min_value=min_depth)
+    logging.info('round_filter input=%s output=%s', filters, scaled)
+    return int(scaled)
+  return filters
+
+
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class SqueezeExcitation(tf.keras.layers.Layer):
  """Squeeze and excitation layer."""

  def __init__(self,
               in_filters,
+               out_filters,
               se_ratio,
-               expand_ratio,
+               divisible_by=1,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
+               gating_activation='sigmoid',
               **kwargs):
    """Implementation for squeeze and excitation.

    Args:
      in_filters: `int` number of filters of the input tensor.
+      out_filters: `int` number of filters of the output tensor.
      se_ratio: `float` or None. If not None, se ratio for the squeeze and
        excitation layer.
-      expand_ratio: `int` expand_ratio for a MBConv block.
+      divisible_by: `int` ensures all inner dimensions are divisible by this
+        number.
      kernel_initializer: kernel_initializer for convolutional layers.
      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
        Default to None.
      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
        Default to None.
      activation: `str` name of the activation function.
+      gating_activation: `str` name of the activation function for final gating
+        function.
      **kwargs: keyword arguments to be passed.
    """
    super(SqueezeExcitation, self).__init__(**kwargs)

    self._in_filters = in_filters
+    self._out_filters = out_filters
    self._se_ratio = se_ratio
-    self._expand_ratio = expand_ratio
+    self._divisible_by = divisible_by
    self._activation = activation
+    self._gating_activation = gating_activation
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
@@ -62,9 +115,12 @@ class SqueezeExcitation(tf.keras.layers.Layer):
    else:
      self._spatial_axis = [2, 3]
    self._activation_fn = tf_utils.get_activation(activation)
+    self._gating_activation_fn = tf_utils.get_activation(gating_activation)

  def build(self, input_shape):
-    num_reduced_filters = max(1, int(self._in_filters * self._se_ratio))
+    num_reduced_filters = make_divisible(
+        max(1, int(self._in_filters * self._se_ratio)),
+        divisor=self._divisible_by)

    self._se_reduce = tf.keras.layers.Conv2D(
        filters=num_reduced_filters,
@@ -77,7 +133,7 @@ class SqueezeExcitation(tf.keras.layers.Layer):
        bias_regularizer=self._bias_regularizer)

    self._se_expand = tf.keras.layers.Conv2D(
-        filters=self._in_filters * self._expand_ratio,
+        filters=self._out_filters,
        kernel_size=1,
        strides=1,
        padding='same',
@@ -91,22 +147,24 @@ class SqueezeExcitation(tf.keras.layers.Layer):
  def get_config(self):
    config = {
        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
        'se_ratio': self._se_ratio,
-        'expand_ratio': self._expand_ratio,
+        'divisible_by': self._divisible_by,
        'strides': self._strides,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
+        'gating_activation': self._gating_activation,
    }
    base_config = super(SqueezeExcitation, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
-    x = self._se_expand(self._activation_fn(self._se_reduce(x)))
-
-    return tf.sigmoid(x) * inputs
+    x = self._activation_fn(self._se_reduce(x))
+    x = self._gating_activation_fn(self._se_expand(x))
+    return x * inputs


 @tf.keras.utils.register_keras_serializable(package='Vision')