Commit cf80ed4e authored by anivegesana

Merge branch 'purdue-yolo' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 394cefcc 461b3587
......@@ -12,80 +12,124 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Contains the factory method to create decoders."""
"""Decoder registers and factory method.
from typing import Mapping, Optional
One can register a new decoder model in two steps:
1. Import the factory and register the builder in the decoder file.
2. Import the decoder class in decoders/__init__.py.
```
# my_decoder.py
from modeling.decoders import factory
class MyDecoder():
...
@factory.register_decoder_builder('my_decoder')
def build_my_decoder():
return MyDecoder()
# decoders/__init__.py adds import
from modeling.decoders.my_decoder import MyDecoder
```
If the MyDecoder class should be used only by a specific binary, do not
import the decoder module in decoders/__init__.py; instead, import it in
the place that uses it.
"""
from typing import Any, Callable, Mapping, Optional, Union
# Import libraries
import tensorflow as tf
from official.core import registry
from official.modeling import hyperparams
from official.vision.beta.modeling import decoders
_REGISTERED_DECODER_CLS = {}
def register_decoder_builder(key: str) -> Callable[..., Any]:
"""Decorates a builder of decoder class.
The builder should be a Callable (a class or a function).
This decorator supports registration of decoder builder as follows:
```
class MyDecoder(tf.keras.Model):
pass
@register_decoder_builder('mydecoder')
def builder(input_specs, config, l2_reg):
return MyDecoder(...)
# Builds a MyDecoder object.
my_decoder = build_decoder(input_specs, config, l2_reg)
```
Args:
key: A `str` of key to look up the builder.
Returns:
A callable for use as a class decorator that registers the decorated
builder for lookup via `build_decoder`.
"""
return registry.register(_REGISTERED_DECODER_CLS, key)
@register_decoder_builder('identity')
def build_identity(
input_specs: Optional[Mapping[str, tf.TensorShape]] = None,
model_config: Optional[hyperparams.Config] = None,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None) -> None:
"""Builds identity decoder from a config.
The input arguments are not used by the identity decoder; they are kept to
ensure the builder interface stays consistent.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to
None.
Returns:
None, so that features from the backbone are passed through unchanged.
"""
del input_specs, model_config, l2_regularizer # Unused by identity decoder.
def build_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs) -> Union[None, tf.keras.Model, tf.keras.layers.Layer]:
"""Builds decoder from a config.
A decoder can be a keras.Model, a keras.layers.Layer, or None. If it is not
None, the decoder will take features from the backbone as input and generate
decoded feature maps. If it is None, such as an identity decoder, the decoder
is skipped and features from the backbone are regarded as model output.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A OneOfConfig. Model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to
None.
**kwargs: Additional keyword args to be passed to decoder builder.
Returns:
A `tf.keras.Model` instance of the decoder.
An instance of the decoder.
"""
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
norm_activation_config = model_config.norm_activation
if decoder_type == 'identity':
decoder = None
elif decoder_type == 'fpn':
decoder = decoders.FPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif decoder_type == 'nasfpn':
decoder = decoders.NASFPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
num_repeats=decoder_cfg.num_repeats,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif decoder_type == 'aspp':
decoder = decoders.ASPP(
level=decoder_cfg.level,
dilation_rates=decoder_cfg.dilation_rates,
num_filters=decoder_cfg.num_filters,
pool_kernel_size=decoder_cfg.pool_kernel_size,
dropout_rate=decoder_cfg.dropout_rate,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
activation=norm_activation_config.activation,
kernel_regularizer=l2_regularizer)
else:
raise ValueError('Decoder {!r} not implement'.format(decoder_type))
return decoder
decoder_builder = registry.lookup(_REGISTERED_DECODER_CLS,
model_config.decoder.type)
return decoder_builder(
input_specs=input_specs,
model_config=model_config,
l2_regularizer=l2_regularizer,
**kwargs)
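As a quick usage sketch (mirroring the tests below), a decoder is built purely from `model_config.decoder.type`; since the result may be `None` (identity), a `tf.keras.layers.Layer`, or a `tf.keras.Model`, callers should handle the `None` case:

```python
import tensorflow as tf

from official.vision.beta import configs
from official.vision.beta.configs import decoders as decoders_cfg
from official.vision.beta.modeling.decoders import factory

# {level: TensorShape} specs as produced by a backbone.
input_specs = {
    str(level): tf.TensorShape([1, 128 // 2**level, 128 // 2**level, 3])
    for level in range(3, 7)
}

model_config = configs.retinanet.RetinaNet()
model_config.min_level = 3
model_config.max_level = 7
model_config.decoder = decoders_cfg.Decoder(
    type='fpn', fpn=decoders_cfg.FPN(num_filters=256))

decoder = factory.build_decoder(
    input_specs=input_specs, model_config=model_config)
if decoder is None:
  # Identity decoder: backbone features are used as the model output.
  pass
```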
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for decoder factory functions."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from official.vision.beta import configs
from official.vision.beta.configs import decoders as decoders_cfg
from official.vision.beta.modeling import decoders
from official.vision.beta.modeling.decoders import factory
class FactoryTest(tf.test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(
num_filters=[128, 256], use_separable_conv=[True, False]))
def test_fpn_decoder_creation(self, num_filters, use_separable_conv):
"""Test creation of FPN decoder."""
min_level = 3
max_level = 7
input_specs = {}
for level in range(min_level, max_level):
input_specs[str(level)] = tf.TensorShape(
[1, 128 // (2**level), 128 // (2**level), 3])
network = decoders.FPN(
input_specs=input_specs,
num_filters=num_filters,
use_separable_conv=use_separable_conv,
use_sync_bn=True)
model_config = configs.retinanet.RetinaNet()
model_config.min_level = min_level
model_config.max_level = max_level
model_config.num_classes = 10
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='fpn',
fpn=decoders_cfg.FPN(
num_filters=num_filters, use_separable_conv=use_separable_conv))
factory_network = factory.build_decoder(
input_specs=input_specs, model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(
num_filters=[128, 256],
num_repeats=[3, 5],
use_separable_conv=[True, False]))
def test_nasfpn_decoder_creation(self, num_filters, num_repeats,
use_separable_conv):
"""Test creation of NASFPN decoder."""
min_level = 3
max_level = 7
input_specs = {}
for level in range(min_level, max_level):
input_specs[str(level)] = tf.TensorShape(
[1, 128 // (2**level), 128 // (2**level), 3])
network = decoders.NASFPN(
input_specs=input_specs,
num_filters=num_filters,
num_repeats=num_repeats,
use_separable_conv=use_separable_conv,
use_sync_bn=True)
model_config = configs.retinanet.RetinaNet()
model_config.min_level = min_level
model_config.max_level = max_level
model_config.num_classes = 10
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='nasfpn',
nasfpn=decoders_cfg.NASFPN(
num_filters=num_filters,
num_repeats=num_repeats,
use_separable_conv=use_separable_conv))
factory_network = factory.build_decoder(
input_specs=input_specs, model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(
level=[3, 4],
dilation_rates=[[6, 12, 18], [6, 12]],
num_filters=[128, 256]))
def test_aspp_decoder_creation(self, level, dilation_rates, num_filters):
"""Test creation of ASPP decoder."""
input_specs = {'1': tf.TensorShape([1, 128, 128, 3])}
network = decoders.ASPP(
level=level,
dilation_rates=dilation_rates,
num_filters=num_filters,
use_sync_bn=True)
model_config = configs.semantic_segmentation.SemanticSegmentationModel()
model_config.num_classes = 10
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='aspp',
aspp=decoders_cfg.ASPP(
level=level, dilation_rates=dilation_rates,
num_filters=num_filters))
factory_network = factory.build_decoder(
input_specs=input_specs, model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
def test_identity_decoder_creation(self):
"""Test creation of identity decoder."""
model_config = configs.retinanet.RetinaNet()
model_config.num_classes = 2
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='identity', identity=decoders_cfg.Identity())
factory_network = factory.build_decoder(
input_specs=None, model_config=model_config)
self.assertIsNone(factory_network)
if __name__ == '__main__':
tf.test.main()
......@@ -16,9 +16,12 @@
from typing import Any, Mapping, Optional
# Import libraries
import tensorflow as tf
from official.modeling import hyperparams
from official.modeling import tf_utils
from official.vision.beta.modeling.decoders import factory
from official.vision.beta.ops import spatial_transform_ops
......@@ -187,3 +190,43 @@ class FPN(tf.keras.Model):
def output_specs(self) -> Mapping[str, tf.TensorShape]:
"""A dict of {level: TensorShape} pairs for the model output."""
return self._output_specs
@factory.register_decoder_builder('fpn')
def build_fpn_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds FPN decoder from a config.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
None.
Returns:
A `tf.keras.Model` instance of the FPN decoder.
Raises:
ValueError: If the model_config.decoder.type is not `fpn`.
"""
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
if decoder_type != 'fpn':
raise ValueError(f'Inconsistent decoder type {decoder_type}. '
'Need to be `fpn`.')
norm_activation_config = model_config.norm_activation
return FPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
......@@ -19,6 +19,7 @@
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.modeling.backbones import mobilenet
from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import fpn
......@@ -52,6 +53,33 @@ class FPNTest(parameterized.TestCase, tf.test.TestCase):
[1, input_size // 2**level, input_size // 2**level, 256],
feats[str(level)].shape.as_list())
@parameterized.parameters(
(256, 3, 7, False),
(256, 3, 7, True),
)
def test_network_creation_with_mobilenet(self, input_size, min_level,
max_level, use_separable_conv):
"""Test creation of FPN with mobilenet backbone."""
tf.keras.backend.set_image_data_format('channels_last')
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
backbone = mobilenet.MobileNet(model_id='MobileNetV2')
network = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
endpoints = backbone(inputs)
feats = network(endpoints)
for level in range(min_level, max_level + 1):
self.assertIn(str(level), feats)
self.assertAllEqual(
[1, input_size // 2**level, input_size // 2**level, 256],
feats[str(level)].shape.as_list())
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
......
......@@ -13,12 +13,16 @@
# limitations under the License.
"""Contains definitions of NAS-FPN."""
from typing import Any, Mapping, List, Tuple, Optional
from typing import Any, List, Mapping, Optional, Tuple
# Import libraries
from absl import logging
import tensorflow as tf
from official.modeling import hyperparams
from official.vision.beta.modeling.decoders import factory
from official.vision.beta.ops import spatial_transform_ops
......@@ -316,3 +320,45 @@ class NASFPN(tf.keras.Model):
def output_specs(self) -> Mapping[str, tf.TensorShape]:
"""A dict of {level: TensorShape} pairs for the model output."""
return self._output_specs
@factory.register_decoder_builder('nasfpn')
def build_nasfpn_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds NASFPN decoder from a config.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
None.
Returns:
A `tf.keras.Model` instance of the NASFPN decoder.
Raises:
ValueError: If the model_config.decoder.type is not `nasfpn`.
"""
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
if decoder_type != 'nasfpn':
raise ValueError(f'Inconsistent decoder type {decoder_type}. '
'Need to be `nasfpn`.')
norm_activation_config = model_config.norm_activation
return NASFPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
num_repeats=decoder_cfg.num_repeats,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
......@@ -24,10 +24,10 @@ from official.vision.beta.configs import retinanet as retinanet_cfg
from official.vision.beta.configs import semantic_segmentation as segmentation_cfg
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling import classification_model
from official.vision.beta.modeling import decoders
from official.vision.beta.modeling import maskrcnn_model
from official.vision.beta.modeling import retinanet_model
from official.vision.beta.modeling import segmentation_model
from official.vision.beta.modeling.decoders import factory as decoder_factory
from official.vision.beta.modeling.heads import dense_prediction_heads
from official.vision.beta.modeling.heads import instance_heads
from official.vision.beta.modeling.heads import segmentation_heads
......@@ -78,7 +78,7 @@ def build_maskrcnn(
l2_regularizer=l2_regularizer)
backbone(tf.keras.Input(input_specs.shape[1:]))
decoder = decoder_factory.build_decoder(
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
......@@ -253,7 +253,7 @@ def build_retinanet(
l2_regularizer=l2_regularizer)
backbone(tf.keras.Input(input_specs.shape[1:]))
decoder = decoder_factory.build_decoder(
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
......@@ -313,7 +313,7 @@ def build_segmentation_model(
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
decoder = decoder_factory.build_decoder(
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
......
......@@ -26,10 +26,6 @@ from official.modeling import tf_utils
States = Dict[str, tf.Tensor]
Activation = Union[str, Callable]
# TODO(dankondratyuk): keep legacy padding until new checkpoints are trained.
# Otherwise, accuracy will be affected.
LEGACY_PADDING = True
def make_divisible(value: float,
divisor: int,
......@@ -89,6 +85,22 @@ def hard_swish(x: tf.Tensor) -> tf.Tensor:
tf.keras.utils.get_custom_objects().update({'hard_swish': hard_swish})
def simple_swish(x: tf.Tensor) -> tf.Tensor:
"""A swish/silu activation function without custom gradients.
Useful for exporting to SavedModel to avoid custom gradient warnings.
Args:
x: the input tensor.
Returns:
The activation output.
"""
return x * tf.math.sigmoid(x)
tf.keras.utils.get_custom_objects().update({'simple_swish': simple_swish})
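Because `simple_swish` is registered with the Keras custom objects above, it can be referenced by its string name, just like `hard_swish`; a small sketch:

```python
import tensorflow as tf

# Importing nn_layers registers 'simple_swish' with Keras custom objects.
from official.vision.beta.modeling.layers import nn_layers  # noqa: F401

activation = tf.keras.layers.Activation('simple_swish')
outputs = activation(tf.constant([-1.0, 0.0, 1.0]))  # computes x * sigmoid(x)
```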
@tf.keras.utils.register_keras_serializable(package='Vision')
class SqueezeExcitation(tf.keras.layers.Layer):
"""Creates a squeeze and excitation layer."""
......@@ -752,14 +764,10 @@ class CausalConvMixin:
(self.kernel_size[i] - 1) * (self.dilation_rate[i] - 1))
for i in range(self.rank)
]
if LEGACY_PADDING:
# Apply legacy padding that does not take into account spatial strides
pad_total = [kernel_size_effective[i] - 1 for i in range(self.rank)]
else:
pad_total = [kernel_size_effective[0] - 1]
for i in range(1, self.rank):
overlap = (input_shape[i] - 1) % self.strides[i] + 1
pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))
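# Pad the leading (causal) dimension by the full effective kernel size minus
# one; for the spatial dimensions, compute SAME-style padding that accounts
# for how much of the effective kernel already overlaps the input given the
# stride.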
pad_total = [kernel_size_effective[0] - 1]
for i in range(1, self.rank):
overlap = (input_shape[i] - 1) % self.strides[i] + 1
pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))
pad_beg = [pad_total[i] // 2 for i in range(self.rank)]
pad_end = [pad_total[i] - pad_beg[i] for i in range(self.rank)]
padding = [[pad_beg[i], pad_end[i]] for i in range(self.rank)]
......
......@@ -24,10 +24,6 @@ from official.vision.beta.modeling.layers import nn_layers
class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
def setUp(self):
super().setUp()
nn_layers.LEGACY_PADDING = False
def test_hard_swish(self):
activation = tf.keras.layers.Activation('hard_swish')
output = activation(tf.constant([-3, -1.5, 0, 3]))
......
......@@ -50,6 +50,60 @@ def yxyx_to_xywh(boxes):
return new_boxes
def yxyx_to_cycxhw(boxes):
"""Converts box corner coordinates to center plus height and width terms.
Args:
boxes: a `Tensor` with last dimension of 4, representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a `Tensor` with the same shape as the input boxes, in the format
of cy, cx, height, width.
Raises:
ValueError: if the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('Last dimension of boxes must be 4 but is {:d}'.format(
boxes.shape[-1]))
boxes_ycenter = (boxes[..., 0] + boxes[..., 2]) / 2
boxes_xcenter = (boxes[..., 1] + boxes[..., 3]) / 2
boxes_height = boxes[..., 2] - boxes[..., 0]
boxes_width = boxes[..., 3] - boxes[..., 1]
new_boxes = tf.stack(
[boxes_ycenter, boxes_xcenter, boxes_height, boxes_width], axis=-1)
return new_boxes
def cycxhw_to_yxyx(boxes):
"""Converts box center coordinates plus height and width terms to corner.
Args:
boxes: a `Tensor` whose last dimension is 4, representing the coordinates
of boxes in cy, cx, height, width order.
Returns:
boxes: a `Tensor` with the same shape as the input, in ymin, xmin, ymax, xmax order.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
boxes_ymin = boxes[..., 0] - boxes[..., 2] / 2
boxes_xmin = boxes[..., 1] - boxes[..., 3] / 2
boxes_ymax = boxes[..., 0] + boxes[..., 2] / 2
boxes_xmax = boxes[..., 1] + boxes[..., 3] / 2
new_boxes = tf.stack([
boxes_ymin, boxes_xmin, boxes_ymax, boxes_xmax], axis=-1)
return new_boxes
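For illustration, a minimal round trip between the two formats (a sketch with arbitrary values, assuming this module is importable as `official.vision.beta.ops.box_ops`):

```python
import tensorflow as tf

from official.vision.beta.ops import box_ops

# One box in ymin, xmin, ymax, xmax order.
boxes = tf.constant([[10.0, 20.0, 50.0, 100.0]])

centers = box_ops.yxyx_to_cycxhw(boxes)   # [[30., 60., 40., 80.]] (cy, cx, h, w)
corners = box_ops.cycxhw_to_yxyx(centers)  # back to [[10., 20., 50., 100.]]
```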
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
......
......@@ -8,6 +8,8 @@ This repository is the official implementation of
[MoViNets: Mobile Video Networks for Efficient Video
Recognition](https://arxiv.org/abs/2103.11511).
**[UPDATE 2021-07-12] Mobile Models Available via [TF Lite](#tf-lite-streaming-models)**
<p align="center">
<img src="https://storage.googleapis.com/tf_model_garden/vision/movinet/artifacts/hoverboard_stream.gif" height=500>
</p>
......@@ -53,6 +55,8 @@ approach that performs redundant computation and limits temporal scope.
## History
- **2021-07-12** Add TF Lite support and replace 3D stream models with
mobile-friendly (2+1)D stream.
- **2021-05-30** Add streaming MoViNet checkpoints and examples.
- **2021-05-11** Initial Commit.
......@@ -68,6 +72,7 @@ approach that performs redundant computation and limits temporal scope.
- [Results and Pretrained Weights](#results-and-pretrained-weights)
- [Kinetics 600](#kinetics-600)
- [Prediction Examples](#prediction-examples)
- [TF Lite Example](#tf-lite-example)
- [Training and Evaluation](#training-and-evaluation)
- [References](#references)
- [License](#license)
......@@ -108,10 +113,14 @@ MoViNet-A5.
#### Base Models
Base models implement standard 3D convolutions without stream buffers.
Base models implement standard 3D convolutions without stream buffers. Base
models are not recommended for fast inference on CPU or mobile due to
limited support for
[`tf.nn.conv3d`](https://www.tensorflow.org/api_docs/python/tf/nn/conv3d).
Instead, see the [streaming models section](#streaming-models).
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape | GFLOPs\* | Chekpoint | TF Hub SavedModel |
|------------|----------------|----------------|-------------|----------|-----------|-------------------|
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape | GFLOPs\* | Checkpoint | TF Hub SavedModel |
|------------|----------------|----------------|-------------|----------|------------|-------------------|
| MoViNet-A0-Base | 72.28 | 90.92 | 50 x 172 x 172 | 2.7 | [checkpoint (12 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_base.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a0/base/kinetics-600/classification/) |
| MoViNet-A1-Base | 76.69 | 93.40 | 50 x 172 x 172 | 6.0 | [checkpoint (18 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a1_base.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a1/base/kinetics-600/classification/) |
| MoViNet-A2-Base | 78.62 | 94.17 | 50 x 224 x 224 | 10 | [checkpoint (20 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a2_base.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a2/base/kinetics-600/classification/) |
......@@ -123,10 +132,19 @@ Base models implement standard 3D convolutions without stream buffers.
#### Streaming Models
Streaming models implement causal 3D convolutions with stream buffers.
Streaming models implement causal (2+1)D convolutions with stream buffers.
Streaming models use (2+1)D convolution instead of 3D to utilize optimized
[`tf.nn.conv2d`](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d)
operations, which offer fast inference on CPU. Streaming models can be run on
individual frames or on larger video clips like base models.
Note: A3, A4, and A5 models use a positional encoding in the squeeze-excitation
blocks, while A0, A1, and A2 do not. Without positional encoding, accuracy is
unaffected for the smaller models but significantly worse for the larger ones.
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape\* | GFLOPs\*\* | Chekpoint | TF Hub SavedModel |
|------------|----------------|----------------|---------------|------------|-----------|-------------------|
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape\* | GFLOPs\*\* | Checkpoint | TF Hub SavedModel |
|------------|----------------|----------------|---------------|------------|------------|-------------------|
| MoViNet-A0-Stream | 72.05 | 90.63 | 50 x 172 x 172 | 2.7 | [checkpoint (12 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_stream.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a0/stream/kinetics-600/classification/) |
| MoViNet-A1-Stream | 76.45 | 93.25 | 50 x 172 x 172 | 6.0 | [checkpoint (18 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a1_stream.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a1/stream/kinetics-600/classification/) |
| MoViNet-A2-Stream | 78.40 | 94.05 | 50 x 224 x 224 | 10 | [checkpoint (20 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a2_stream.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a2/stream/kinetics-600/classification/) |
......@@ -139,6 +157,35 @@ duration of the 10-second clip.
\*\*GFLOPs per video on Kinetics 600.
Note: current streaming model checkpoints have been updated with a slightly
different architecture. To download the old checkpoints, insert `_legacy` before
`.tar.gz` in the URL. E.g., `movinet_a0_stream_legacy.tar.gz`.
##### TF Lite Streaming Models
For convenience, we provide converted TF Lite models for inference on mobile
devices. See the [TF Lite Example](#tf-lite-example) to export and run your own
models.
For reference, MoViNet-A0-Stream runs with a similar latency to
[MobileNetV3-Large](https://tfhub.dev/google/imagenet/mobilenet_v3_large_100_224/classification/)
with +5% accuracy on Kinetics 600.
| Model Name | Input Shape | Pixel 4 Latency\* | x86 Latency\* | TF Lite Binary |
|------------|-------------|-------------------|---------------|----------------|
| MoViNet-A0-Stream | 1 x 1 x 172 x 172 | 22 ms | 16 ms | [TF Lite (13 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_stream.tflite) |
| MoViNet-A1-Stream | 1 x 1 x 172 x 172 | 42 ms | 33 ms | [TF Lite (45 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a1_stream.tflite) |
| MoViNet-A2-Stream | 1 x 1 x 224 x 224 | 200 ms | 66 ms | [TF Lite (53 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a2_stream.tflite) |
| MoViNet-A3-Stream | 1 x 1 x 256 x 256 | - | 120 ms | [TF Lite (73 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a3_stream.tflite) |
| MoViNet-A4-Stream | 1 x 1 x 290 x 290 | - | 300 ms | [TF Lite (101 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a4_stream.tflite) |
| MoViNet-A5-Stream | 1 x 1 x 320 x 320 | - | 450 ms | [TF Lite (153 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a5_stream.tflite) |
\*Single-frame latency measured with unaltered float32 operations on a
single CPU core. Observed latency may differ depending on hardware
configuration. Measured on a stock Pixel 4 (Android 11) and x86 Intel Xeon
W-2135 CPU.
## Prediction Examples
Please check out our [Colab Notebook](https://colab.research.google.com/github/tensorflow/models/tree/master/official/vision/beta/projects/movinet/movinet_tutorial.ipynb)
......@@ -146,7 +193,7 @@ to get started with MoViNets.
This section provides examples on how to run prediction.
For base models, run the following:
For **base models**, run the following:
```python
import tensorflow as tf
......@@ -181,7 +228,7 @@ output = model(inputs)
prediction = tf.argmax(output, -1)
```
For streaming models, run the following:
For **streaming models**, run the following:
```python
import tensorflow as tf
......@@ -189,20 +236,31 @@ import tensorflow as tf
from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model
model_id = 'a0'
use_positional_encoding = model_id in {'a3', 'a4', 'a5'}
# Create backbone and model.
backbone = movinet.Movinet(
model_id='a0',
model_id=model_id,
causal=True,
conv_type='2plus1d',
se_type='2plus3d',
activation='hard_swish',
gating_activation='hard_sigmoid',
use_positional_encoding=use_positional_encoding,
use_external_states=True,
)
model = movinet_model.MovinetClassifier(
backbone, num_classes=600, output_states=True)
backbone,
num_classes=600,
output_states=True)
# Create your example input here.
# Refer to the paper for recommended input shapes.
inputs = tf.ones([1, 8, 172, 172, 3])
# [Optional] Build the model and load a pretrained checkpoint
# [Optional] Build the model and load a pretrained checkpoint.
model.build(inputs.shape)
checkpoint_dir = '/path/to/checkpoint'
......@@ -237,23 +295,89 @@ non_streaming_output, _ = model({**init_states, 'image': inputs})
non_streaming_prediction = tf.argmax(non_streaming_output, -1)
```
## TF Lite Example
This section outlines an example of how to export a model to run on mobile
devices with [TF Lite](https://www.tensorflow.org/lite).
First, convert to [TF SavedModel](https://www.tensorflow.org/guide/saved_model)
by running `export_saved_model.py`. For example, for `MoViNet-A0-Stream`, run:
```shell
python3 export_saved_model.py \
--model_id=a0 \
--causal=True \
--conv_type=2plus1d \
--se_type=2plus3d \
--activation=hard_swish \
--gating_activation=hard_sigmoid \
--use_positional_encoding=False \
--num_classes=600 \
--batch_size=1 \
--num_frames=1 \
--image_size=172 \
--bundle_input_init_states_fn=False \
--checkpoint_path=/path/to/checkpoint \
--export_path=/tmp/movinet_a0_stream
```
Then the SavedModel can be converted to TF Lite using the [`TFLiteConverter`](https://www.tensorflow.org/lite/convert):
```python
saved_model_dir = '/tmp/movinet_a0_stream'
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
tflite_model = converter.convert()
with open('/tmp/movinet_a0_stream.tflite', 'wb') as f:
f.write(tflite_model)
```
To run the model with TF Lite in Python, use
[tf.lite.Interpreter](https://www.tensorflow.org/lite/guide/inference#load_and_run_a_model_in_python):
```python
# Create the interpreter and signature runner
interpreter = tf.lite.Interpreter('/tmp/movinet_a0_stream.tflite')
signature = interpreter.get_signature_runner()
# Extract state names and create the initial (zero) states
def state_name(name: str) -> str:
return name[len('serving_default_'):-len(':0')]
init_states = {
state_name(x['name']): tf.zeros(x['shape'], dtype=x['dtype'])
for x in interpreter.get_input_details()
}
del init_states['image']
# Insert your video clip here
video = tf.ones([1, 8, 172, 172, 3])
clips = tf.split(video, video.shape[1], axis=1)
# To run on a video, pass in one frame at a time
states = init_states
for clip in clips:
# Input shape: [1, 1, 172, 172, 3]
outputs = signature(**states, image=clip)
logits = outputs.pop('logits')
states = outputs
```
Follow the [official guide](https://www.tensorflow.org/lite/guide) to run a
model with TF Lite on your mobile device.
## Training and Evaluation
Run this command line for continuous training and evaluation.
```shell
MODE=train_and_eval # Can also be 'train'
MODE=train_and_eval # Can also be 'train' if using a separate evaluator job
CONFIG_FILE=official/vision/beta/projects/movinet/configs/yaml/movinet_a0_k600_8x8.yaml
python3 official/vision/beta/projects/movinet/train.py \
--experiment=movinet_kinetics600 \
--mode=${MODE} \
--model_dir=/tmp/movinet/ \
--config_file=${CONFIG_FILE} \
--params_override="" \
--gin_file="" \
--gin_params="" \
--tpu="" \
--tf_data_service=""
--model_dir=/tmp/movinet_a0_base/ \
--config_file=${CONFIG_FILE}
```
Run this command line for evaluation.
......@@ -264,13 +388,8 @@ CONFIG_FILE=official/vision/beta/projects/movinet/configs/yaml/movinet_a0_k600_8
python3 official/vision/beta/projects/movinet/train.py \
--experiment=movinet_kinetics600 \
--mode=${MODE} \
--model_dir=/tmp/movinet/ \
--config_file=${CONFIG_FILE} \
--params_override="" \
--gin_file="" \
--gin_params="" \
--tpu="" \
--tf_data_service=""
--model_dir=/tmp/movinet_a0_base/ \
--config_file=${CONFIG_FILE}
```
## License
......
......@@ -130,6 +130,7 @@ class MovinetModel(video_classification.VideoClassificationModel):
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)
activation: str = 'swish'
output_states: bool = False
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a0'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a1'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
......
......@@ -15,10 +15,15 @@ task:
movinet:
model_id: 'a2'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.2
dropout_rate: 0.5
train_data:
name: kinetics600
variant_name: rgb
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a3'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a4'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a5'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
......@@ -42,7 +47,8 @@ task:
validation_data:
name: kinetics600
feature_shape: !!python/tuple
- 120
# Evaluate on 115 frames instead of 120; otherwise the model runs out of memory (OOM) on TPU
- 115
- 320
- 320
- 3
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 't0'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
......
......@@ -28,6 +28,26 @@ python3 export_saved_model.py \
--checkpoint_path=""
```
Export for TF Lite example:
```shell
python3 export_saved_model.py \
--model_id=a0 \
--causal=True \
--conv_type=2plus1d \
--se_type=2plus3d \
--activation=hard_swish \
--gating_activation=hard_sigmoid \
--use_positional_encoding=False \
--num_classes=600 \
--batch_size=1 \
--num_frames=1 \
--image_size=172 \
--bundle_input_init_states_fn=False \
--checkpoint_path=/path/to/checkpoint \
--export_path=/tmp/movinet_a0_stream
```

Use `--num_frames=1` to export the model for streaming mode (a single frame
per step) and `--image_size=172` to set the input resolution.
To use an exported saved_model, refer to export_saved_model_test.py.
"""
......@@ -79,6 +99,10 @@ flags.DEFINE_integer(
flags.DEFINE_integer(
'image_size', None,
'The resolution of the input. Set to None for dynamic input.')
flags.DEFINE_bool(
'bundle_input_init_states_fn', True,
'Add init_states as a function signature to the saved model. '
'This is not necessary if the input shape is static (e.g., for TF Lite).')
flags.DEFINE_string(
'checkpoint_path', '',
'Checkpoint path to load. Leave blank for default initialization.')
......@@ -97,24 +121,33 @@ def main(_) -> None:
# Use dimensions of 1 except the channels to export faster,
# since we only really need the last dimension to build and get the output
# states. These dimensions will be set to `None` once the model is built.
# states. These dimensions can be set to `None` once the model is built.
input_shape = [1 if s is None else s for s in input_specs.shape]
activation = FLAGS.activation
if activation == 'swish':
# Override swish activation implementation to remove custom gradients
activation = 'simple_swish'
backbone = movinet.Movinet(
FLAGS.model_id,
model_id=FLAGS.model_id,
causal=FLAGS.causal,
use_positional_encoding=FLAGS.use_positional_encoding,
conv_type=FLAGS.conv_type,
use_external_states=FLAGS.causal,
se_type=FLAGS.se_type,
input_specs=input_specs,
activation=FLAGS.activation,
activation=activation,
gating_activation=FLAGS.gating_activation,
se_type=FLAGS.se_type,
use_positional_encoding=FLAGS.use_positional_encoding)
use_sync_bn=False,
use_external_states=FLAGS.causal)
model = movinet_model.MovinetClassifier(
backbone,
num_classes=FLAGS.num_classes,
output_states=FLAGS.causal,
input_specs=dict(image=input_specs))
input_specs=dict(image=input_specs),
# TODO(dankondratyuk): currently set to swish, but will need to
# re-train to use other activations.
activation='simple_swish')
model.build(input_shape)
# Compile model to generate some internal Keras variables.
......@@ -131,7 +164,7 @@ def main(_) -> None:
# with the full output state shapes.
input_image = tf.ones(input_shape)
_, states = model({**model.init_states(input_shape), 'image': input_image})
_, states = model({**states, 'image': input_image})
_ = model({**states, 'image': input_image})
# Create a function to explicitly set the names of the outputs
def predict(inputs):
......@@ -153,7 +186,10 @@ def main(_) -> None:
init_states_fn = init_states_fn.get_concrete_function(
tf.TensorSpec([5], dtype=tf.int32))
signatures = {'call': predict_fn, 'init_states': init_states_fn}
if FLAGS.bundle_input_init_states_fn:
signatures = {'call': predict_fn, 'init_states': init_states_fn}
else:
signatures = predict_fn
tf.keras.models.save_model(
model, FLAGS.export_path, signatures=signatures)
......
......@@ -48,7 +48,7 @@ class ExportSavedModelTest(tf.test.TestCase):
example_input = tf.ones([1, 8, 172, 172, 3])
outputs = model(example_input)
self.assertEqual(outputs.shape, [1, 600])
self.assertAllEqual(outputs.shape, [1, 600])
def test_movinet_export_a0_stream_with_tfhub(self):
saved_model_path = self.get_temp_dir()
......@@ -94,9 +94,55 @@ class ExportSavedModelTest(tf.test.TestCase):
for frame in frames:
outputs, states = model({**states, 'image': frame})
self.assertEqual(outputs.shape, [1, 600])
self.assertAllEqual(outputs.shape, [1, 600])
self.assertNotEmpty(states)
self.assertAllClose(outputs, expected_outputs, 1e-5, 1e-5)
def test_movinet_export_a0_stream_with_tflite(self):
saved_model_path = self.get_temp_dir()
FLAGS.export_path = saved_model_path
FLAGS.model_id = 'a0'
FLAGS.causal = True
FLAGS.conv_type = '2plus1d'
FLAGS.se_type = '2plus3d'
FLAGS.activation = 'hard_swish'
FLAGS.gating_activation = 'hard_sigmoid'
FLAGS.use_positional_encoding = False
FLAGS.num_classes = 600
FLAGS.batch_size = 1
FLAGS.num_frames = 1
FLAGS.image_size = 172
FLAGS.bundle_input_init_states_fn = False
export_saved_model.main('unused_args')
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
tflite_model = converter.convert()
interpreter = tf.lite.Interpreter(model_content=tflite_model)
signature = interpreter.get_signature_runner()
def state_name(name: str) -> str:
return name[len('serving_default_'):-len(':0')]
init_states = {
state_name(x['name']): tf.zeros(x['shape'], dtype=x['dtype'])
for x in interpreter.get_input_details()
}
del init_states['image']
video = tf.ones([1, 8, 172, 172, 3])
clips = tf.split(video, video.shape[1], axis=1)
states = init_states
for clip in clips:
outputs = signature(**states, image=clip)
logits = outputs.pop('logits')
states = outputs
self.assertAllEqual(logits.shape, [1, 600])
self.assertNotEmpty(states)
if __name__ == '__main__':
tf.test.main()