Internal change

PiperOrigin-RevId: 428641380

Internal change
PiperOrigin-RevId: 428641380
9c8cbd0c · A. Unique TensorFlower · 8c3a1ef3 · 9c8cbd0c · 9c8cbd0c · 9c8cbd0c
Commit 9c8cbd0c authored Feb 14, 2022 by A. Unique TensorFlower
8 changed files
--- a/official/projects/s3d/configs/s3d.py
+++ b/official/projects/s3d/configs/s3d.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""S3D model configurations."""
+import dataclasses
+from typing import Text
+from official.modeling import hyperparams
+from official.vision.beta.configs import backbones_3d
+from official.vision.beta.configs import video_classification
+@dataclasses.dataclass
+class S3D(hyperparams.Config):
+  """S3D backbone config.
+  Attributes:
+    final_endpoint: Specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+      'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
+      'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e',
+      'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c'].
+    first_temporal_kernel_size: Specifies the temporal kernel size for the first
+      conv3d filter. A larger value slows down the model but provides little
+      accuracy improvement. Must be set to one of 1, 3, 5 or 7.
+    temporal_conv_start_at: Specifies the first conv block to use separable 3D
+      convs rather than 2D convs (implemented as [1, k, k] 3D conv). This is
+      used to construct the inverted pyramid models. 'Conv2d_2c_3x3' is the
+      first valid block to use separable 3D convs. If provided block name is
+      not present, all valid blocks will use separable 3D convs.
+      NOTE(review): net_utils.make_set_from_start_endpoint returns an empty
+      set for an unknown name, which looks like the opposite of the previous
+      sentence -- confirm which behavior is intended.
+    gating_start_at: Specifies the first conv block to use self gating.
+      'Conv2d_2c_3x3' is the first valid block to use self gating.
+    swap_pool_and_1x1x1: If True, in Branch_3 1x1x1 convolution is performed
+      first, then followed by max pooling. 1x1x1 convolution is used to reduce
+      the number of filters. Thus, max pooling is performed on fewer filters.
+    gating_style: Self gating can be applied after each branch and/or after each
+      inception cell. It can be one of ['BRANCH', 'CELL', 'BRANCH_AND_CELL'].
+    use_sync_bn: If True, use synchronized batch normalization.
+    norm_momentum: A `float` of normalization momentum for the moving average.
+    norm_epsilon: A `float` added to variance to avoid dividing by zero.
+    temporal_conv_type: It can be one of ['3d', '2+1d', '1+2d', '1+1+1d'] where
+      '3d' is SPATIOTEMPORAL 3d convolution, '2+1d' is SPATIAL_TEMPORAL_SEPARATE
+      with 2D convolution on the spatial dimensions followed by 1D convolution
+      on the temporal dimension, '1+2d' is TEMPORAL_SPATIAL_SEPARATE with 1D
+      convolution on the temporal dimension followed by 2D convolution on the
+      spatial dimensions, and '1+1+1d' is FULLY_SEPARATE with 1D convolutions on
+      the horizontal, vertical, and temporal dimensions, respectively.
+    depth_multiplier: Float multiplier for the depth (number of channels) for
+      all convolution ops. The value must be greater than zero. Typical usage
+      will be to set this value in (0, 1) to reduce the number of parameters or
+      computation cost of the model.
+  """
+  final_endpoint: Text = 'Mixed_5c'
+  first_temporal_kernel_size: int = 3
+  temporal_conv_start_at: Text = 'Conv2d_2c_3x3'
+  gating_start_at: Text = 'Conv2d_2c_3x3'
+  swap_pool_and_1x1x1: bool = True
+  gating_style: Text = 'CELL'
+  use_sync_bn: bool = False
+  norm_momentum: float = 0.999
+  norm_epsilon: float = 0.001
+  temporal_conv_type: Text = '2+1d'
+  depth_multiplier: float = 1.0
+@dataclasses.dataclass
+class Backbone3D(backbones_3d.Backbone3D):
+  """Configuration for backbones.
+  Attributes:
+    type: `str`, type of backbone to be used; one of the fields below.
+    s3d: s3d backbone config.
+  """
+  type: str = 's3d'
+  # Use a default_factory so each config gets its own S3D instance: a shared
+  # instance default leaks mutations between configs and is rejected as a
+  # mutable default by the dataclass machinery on Python 3.11+.
+  s3d: S3D = dataclasses.field(default_factory=S3D)
+@dataclasses.dataclass
+class S3DModel(video_classification.VideoClassificationModel):
+  """The S3D model config.
+  Attributes:
+    model_type: `str`, name of the model type; defaults to 's3d'.
+    backbone: backbone config.
+  """
+  model_type: str = 's3d'
+  # Use a default_factory so each model config owns a fresh Backbone3D: a
+  # shared instance default leaks mutations between configs and is rejected
+  # as a mutable default by the dataclass machinery on Python 3.11+.
+  backbone: Backbone3D = dataclasses.field(default_factory=Backbone3D)
--- a/official/projects/s3d/modeling/inception_utils.py
+++ b/official/projects/s3d/modeling/inception_utils.py
--- a/official/projects/s3d/modeling/inception_utils_test.py
+++ b/official/projects/s3d/modeling/inception_utils_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+from absl.testing import parameterized
+import tensorflow as tf
+from official.projects.s3d.modeling import inception_utils
+class InceptionUtilsTest(parameterized.TestCase, tf.test.TestCase):
+  """Shape and endpoint checks for the inception_utils building blocks."""
+  @parameterized.parameters(
+      (1.0, 3, {'Conv2d_1a_7x7', 'Conv2d_2c_3x3'}),
+      (0.5, 5, {'Conv2d_1a_7x7', 'Conv2d_2c_3x3'}),
+      (0.25, 7, {'Conv2d_1a_7x7', 'Conv2d_2c_3x3'}),
+  )
+  def test_s3d_stem_cells(self, depth_multiplier, first_temporal_kernel_size,
+                          temporal_conv_endpoints):
+    # Builds the stem on a symbolic video input and checks both the output
+    # shape and the set of endpoint names that the stem reports.
+    batch_size, num_frames = 1, 64
+    height = width = 224
+    video = tf.keras.layers.Input(
+        shape=(num_frames, height, width, 3), batch_size=batch_size)
+    stem_output, stem_endpoints = inception_utils.inception_v1_stem_cells(
+        video,
+        depth_multiplier,
+        'Mixed_5c',
+        temporal_conv_endpoints=temporal_conv_endpoints,
+        self_gating_endpoints={'Conv2d_2c_3x3'},
+        first_temporal_kernel_size=first_temporal_kernel_size)
+    expected_shape = [batch_size, 32, 28, 28, int(192 * depth_multiplier)]
+    self.assertListEqual(stem_output.shape.as_list(), expected_shape)
+    expected_endpoints = {
+        'Conv2d_1a_7x7',
+        'MaxPool_2a_3x3',
+        'Conv2d_2b_1x1',
+        'Conv2d_2c_3x3',
+        'MaxPool_3a_3x3',
+    }
+    self.assertSetEqual(expected_endpoints, set(stem_endpoints.keys()))
+  @parameterized.parameters(
+      ('3d', True, True, True),
+      ('2d', False, False, True),
+      ('1+2d', True, False, False),
+      ('2+1d', False, True, False),
+  )
+  def test_inception_v1_cell_endpoint_match(self, conv_type,
+                                            swap_pool_and_1x1x1,
+                                            use_self_gating_on_branch,
+                                            use_self_gating_on_cell):
+    # Runs one inception cell on a symbolic feature map and checks that the
+    # output has 256 channels with unchanged batch/time/space dimensions.
+    batch_size, num_frames, channels = 5, 32, 128
+    height = width = 28
+    features = tf.keras.layers.Input(
+        shape=(num_frames, height, width, channels), batch_size=batch_size)
+    cell = inception_utils.InceptionV1CellLayer(
+        [[64], [96, 128], [16, 32], [32]],
+        conv_type=conv_type,
+        swap_pool_and_1x1x1=swap_pool_and_1x1x1,
+        use_self_gating_on_branch=use_self_gating_on_branch,
+        use_self_gating_on_cell=use_self_gating_on_cell,
+        name='test')
+    cell_output = cell(features)
+    self.assertListEqual(cell_output.shape.as_list(),
+                         [batch_size, 32, 28, 28, 256])
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/projects/s3d/modeling/net_utils.py
+++ b/official/projects/s3d/modeling/net_utils.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Commonly used TensorFlow 2 network blocks."""
+from typing import Any, Text, Sequence, Union
+import tensorflow as tf
+# Short aliases for the Keras initializer and regularizer namespaces.
+initializers = tf.keras.initializers
+regularizers = tf.keras.regularizers
+# Maps an initializer name to a callable that takes no arguments and returns
+# the corresponding Keras initializer.
+WEIGHT_INITIALIZER = dict(
+    Xavier=tf.keras.initializers.GlorotUniform,
+    Gaussian=lambda: tf.keras.initializers.RandomNormal(stddev=0.01),
+)
+def make_set_from_start_endpoint(start_endpoint: Text,
+                                 endpoints: Sequence[Text]):
+  """Returns the endpoints from `start_endpoint` (inclusive) onwards as a set.
+  Args:
+    start_endpoint: Name of the first endpoint to include.
+    endpoints: Ordered endpoint names to take the tail of.
+  Returns:
+    A set holding `start_endpoint` and every entry after it in `endpoints`, or
+    an empty set when `start_endpoint` does not occur in `endpoints`.
+  """
+  try:
+    first = endpoints.index(start_endpoint)
+  except ValueError:
+    # Unknown starting point: nothing is selected.
+    return set()
+  return set(endpoints[first:])
+def apply_depth_multiplier(d: Union[int, Sequence[Any]],
+                           depth_multiplier: float):
+  """Scales every int in a (possibly nested) sequence by `depth_multiplier`.
+  Args:
+    d: An int, or an arbitrarily nested sequence of ints.
+    depth_multiplier: Factor applied to each int; the product is truncated
+      with `int()`.
+  Returns:
+    The scaled int when `d` is an int, otherwise a list with the same nesting
+    as `d` holding the scaled ints.
+  """
+  if not isinstance(d, int):
+    return [apply_depth_multiplier(item, depth_multiplier) for item in d]
+  return int(d * depth_multiplier)
+class ParameterizedConvLayer(tf.keras.layers.Layer):
+  """Stack of Conv3D + normalization + ReLU layers selected by `conv_type`.
+  Supported `conv_type` values are '3d', '2d', '1+2d', '2+1d' and '1+1+1d'.
+  Each convolution in the stack is followed by its own normalization layer
+  and ReLU activation.
+  """
+  def __init__(
+      self,
+      conv_type: Text,
+      kernel_size: int,
+      filters: int,
+      strides: Sequence[int],
+      rates: Sequence[int],
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.999,
+      norm_epsilon: float = 0.001,
+      temporal_conv_initializer: Union[
+          Text, initializers.Initializer] = 'glorot_uniform',
+      kernel_initializer: Union[Text,
+                                initializers.Initializer] = 'truncated_normal',
+      kernel_regularizer: Union[Text, regularizers.Regularizer] = 'l2',
+      **kwargs):
+    """Stores the layer configuration; sub-layers are created in `build`.
+    Args:
+      conv_type: One of '3d', '2d', '1+2d', '2+1d' or '1+1+1d'.
+      kernel_size: Kernel extent used on every axis a convolution spans.
+      filters: Number of output filters.
+      strides: Per-axis strides, read at indices 0, 1 and 2 (index 0 is paired
+        with the temporal convolution).
+      rates: Per-axis dilation rates, indexed like `strides`.
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: Momentum passed to the normalization layers.
+      norm_epsilon: Epsilon passed to the normalization layers.
+      temporal_conv_initializer: Kernel initializer of the temporal
+        convolution in the '1+2d' and '2+1d' variants.
+      kernel_initializer: Kernel initializer of all other convolutions.
+      kernel_regularizer: Kernel regularizer of every convolution.
+      **kwargs: Forwarded to `tf.keras.layers.Layer`.
+    """
+    super().__init__(**kwargs)
+    # Convolution settings.
+    self._conv_type = conv_type
+    self._kernel_size = kernel_size
+    self._filters = filters
+    self._strides = strides
+    self._rates = rates
+    self._temporal_conv_initializer = temporal_conv_initializer
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    # Normalization settings.
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._norm = (
+        tf.keras.layers.experimental.SyncBatchNormalization
+        if use_sync_bn else tf.keras.layers.BatchNormalization)
+    channels_last = tf.keras.backend.image_data_format() == 'channels_last'
+    self._channel_axis = -1 if channels_last else 1
+  def _build_conv_layer_params(self, input_shape):
+    """Returns one Conv3D kwargs dict per convolution, in application order.
+    Args:
+      input_shape: Shape of the layer input; only read for '1+2d', where the
+        temporal convolution keeps the input channel count.
+    Returns:
+      A list of dicts with `filters`, `kernel_size`, `strides`,
+      `dilation_rate` and `kernel_initializer` entries.
+    Raises:
+      ValueError: If the configured `conv_type` is not supported.
+    """
+    size = self._kernel_size
+    strides = self._strides
+    rates = self._rates
+    def axis_conv(filters, active_axes, initializer):
+      # Params of a conv that only spans `active_axes`; every other axis gets
+      # kernel size, stride and dilation of 1.
+      return dict(
+          filters=filters,
+          kernel_size=[size if a in active_axes else 1 for a in range(3)],
+          strides=[strides[a] if a in active_axes else 1 for a in range(3)],
+          dilation_rate=[rates[a] if a in active_axes else 1 for a in range(3)],
+          kernel_initializer=initializer)
+    temporal_axes, spatial_axes = (0,), (1, 2)
+    conv_type = self._conv_type
+    if conv_type == '3d':
+      # Full 3D conv: strides and rates are passed through untouched.
+      return [
+          dict(
+              filters=self._filters,
+              kernel_size=[size] * 3,
+              strides=strides,
+              dilation_rate=rates,
+              kernel_initializer=self._kernel_initializer)
+      ]
+    if conv_type == '2d':
+      return [
+          axis_conv(self._filters, spatial_axes, self._kernel_initializer)
+      ]
+    if conv_type == '1+2d':
+      # The leading temporal conv preserves the number of input channels.
+      channels_in = input_shape[self._channel_axis]
+      return [
+          axis_conv(channels_in, temporal_axes,
+                    self._temporal_conv_initializer),
+          axis_conv(self._filters, spatial_axes, self._kernel_initializer),
+      ]
+    if conv_type == '2+1d':
+      return [
+          axis_conv(self._filters, spatial_axes, self._kernel_initializer),
+          axis_conv(self._filters, temporal_axes,
+                    self._temporal_conv_initializer),
+      ]
+    if conv_type == '1+1+1d':
+      # One conv per axis (last axis first); all three use kernel_initializer.
+      return [
+          axis_conv(self._filters, (axis,), self._kernel_initializer)
+          for axis in (2, 1, 0)
+      ]
+    raise ValueError('Unsupported conv_type: {}'.format(self._conv_type))
+  def _build_norm_layer_params(self, conv_param):
+    """Returns kwargs for the norm layer after a conv (`conv_param` unused)."""
+    return {
+        'axis': self._channel_axis,
+        'momentum': self._norm_momentum,
+        'epsilon': self._norm_epsilon,
+        'scale': False,
+        'gamma_initializer': 'ones',
+    }
+  def _build_activation_layer_params(self, conv_param):
+    """Returns extra kwargs for the activation after a conv (none here)."""
+    return dict()
+  def _append_conv_layer(self, param):
+    """Appends a Conv3D built from `param`, then a norm and a ReLU layer."""
+    conv = tf.keras.layers.Conv3D(
+        padding='same',
+        use_bias=False,
+        kernel_regularizer=self._kernel_regularizer,
+        **param,
+    )
+    self._parameterized_conv_layers.append(conv)
+    norm = self._norm(**self._build_norm_layer_params(param))
+    self._parameterized_conv_layers.append(norm)
+    relu = tf.keras.layers.Activation(
+        'relu', **self._build_activation_layer_params(param))
+    self._parameterized_conv_layers.append(relu)
+  def build(self, input_shape):
+    """Creates the conv/norm/activation sub-layers for `input_shape`."""
+    self._parameterized_conv_layers = []
+    for params in self._build_conv_layer_params(input_shape):
+      self._append_conv_layer(params)
+    super().build(input_shape)
+  def call(self, inputs):
+    """Applies the sub-layers to `inputs` in creation order."""
+    outputs = inputs
+    for sub_layer in self._parameterized_conv_layers:
+      outputs = sub_layer(outputs)
+    return outputs
--- a/official/projects/s3d/modeling/net_utils_test.py
+++ b/official/projects/s3d/modeling/net_utils_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+from absl import logging
+from absl.testing import parameterized
+import tensorflow as tf
+from official.projects.s3d.modeling import net_utils
+class Tf2NetUtilsTest(parameterized.TestCase, tf.test.TestCase):
+  """Output-shape checks for net_utils.ParameterizedConvLayer."""
+  @parameterized.parameters(
+      ('3d', [2, 1, 1], [5, 16, 28, 28, 256]),
+      ('3d', [2, 2, 2], [5, 16, 14, 14, 256]),
+      ('3d', [1, 2, 1], [5, 32, 14, 28, 256]),
+      ('2d', [2, 2, 2], [5, 32, 14, 14, 256]),
+      ('2d', [1, 1, 2], [5, 32, 28, 14, 256]),
+      ('1+2d', [2, 2, 2], [5, 16, 14, 14, 256]),
+      ('1+2d', [2, 1, 1], [5, 16, 28, 28, 256]),
+      ('1+2d', [1, 1, 1], [5, 32, 28, 28, 256]),
+      ('1+2d', [1, 1, 2], [5, 32, 28, 14, 256]),
+      ('2+1d', [2, 2, 2], [5, 16, 14, 14, 256]),
+      ('2+1d', [1, 1, 1], [5, 32, 28, 28, 256]),
+      ('2+1d', [2, 1, 2], [5, 16, 28, 14, 256]),
+      ('1+1+1d', [2, 2, 2], [5, 16, 14, 14, 256]),
+      ('1+1+1d', [1, 1, 1], [5, 32, 28, 28, 256]),
+      ('1+1+1d', [2, 1, 2], [5, 16, 28, 14, 256]),
+  )
+  def test_parameterized_conv_layer_creation(self, conv_type, strides,
+                                             expected_shape):
+    # Applies the layer to a symbolic [5, 32, 28, 28, 128] input and compares
+    # the resulting static shape with the expectation for the given strides.
+    batch_size, temporal_size, spatial_size, channels = 5, 32, 28, 128
+    kernel_size, filters = 3, 256
+    rates = [1, 1, 1]
+    layer_name = 'ParameterizedConv'
+    video = tf.keras.Input(
+        shape=(temporal_size, spatial_size, spatial_size, channels),
+        batch_size=batch_size)
+    conv_layer = net_utils.ParameterizedConvLayer(
+        conv_type, kernel_size, filters, strides, rates, name=layer_name)
+    features = conv_layer(video)
+    output_shape = features.shape.as_list()
+    logging.info(output_shape)
+    logging.info([weight.name for weight in conv_layer.weights])
+    self.assertAllEqual(output_shape, expected_shape)
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/projects/s3d/modeling/s3d.py
+++ b/official/projects/s3d/modeling/s3d.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Contains the Tensorflow 2 version definition of S3D model.
+S3D model is described in the following paper:
+https://arxiv.org/abs/1712.04851.
+"""
+from typing import Any, Dict, Mapping, Optional, Sequence, Text, Tuple, Union
+import tensorflow as tf
+from official.modeling import hyperparams
+from official.projects.s3d.configs import s3d as cfg
+from official.projects.s3d.modeling import inception_utils
+from official.projects.s3d.modeling import net_utils
+from official.vision.beta.modeling import factory_3d as model_factory
+from official.vision.beta.modeling.backbones import factory as backbone_factory
+# Short aliases for the Keras initializer and regularizer namespaces, used in
+# the type annotations and default arguments below.
+initializers = tf.keras.initializers
+regularizers = tf.keras.regularizers
+class S3D(tf.keras.Model):
+  """Class to build S3D family model.
+  The model is a Keras functional model whose output is a dict that maps each
+  constructed endpoint name to its tensor.
+  """
+  def __init__(self,
+               input_specs: tf.keras.layers.InputSpec,
+               final_endpoint: Text = 'Mixed_5c',
+               first_temporal_kernel_size: int = 3,
+               temporal_conv_start_at: Text = 'Conv2d_2c_3x3',
+               gating_start_at: Text = 'Conv2d_2c_3x3',
+               swap_pool_and_1x1x1: bool = True,
+               gating_style: Text = 'CELL',
+               use_sync_bn: bool = False,
+               norm_momentum: float = 0.999,
+               norm_epsilon: float = 0.001,
+               temporal_conv_initializer: Union[
+                   Text,
+                   initializers.Initializer] = initializers.TruncatedNormal(
+                       mean=0.0, stddev=0.01),
+               temporal_conv_type: Text = '2+1d',
+               kernel_initializer: Union[
+                   Text,
+                   initializers.Initializer] = initializers.TruncatedNormal(
+                       mean=0.0, stddev=0.01),
+               kernel_regularizer: Union[Text, regularizers.Regularizer] = 'l2',
+               depth_multiplier: float = 1.0,
+               **kwargs):
+    """Constructor.
+    Args:
+      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
+      final_endpoint: Specifies the endpoint to construct the network up to.
+      first_temporal_kernel_size: Temporal kernel size of the first convolution
+        layer.
+      temporal_conv_start_at: Specifies the endpoint where to start performing
+        temporal convolution from.
+      gating_start_at: Specifies the endpoint where to start performing self
+        gating from.
+      swap_pool_and_1x1x1: A boolean flag that indicates whether to swap the
+        order of convolution and max pooling in Branch_3 of inception v1 cell.
+      gating_style: A string that specifies self gating to be applied after each
+        branch and/or after each cell. It can be one of ['BRANCH', 'CELL',
+        'BRANCH_AND_CELL'].
+      use_sync_bn: If True, use synchronized batch normalization.
+      norm_momentum: A `float` of normalization momentum for the moving average.
+      norm_epsilon: A `float` added to variance to avoid dividing by zero.
+      temporal_conv_initializer: Weight initializer for temporal convolutional
+        layers.
+      temporal_conv_type: The type of parameterized convolution used for the
+        endpoints that perform temporal convolution. net_utils'
+        ParameterizedConvLayer accepts '2d', '3d', '2+1d', '1+2d' and
+        '1+1+1d'.
+      kernel_initializer: Weight initializer for convolutional layers other than
+        temporal convolution.
+      kernel_regularizer: Weight regularizer for all convolutional layers.
+      depth_multiplier: A float to reduce/increase number of channels.
+      **kwargs: keyword arguments to be passed.
+    Raises:
+      ValueError: If `final_endpoint` is not among the constructed endpoints.
+    """
+    self._input_specs = input_specs
+    self._final_endpoint = final_endpoint
+    self._first_temporal_kernel_size = first_temporal_kernel_size
+    self._temporal_conv_start_at = temporal_conv_start_at
+    self._gating_start_at = gating_start_at
+    self._swap_pool_and_1x1x1 = swap_pool_and_1x1x1
+    self._gating_style = gating_style
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._temporal_conv_initializer = temporal_conv_initializer
+    self._temporal_conv_type = temporal_conv_type
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._depth_multiplier = depth_multiplier
+    # Endpoints from the given start names onwards; both sets are empty when
+    # the start name is not a known conv endpoint.
+    self._temporal_conv_endpoints = net_utils.make_set_from_start_endpoint(
+        temporal_conv_start_at, inception_utils.INCEPTION_V1_CONV_ENDPOINTS)
+    self._self_gating_endpoints = net_utils.make_set_from_start_endpoint(
+        gating_start_at, inception_utils.INCEPTION_V1_CONV_ENDPOINTS)
+    inputs = tf.keras.Input(shape=input_specs.shape[1:])
+    net, end_points = inception_utils.inception_v1_stem_cells(
+        inputs,
+        depth_multiplier,
+        final_endpoint,
+        temporal_conv_endpoints=self._temporal_conv_endpoints,
+        self_gating_endpoints=self._self_gating_endpoints,
+        temporal_conv_type=self._temporal_conv_type,
+        first_temporal_kernel_size=self._first_temporal_kernel_size,
+        use_sync_bn=self._use_sync_bn,
+        norm_momentum=self._norm_momentum,
+        norm_epsilon=self._norm_epsilon,
+        temporal_conv_initializer=self._temporal_conv_initializer,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        parameterized_conv_layer=self._get_parameterized_conv_layer_impl(),
+        layer_naming_fn=self._get_layer_naming_fn(),
+    )
+    # Stack the remaining cells, stopping once the final endpoint is built.
+    for end_point, filters in inception_utils.INCEPTION_V1_ARCH_SKELETON:
+      net, end_points = self._s3d_cell(net, end_point, end_points, filters)
+      if end_point == final_endpoint:
+        break
+    if final_endpoint not in end_points:
+      raise ValueError(
+          'Unrecognized final endpoint %s (available endpoints: %s).' %
+          (final_endpoint, end_points.keys()))
+    # Backs the `output_specs` property, which would otherwise raise
+    # AttributeError because nothing else assigns this attribute.
+    self._output_specs = {l: end_points[l].get_shape() for l in end_points}
+    super(S3D, self).__init__(inputs=inputs, outputs=end_points, **kwargs)
+  def _s3d_cell(
+      self,
+      net: tf.Tensor,
+      end_point: Text,
+      end_points: Dict[Text, tf.Tensor],
+      filters: Union[int, Sequence[Any]],
+      non_local_block: Optional[tf.keras.layers.Layer] = None,
+      attention_cell: Optional[tf.keras.layers.Layer] = None,
+      attention_cell_super_graph: Optional[tf.keras.layers.Layer] = None
+  ) -> Tuple[tf.Tensor, Dict[Text, tf.Tensor]]:
+    """Adds one inception cell or max-pool layer on top of `net`.
+    Args:
+      net: Input tensor of the cell.
+      end_point: Name of the cell; names starting with 'Mixed' build an
+        inception cell, all others build a `MaxPool3D` layer.
+      end_points: Dict of endpoint tensors; updated in place with the new one.
+      filters: Branch filters for an inception cell, or a
+        (pool_size, strides) pair for a max-pool layer.
+      non_local_block: Not supported yet; must be falsy.
+      attention_cell: Not supported yet; must be falsy.
+      attention_cell_super_graph: Not supported yet; must be falsy.
+    Returns:
+      A tuple of the cell output tensor and the updated `end_points` dict.
+    Raises:
+      NotImplementedError: If any of the unsupported blocks is provided.
+    """
+    if end_point.startswith('Mixed'):
+      # Cells before the temporal-conv start point fall back to 2D convs.
+      conv_type = (
+          self._temporal_conv_type
+          if end_point in self._temporal_conv_endpoints else '2d')
+      use_self_gating_on_branch = (
+          end_point in self._self_gating_endpoints and
+          (self._gating_style == 'BRANCH' or
+           self._gating_style == 'BRANCH_AND_CELL'))
+      use_self_gating_on_cell = (
+          end_point in self._self_gating_endpoints and
+          (self._gating_style == 'CELL' or
+           self._gating_style == 'BRANCH_AND_CELL'))
+      net = self._get_inception_v1_cell_layer_impl()(
+          branch_filters=net_utils.apply_depth_multiplier(
+              filters, self._depth_multiplier),
+          conv_type=conv_type,
+          temporal_dilation_rate=1,
+          swap_pool_and_1x1x1=self._swap_pool_and_1x1x1,
+          use_self_gating_on_branch=use_self_gating_on_branch,
+          use_self_gating_on_cell=use_self_gating_on_cell,
+          use_sync_bn=self._use_sync_bn,
+          norm_momentum=self._norm_momentum,
+          norm_epsilon=self._norm_epsilon,
+          kernel_initializer=self._kernel_initializer,
+          temporal_conv_initializer=self._temporal_conv_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          name=self._get_layer_naming_fn()(end_point))(
+              net)
+    else:
+      net = tf.keras.layers.MaxPool3D(
+          pool_size=filters[0],
+          strides=filters[1],
+          padding='same',
+          name=self._get_layer_naming_fn()(end_point))(
+              net)
+    end_points[end_point] = net
+    if non_local_block:
+      # TODO(b/182299420): Implement non local block in TF2.
+      raise NotImplementedError('Non local block is not implemented yet.')
+    if attention_cell:
+      # TODO(b/182299420): Implement attention cell in TF2.
+      raise NotImplementedError('Attention cell is not implemented yet.')
+    if attention_cell_super_graph:
+      # TODO(b/182299420): Implement attention cell super graph in TF2.
+      raise NotImplementedError('Attention cell super graph is not implemented'
+                                ' yet.')
+    return net, end_points
+  def get_config(self):
+    """Returns the constructor arguments (except `**kwargs`) as a dict."""
+    config_dict = {
+        'input_specs': self._input_specs,
+        'final_endpoint': self._final_endpoint,
+        'first_temporal_kernel_size': self._first_temporal_kernel_size,
+        'temporal_conv_start_at': self._temporal_conv_start_at,
+        'gating_start_at': self._gating_start_at,
+        'swap_pool_and_1x1x1': self._swap_pool_and_1x1x1,
+        'gating_style': self._gating_style,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'temporal_conv_initializer': self._temporal_conv_initializer,
+        'temporal_conv_type': self._temporal_conv_type,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'depth_multiplier': self._depth_multiplier
+    }
+    return config_dict
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds the model from a `get_config` dict (`custom_objects` unused)."""
+    return cls(**config)
+  @property
+  def output_specs(self):
+    """A dict of {endpoint name: TensorShape} pairs for the model output."""
+    return self._output_specs
+  def _get_inception_v1_cell_layer_impl(self):
+    """Returns the layer class used to build 'Mixed' inception cells."""
+    return inception_utils.InceptionV1CellLayer
+  def _get_parameterized_conv_layer_impl(self):
+    """Returns the parameterized conv layer class passed to the stem."""
+    return net_utils.ParameterizedConvLayer
+  def _get_layer_naming_fn(self):
+    """Returns a function mapping an endpoint to a layer name (always None)."""
+    return lambda end_point: None
+class S3DModel(tf.keras.Model):
+  """Video classifier: S3D backbone, global average pool, dropout, dense."""
+  def __init__(self,
+               backbone: tf.keras.Model,
+               num_classes: int,
+               input_specs: Mapping[Text, tf.keras.layers.InputSpec],
+               final_endpoint: Text = 'Mixed_5c',
+               dropout_rate: float = 0.0,
+               **kwargs):
+    """Constructor.
+    Args:
+      backbone: S3D backbone Keras Model.
+      num_classes: `int` number of possible classes for video classification.
+      input_specs: Mapping from input name to the
+        `tf.keras.layers.InputSpec` of that input tensor; the 'image' entry is
+        fed to the backbone.
+      final_endpoint: Name of the backbone output that is pooled and
+        classified.
+      dropout_rate: `float` between 0 and 1. Fraction of the input units to
+        drop. Note that dropout_rate = 1.0 - dropout_keep_prob.
+      **kwargs: keyword arguments to be passed.
+    """
+    # NOTE(review): presumably switches off Keras attribute tracking for the
+    # assignments below -- confirm against the Keras version in use.
+    self._self_setattr_tracking = False
+    self._backbone = backbone
+    self._num_classes = num_classes
+    self._input_specs = input_specs
+    self._final_endpoint = final_endpoint
+    self._dropout_rate = dropout_rate
+    self._config_dict = dict(
+        backbone=backbone,
+        num_classes=num_classes,
+        input_specs=input_specs,
+        final_endpoint=final_endpoint,
+        dropout_rate=dropout_rate,
+    )
+    inputs = {
+        name: tf.keras.Input(shape=spec.shape[1:])
+        for name, spec in input_specs.items()
+    }
+    endpoint_features = self._backbone(inputs['image'])[self._final_endpoint]
+    # Average over axes 1-3, leaving one feature vector per example.
+    pooled = tf.math.reduce_mean(endpoint_features, axis=[1, 2, 3])
+    dropped = tf.keras.layers.Dropout(dropout_rate)(pooled)
+    classifier = tf.keras.layers.Dense(**self._build_dense_layer_params())
+    logits = classifier(dropped)
+    super().__init__(inputs=inputs, outputs=logits, **kwargs)
+  @property
+  def checkpoint_items(self):
+    """Returns a dictionary of items to be additionally checkpointed."""
+    return {'backbone': self.backbone}
+  @property
+  def backbone(self):
+    """The backbone model passed to the constructor."""
+    return self._backbone
+  def get_config(self):
+    """Returns the constructor arguments (except `**kwargs`) as a dict."""
+    return self._config_dict
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Builds the model from a `get_config` dict (`custom_objects` unused)."""
+    return cls(**config)
+  def _build_dense_layer_params(self):
+    """Returns the kwargs of the final classification `Dense` layer."""
+    return {'units': self._num_classes, 'kernel_regularizer': 'l2'}
+@backbone_factory.register_backbone_builder('s3d')
+def build_s3d(
+    input_specs: tf.keras.layers.InputSpec,
+    backbone_config: hyperparams.Config,
+    norm_activation_config: hyperparams.Config,
+    l2_regularizer: tf.keras.regularizers.Regularizer = None
+) -> tf.keras.Model:  # pytype: disable=annotation-type-mismatch  # typed-keras
+  """Builds the S3D backbone from its config.
+  Args:
+    input_specs: `tf.keras.layers.InputSpec` of the backbone input.
+    backbone_config: Backbone config whose `type` must be 's3d'; the S3D
+      settings are read from `backbone_config.get()`.
+    norm_activation_config: Unused; accepted to match the builder signature.
+    l2_regularizer: Kernel regularizer handed to the backbone.
+  Returns:
+    The constructed `S3D` model.
+  """
+  backbone_type = backbone_config.type
+  s3d_cfg = backbone_config.get()
+  assert backbone_type == 's3d'
+  del norm_activation_config
+  # Config fields that map one-to-one onto S3D constructor arguments.
+  forwarded_fields = (
+      'final_endpoint',
+      'first_temporal_kernel_size',
+      'temporal_conv_start_at',
+      'gating_start_at',
+      'swap_pool_and_1x1x1',
+      'gating_style',
+      'use_sync_bn',
+      'norm_momentum',
+      'norm_epsilon',
+      'temporal_conv_type',
+      'depth_multiplier',
+  )
+  s3d_kwargs = {field: getattr(s3d_cfg, field) for field in forwarded_fields}
+  return S3D(
+      input_specs=input_specs,
+      kernel_regularizer=l2_regularizer,
+      **s3d_kwargs)
+@model_factory.register_model_builder('s3d')
+def build_s3d_model(
+    input_specs: tf.keras.layers.InputSpec,
+    model_config: cfg.S3DModel,
+    num_classes: int,
+    l2_regularizer: tf.keras.regularizers.Regularizer = None
+) -> tf.keras.Model:  # pytype: disable=annotation-type-mismatch  # typed-keras
+  """Builds S3D model with classification layer."""
+  input_specs_dict = {'image': input_specs}
+  backbone = build_s3d(input_specs, model_config.backbone,
+                       model_config.norm_activation, l2_regularizer)
+  model = S3DModel(
+      backbone,
+      num_classes=num_classes,
+      input_specs=input_specs_dict,
+      dropout_rate=model_config.dropout_rate)
+  return model
--- a/official/projects/s3d/modeling/s3d_test.py
+++ b/official/projects/s3d/modeling/s3d_test.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Tests for S3D model."""
+from absl.testing import parameterized
+import tensorflow as tf
+from official.projects.s3d.modeling import s3d
+class S3dTest(parameterized.TestCase, tf.test.TestCase):
+  @parameterized.parameters(
+      (7, 224, 224, 3),
+      (7, 128, 128, 3),
+      (7, 256, 256, 3),
+      (7, 192, 192, 3),
+      (64, 224, 224, 3),
+      (32, 224, 224, 3),
+      (64, 224, 224, 11),
+      (32, 224, 224, 11),
+  )
+  def test_build(self, num_frames, height, width, first_temporal_kernel_size):
+    batch_size = 5
+    input_shape = [batch_size, num_frames, height, width, 3]
+    input_specs = tf.keras.layers.InputSpec(shape=input_shape)
+    network = s3d.S3D(
+        input_specs=input_specs
+    )
+    inputs = tf.keras.Input(shape=input_shape[1:], batch_size=input_shape[0])
+    endpoints = network(inputs)
+    temporal_1a = (num_frames - 1)//2 + 1
+    expected_shapes = {
+        'Conv2d_1a_7x7': [5, temporal_1a, height//2, width//2, 64],
+        'Conv2d_2b_1x1': [5, temporal_1a, height//4, width//4, 64],
+        'Conv2d_2c_3x3': [5, temporal_1a, height//4, height//4, 192],
+        'MaxPool_2a_3x3': [5, temporal_1a, height//4, height//4, 64],
+        'MaxPool_3a_3x3': [5, temporal_1a, height//8, width//8, 192],
+        'Mixed_3b': [5, temporal_1a, height//8, width//8, 256],
+        'Mixed_3c': [5, temporal_1a, height//8, width//8, 480],
+        'MaxPool_4a_3x3': [5, temporal_1a//2, height//16, width//16, 480],
+        'Mixed_4b': [5, temporal_1a//2, height//16, width//16, 512],
+        'Mixed_4c': [5, temporal_1a//2, height//16, width//16, 512],
+        'Mixed_4d': [5, temporal_1a//2, height//16, width//16, 512],
+        'Mixed_4e': [5, temporal_1a//2, height//16, width//16, 528],
+        'Mixed_4f': [5, temporal_1a//2, height//16, width//16, 832],
+        'MaxPool_5a_2x2': [5, temporal_1a//4, height//32, width//32, 832],
+        'Mixed_5b': [5, temporal_1a//4, height//32, width//32, 832],
+        'Mixed_5c': [5, temporal_1a//4, height//32, width//32, 1024],
+    }
+    output_shapes = dict()
+    for end_point, output_tensor in endpoints.items():
+      output_shapes[end_point] = output_tensor.shape.as_list()
+    self.assertDictEqual(output_shapes, expected_shapes)
+  def test_serialize_deserialize(self):
+    # Create a network object that sets all of its config options.
+    kwargs = dict(
+        input_specs=tf.keras.layers.InputSpec(shape=(5, 64, 224, 224, 3)),
+        final_endpoint='Mixed_5c',
+        first_temporal_kernel_size=3,
+        temporal_conv_start_at='Conv2d_2c_3x3',
+        gating_start_at='Conv2d_2c_3x3',
+        swap_pool_and_1x1x1=True,
+        gating_style='CELL',
+        use_sync_bn=False,
+        norm_momentum=0.999,
+        norm_epsilon=0.001,
+        temporal_conv_initializer=tf.keras.initializers.TruncatedNormal(
+            mean=0.0, stddev=0.01),
+        temporal_conv_type='2+1d',
+        kernel_initializer='truncated_normal',
+        kernel_regularizer='l2',
+        depth_multiplier=1.0
+    )
+    network = s3d.S3D(**kwargs)
+    expected_config = dict(kwargs)
+    self.assertEqual(network.get_config(), expected_config)
+    # Create another network object from the first object's config.
+    new_network = s3d.S3D.from_config(network.get_config())
+    # Validate that the config can be forced to JSON.
+    _ = new_network.to_json()
+    # If the serialization was successful, the new config should match the old.
+    self.assertAllEqual(network.get_config(), new_network.get_config())
+if __name__ == '__main__':
+  # Hand control to the TensorFlow test runner when executed as a script.
+  tf.test.main()
--- a/official/projects/s3d/train.py
+++ b/official/projects/s3d/train.py
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""TensorFlow Model Garden Vision training driver for S3D."""
+from absl import app
+# pylint: disable=unused-import
+from official.common import registry_imports
+# pylint: enable=unused-import
+from official.common import flags as tfm_flags
+# pylint: disable=unused-import
+from official.projects.s3d.configs.google import s3d as s3d_config
+from official.projects.s3d.modeling import s3d
+from official.projects.s3d.tasks.google import automl_video_classification
+# pylint: enable=unused-import
+from official.vision.beta import train
+if __name__ == '__main__':
+  # Define the flags from `official.common.flags`, then let absl run the
+  # `main` function of `official.vision.beta.train`.
+  tfm_flags.define_flags()
+  app.run(train.main)