"...resnet50_tensorflow.git" did not exist on "c42ce18ce500893f2aa0bf08828a9d0f8083fbae"
Unverified commit 7479dbb8 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 8b60a5a8 9c8cbd0c
......@@ -61,7 +61,7 @@ class CosineLearningRateWithLinearWarmup(
"""Class to generate learning rate tensor."""
def __init__(self, total_steps, params):
"""Creates the consine learning rate tensor with linear warmup."""
"""Creates the cosine learning rate tensor with linear warmup."""
super(CosineLearningRateWithLinearWarmup, self).__init__()
self._total_steps = total_steps
assert isinstance(params, (dict, params_dict.ParamsDict))
......
......@@ -78,7 +78,7 @@ class CosineDecayWithWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Class to generate learning rate tensor."""
def __init__(self, batch_size: int, total_steps: int, warmup_steps: int):
"""Creates the consine learning rate tensor with linear warmup.
"""Creates the cosine learning rate tensor with linear warmup.
Args:
batch_size: The training batch size used in the experiment.
......
......@@ -216,14 +216,14 @@ class StepCosineLrConfig(base_config.Config):
"""Configuration for stepwise learning rate decay.
This class is a container for the piecewise cosine learning rate scheduling
configs. It will configure an instance of StepConsineDecayWithOffset keras
configs. It will configure an instance of StepCosineDecayWithOffset keras
learning rate schedule.
```python
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepConsineDecayWithOffset(
lr_schedule.StepCosineDecayWithOffset(
boundaries,
values))
```
......@@ -243,7 +243,7 @@ class StepCosineLrConfig(base_config.Config):
[boundaries[n], end] -> values[n+1] to 0.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'StepConsineDecayWithOffset'
name: str = 'StepCosineDecayWithOffset'
boundaries: Optional[List[int]] = None
values: Optional[List[float]] = None
offset: int = 0
......
......@@ -386,11 +386,11 @@ class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
}
class StepConsineDecayWithOffset(
class StepCosineDecayWithOffset(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Stepwise cosine learning rate decay with offset.
Learning rate is equivalent to one or more consine decay(s) starting and
Learning rate is equivalent to one or more cosine decay(s) starting and
ending at each interval.
Example:
......@@ -399,7 +399,7 @@ class StepConsineDecayWithOffset(
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepConsineDecayWithOffset(
lr_schedule.StepCosineDecayWithOffset(
boundaries,
values))
```
......@@ -412,7 +412,7 @@ class StepConsineDecayWithOffset(
boundaries,
values,
offset: int = 0,
name: str = "StepConsineDecayWithOffset"):
name: str = "StepCosineDecayWithOffset"):
"""Initialize configuration of the learning rate schedule.
Args:
......@@ -444,7 +444,7 @@ class StepConsineDecayWithOffset(
] + [0])
def __call__(self, global_step):
with tf.name_scope(self.name or "StepConsineDecayWithOffset"):
with tf.name_scope(self.name or "StepCosineDecayWithOffset"):
global_step = tf.cast(global_step - self.offset, tf.float32)
lr_levels = self.values
lr_steps = self.boundaries
......
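For intuition, here is a minimal, self-contained sketch of the schedule the docstrings above describe. It follows one plausible reading of the boundaries/values example (each interval runs its own cosine decay toward the next value, and the final interval decays toward 0); the actual StepCosineDecayWithOffset implementation may differ in edge cases.
```python
import math

def step_cosine_lr(step, boundaries, values, total_steps, offset=0):
  """Illustrative stepwise cosine decay (assumed semantics, not library code)."""
  step = max(step - offset, 0)
  if step >= total_steps:
    return 0.0
  starts = list(boundaries)
  ends = list(boundaries[1:]) + [total_steps]
  targets = list(values[1:]) + [0.0]
  lr = values[0]  # Hold the first value until the first boundary is reached.
  for start, end, value, target in zip(starts, ends, values, targets):
    if start <= step < end:
      progress = (step - start) / float(end - start)
      cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
      lr = target + (value - target) * cosine
  return lr

# With the docstring's example, boundaries=[100000, 110000] and values=[1.0, 0.5]:
# the rate is 1.0 at step 100000, reaches 0.5 by step 110000, and the final
# interval then decays from 0.5 toward 0 by total_steps.
print(step_cosine_lr(105000, [100000, 110000], [1.0, 0.5], total_steps=120000))  # 0.75
```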
......@@ -47,7 +47,7 @@ LR_CLS = {
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
'power_with_offset': lr_schedule.PowerDecayWithOffset,
'step_cosine_with_offset': lr_schedule.StepConsineDecayWithOffset,
'step_cosine_with_offset': lr_schedule.StepCosineDecayWithOffset,
}
WARMUP_CLS = {
......
......@@ -33,7 +33,6 @@ from official.nlp.tools import tokenization
FLAGS = flags.FLAGS
# TODO(chendouble): consider moving each task to its own binary.
flags.DEFINE_enum(
"fine_tuning_task_type", "classification",
["classification", "regression", "squad", "retrieval", "tagging"],
......
......@@ -1086,12 +1086,17 @@ class Encoder(Module):
self.output_dropout = Dropout(self.config.dropout_rate,)
@tf.Module.with_name_scope
def __call__(self, inputs, encoder_mask=None, training=False):
def __call__(self,
inputs,
encoder_mask=None,
dense_inputs=None,
training=False):
"""Applies Transformer model on the inputs.
Args:
inputs: input data
encoder_mask: the encoder self-attention mask.
dense_inputs: dense input data, concat after the embedding.
training: whether it is training pass, affecting dropouts.
Returns:
......@@ -1102,11 +1107,20 @@ class Encoder(Module):
encoder_mask = tf.cast(encoder_mask, self.compute_dtype)
cfg = self.config
x = self.input_embed(inputs, one_hot=cfg.one_hot_embedding)
if dense_inputs is not None:
x = tf.concat([x, dense_inputs], axis=1)
tensor_shape = tf_utils.get_shape_list(x)
tensor_shape[-2] = 1
x = self.input_dropout(x, noise_shape=tensor_shape, training=training)
input_length = tf_utils.get_shape_list(inputs)[1]
position_bias = self.relative_embedding(input_length, input_length)
if dense_inputs is not None:
# Here we ignore relative position bias for dense embeddings.
dense_input_length = tf_utils.get_shape_list(dense_inputs)[1]
# Position bias shape: [batch, 1, len, len]
paddings = tf.constant([[0, 0], [0, 0], [0, dense_input_length],
[0, dense_input_length]])
position_bias = tf.pad(position_bias, paddings, "CONSTANT")
for i in range(cfg.num_layers):
x = self.encoder_layers[i](
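A tiny standalone illustration of the position-bias padding step in this hunk (shapes are invented for the example): the relative position bias covers only the token positions, so the dense positions appended via tf.concat receive zero bias along both the query and key axes.
```python
import tensorflow as tf

position_bias = tf.random.normal([2, 1, 8, 8])  # [batch, 1, len, len] for 8 tokens
dense_input_length = 2                          # two dense vectors appended
paddings = tf.constant([[0, 0], [0, 0], [0, dense_input_length],
                        [0, dense_input_length]])
padded_bias = tf.pad(position_bias, paddings, "CONSTANT")  # zeros for dense slots
print(padded_bias.shape)  # (2, 1, 10, 10)
```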
......@@ -1308,31 +1322,56 @@ class T5Transformer(Module):
def encode(self,
encoder_input_tokens,
encoder_segment_ids=None,
encoder_dense_inputs=None,
encoder_dense_segment_ids=None,
training=False):
eligible_positions = tf.cast(
tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
if encoder_dense_inputs is not None:
eligible_dense_position = tf.cast(
tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1),
self.compute_dtype)
eligible_positions = tf.concat(
[eligible_positions, eligible_dense_position], axis=1)
encoder_mask = make_attention_mask(
eligible_positions, eligible_positions, dtype=tf.bool)
if encoder_segment_ids is not None:
if encoder_dense_segment_ids is not None:
encoder_segment_ids = tf.concat(
[encoder_segment_ids, encoder_dense_segment_ids], axis=1)
segment_mask = make_attention_mask(
encoder_segment_ids, encoder_segment_ids, tf.equal, dtype=tf.bool)
encoder_mask = tf.math.logical_and(encoder_mask, segment_mask)
encoder_mask = (1.0 - tf.cast(encoder_mask, self.compute_dtype)) * -1e9
return self.encoder(encoder_input_tokens, encoder_mask, training=training)
return self.encoder(
encoder_input_tokens,
encoder_mask,
encoder_dense_inputs,
training=training)
def decode(
self,
encoded,
decoder_target_tokens,
encoder_input_tokens, # only used for masks
encoder_dense_inputs=None,
decoder_input_tokens=None,
encoder_segment_ids=None,
encoder_dense_segment_ids=None,
decoder_segment_ids=None,
decode_position=None,
cache=None,
max_decode_len=None,
decode=False,
training=False):
eligible_inputs = tf.cast(
tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
if encoder_dense_inputs is not None:
eligible_dense_inputs = tf.cast(
tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1),
self.compute_dtype)
eligible_inputs = tf.concat([eligible_inputs, eligible_dense_inputs],
axis=1)
if decode:
# For decoding, the decoder_input_tokens is the decoder_target_tokens.
decoder_input_tokens = decoder_target_tokens
......@@ -1342,14 +1381,12 @@ class T5Transformer(Module):
tf.cast(
tf.not_equal(tf.ones_like(decoder_target_tokens), 0),
self.compute_dtype),
tf.cast(tf.not_equal(encoder_input_tokens, 0), self.compute_dtype),
eligible_inputs,
dtype=tf.bool)
else:
# Note that, masks should be created using decoder_target_tokens.
eligible_targets = tf.cast(
tf.not_equal(decoder_target_tokens, 0), self.compute_dtype)
eligible_inputs = tf.cast(
tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
decoder_mask = tf.math.logical_and(
make_attention_mask(
eligible_targets, eligible_targets, dtype=tf.bool),
......@@ -1365,6 +1402,9 @@ class T5Transformer(Module):
decoder_segment_ids,
tf.equal,
dtype=tf.bool))
if encoder_dense_segment_ids is not None:
encoder_segment_ids = tf.concat(
[encoder_segment_ids, encoder_dense_segment_ids], axis=1)
encoder_decoder_mask = tf.math.logical_and(
encoder_decoder_mask,
make_attention_mask(
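For clarity, a small standalone sketch of the eligibility logic used in encode and decode above (toy tensors, not the model code): a token position is eligible when its id is non-zero, and a dense position is eligible when any element of its vector is non-zero, so both kinds of positions can share one attention mask.
```python
import tensorflow as tf

encoder_input_tokens = tf.constant([[5, 3, 0, 0]])              # [batch, len]
encoder_dense_inputs = tf.constant([[[0.1, 0.0], [0.0, 0.0]]])  # [batch, dense_len, dim]

eligible_tokens = tf.cast(tf.not_equal(encoder_input_tokens, 0), tf.float32)
eligible_dense = tf.cast(
    tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1), tf.float32)
eligible_positions = tf.concat([eligible_tokens, eligible_dense], axis=1)
print(eligible_positions.numpy())  # [[1. 1. 0. 0. 1. 0.]]
```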
......@@ -1392,6 +1432,8 @@ class T5Transformer(Module):
def __call__(self,
encoder_input_tokens,
decoder_target_tokens,
encoder_dense_inputs=None,
encoder_dense_segment_ids=None,
decoder_input_tokens=None,
encoder_segment_ids=None,
decoder_segment_ids=None,
......@@ -1401,9 +1443,12 @@ class T5Transformer(Module):
Args:
encoder_input_tokens: input tokens to the encoder.
decoder_target_tokens: target tokens to the decoder.
encoder_dense_inputs: input dense vectors to the encoder.
encoder_dense_segment_ids: dense input segmentation info for packed
examples.
decoder_input_tokens: input tokens to the decoder, only required for
training.
encoder_segment_ids: input segmentation info for packed examples.
decoder_segment_ids: target segmentation info for packed examples.
training: whether it is training pass, affecting dropouts.
......@@ -1413,13 +1458,17 @@ class T5Transformer(Module):
encoded = self.encode(
encoder_input_tokens,
encoder_segment_ids=encoder_segment_ids,
encoder_dense_inputs=encoder_dense_inputs,
encoder_dense_segment_ids=encoder_dense_segment_ids,
training=training)
outputs = self.decode(
encoded=encoded,
decoder_target_tokens=decoder_target_tokens,
encoder_input_tokens=encoder_input_tokens, # only used for masks.
encoder_dense_inputs=encoder_dense_inputs, # only used for masks.
decoder_input_tokens=decoder_input_tokens,
encoder_segment_ids=encoder_segment_ids,
encoder_dense_segment_ids=encoder_dense_segment_ids,
decoder_segment_ids=decoder_segment_ids,
training=training)
outputs["encoded"] = encoded
......
......@@ -354,6 +354,24 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
encoded = encoder(tf.zeros((4, 8), dtype=tf.int32))
self.assertEqual(encoded.shape, (4, 8, config.d_model))
@parameterized.named_parameters(("bfloat16", tf.bfloat16),
("float32", tf.float32))
def test_encoder_with_dense(self, dtype):
config = t5.T5TransformerParams(
num_layers=2,
d_model=4,
d_kv=3,
num_heads=4,
d_ff=16,
vocab_size=10,
vocab_embeddings_initializer=tf.keras.initializers.Ones(),
relative_embeddings_initializer=tf.keras.initializers.Ones())
encoder = t5.Encoder(config, compute_dtype=dtype)
encoded = encoder(
tf.zeros((4, 8), dtype=tf.int32),
dense_inputs=tf.ones((4, 2, 4), dtype=dtype))
self.assertEqual(encoded.shape, (4, 10, config.d_model))
def test_decoder(self):
max_decode_len = 10
config = t5.T5TransformerParams(
......@@ -445,6 +463,58 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
print(v.name, v.shape)
self.assertEqual(v.dtype, tf.float32)
@parameterized.named_parameters(
("t5_10", ("relu",), True, 26, False, tf.float32),)
def test_transformer_with_dense(self, ffn_activations, logits_via_embedding,
expect_num_variables, layer_sharing, dtype):
max_decode_len = 10
config = t5.T5TransformerParams(
num_layers=1,
d_model=8,
d_kv=4,
num_heads=4,
d_ff=32,
vocab_size=10,
shared_embedding=True,
layer_sharing=layer_sharing,
ffn_activations=ffn_activations,
logits_via_embedding=logits_via_embedding)
transformer = t5.T5Transformer(config, compute_dtype=dtype)
self.assertLen(transformer.trainable_variables, expect_num_variables)
inputs = tf.convert_to_tensor(
np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
segments = tf.convert_to_tensor(
np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
dense_inputs = tf.convert_to_tensor(np.random.randn(2, 2, 8), dtype=dtype)
dense_segments = tf.convert_to_tensor(np.array([[1, 2], [1, 2]]))
outputs = transformer(
encoder_input_tokens=inputs,
encoder_dense_inputs=dense_inputs,
decoder_input_tokens=inputs,
decoder_target_tokens=inputs,
encoder_segment_ids=segments,
encoder_dense_segment_ids=dense_segments,
decoder_segment_ids=segments)
cache = {}
batch_size = 2
cache[0] = _create_cache(
batch_size, max_decode_len, config.num_heads, config.d_kv, dtype=dtype)
outputs = transformer.decode(
encoder_input_tokens=inputs,
encoder_dense_inputs=dense_inputs,
encoded=outputs["encoded"],
decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
decode_position=1,
decode=True,
max_decode_len=max_decode_len,
cache=cache)
self.assertEqual(outputs["logits"].shape,
(batch_size, 1, config.vocab_size))
for v in transformer.trainable_variables:
print(v.name, v.shape)
self.assertEqual(v.dtype, tf.float32)
@parameterized.named_parameters(
("t5_10", ("relu",), True, 39, tf.float32, 2),
("t5_10_bfloat16", ("relu",), True, 39, tf.bfloat16, 2))
......
......@@ -32,10 +32,12 @@ def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
writer = tf.io.TFRecordWriter(output_path)
def create_int_feature(values):
return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return tf.train.Feature(
int64_list=tf.train.Int64List(value=np.ravel(values)))
def create_float_feature(values):
return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
return tf.train.Feature(
float_list=tf.train.FloatList(value=np.ravel(values)))
for i in range(num_examples):
features = {}
......
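The switch from list(values) to np.ravel(values) lets these helpers accept nested or multi-dimensional values as well as flat lists. A quick standalone illustration with made-up values:
```python
import numpy as np
import tensorflow as tf

values = np.array([[1, 2], [3, 4]])  # e.g. per-example 2-D label data
feature = tf.train.Feature(
    int64_list=tf.train.Int64List(value=np.ravel(values)))
print(list(feature.int64_list.value))  # [1, 2, 3, 4]
# list(values) would yield row arrays instead, which Int64List cannot store.
```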
......@@ -20,8 +20,8 @@ import sys
from setuptools import find_packages
from setuptools import setup
version = '2.7.0'
tf_version = '2.7.0' # Major version.
version = '2.8.0'
tf_version = '2.8.0' # Major version.
project_name = 'tf-models-official'
......
......@@ -203,8 +203,7 @@ class BASNetTask(base_task.Task):
# For mixed_precision policy, when LossScaleOptimizer is used, loss is
# scaled for numerical stability.
if isinstance(
optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
scaled_loss = optimizer.get_scaled_loss(scaled_loss)
tvars = model.trainable_variables
......@@ -212,8 +211,7 @@ class BASNetTask(base_task.Task):
# Scales back gradient before apply_gradients when LossScaleOptimizer is
# used.
if isinstance(
optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
grads = optimizer.get_unscaled_gradients(grads)
# Apply gradient clipping.
......
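These two hunks replace the deprecated tf.keras.mixed_precision.experimental.LossScaleOptimizer check with the stable API. As a reminder of the pattern being checked for, here is a minimal generic training step (a sketch, not the task's actual code):
```python
import tensorflow as tf

optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(learning_rate=0.01))

def train_step(model, features, labels, loss_fn):
  with tf.GradientTape() as tape:
    loss = loss_fn(labels, model(features, training=True))
    # Scale the loss so small gradients survive float16 underflow.
    scaled_loss = optimizer.get_scaled_loss(loss)
  scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
  # Undo the scaling before clipping and applying the gradients.
  grads = optimizer.get_unscaled_gradients(scaled_grads)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return loss
```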
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Generates example dataset for post-training quantization.
Example command line to run the script:
```shell
python3 quantize_movinet.py \
--saved_model_dir=${SAVED_MODEL_DIR} \
--saved_model_with_states_dir=${SAVED_MODEL_WITH_STATES_DIR} \
--output_dataset_dir=${OUTPUT_DATASET_DIR} \
--output_tflite=${OUTPUT_TFLITE} \
--quantization_mode='int_float_fallback' \
--save_dataset_to_tfrecords=True
```
"""
import functools
from typing import Any, Callable, Mapping, Optional
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from official.vision.beta.configs import video_classification as video_classification_configs
from official.vision.beta.tasks import video_classification
tf.enable_v2_behavior()
FLAGS = flags.FLAGS
flags.DEFINE_string(
'saved_model_dir', None, 'The saved_model directory.')
flags.DEFINE_string(
'saved_model_with_states_dir', None,
'The directory to the saved_model with state signature. '
'The saved_model_with_states is needed in order to get the initial state '
'shape and dtype while saved_model is used for the quantization.')
flags.DEFINE_string(
'output_tflite', '/tmp/output.tflite',
'The output tflite file path.')
flags.DEFINE_integer(
'temporal_stride', 5,
'Temporal stride used to generate input videos.')
flags.DEFINE_integer(
'num_frames', 50, 'Number of frames in input videos.')
flags.DEFINE_integer(
'image_size', 172, 'Frame size of input videos.')
flags.DEFINE_string(
'quantization_mode', None,
'The quantization mode. Can be one of "float16", "int8",'
'"int_float_fallback" or None.')
flags.DEFINE_integer(
'num_calibration_videos', 100,
'Number of videos to run to generate example datasets.')
flags.DEFINE_integer(
'num_samples_per_video', 3,
'Number of samples drawn from a single video.')
flags.DEFINE_boolean(
'save_dataset_to_tfrecords', False,
'Whether to save representative dataset to the disk.')
flags.DEFINE_string(
'output_dataset_dir', '/tmp/representative_dataset/',
'The directory to store exported tfrecords.')
flags.DEFINE_integer(
'max_saved_files', 100,
'The maximum number of tfrecord files to save.')
def _bytes_feature(value):
"""Returns a bytes_list from a string / byte."""
if isinstance(value, type(tf.constant(0))):
value = value.numpy() # BytesList won't unpack string from an EagerTensor.
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _float_feature(value):
"""Returns a float_list from a float / double."""
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _int64_feature(value):
"""Returns an int64_list from a bool / enum / int / uint."""
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _build_tf_example(feature):
return tf.train.Example(
features=tf.train.Features(feature=feature)).SerializeToString()
def save_to_tfrecord(input_frame: tf.Tensor,
input_states: Mapping[str, tf.Tensor],
frame_index: int,
predictions: tf.Tensor,
output_states: Mapping[str, tf.Tensor],
groundtruth_label_id: tf.Tensor,
output_dataset_dir: str,
file_index: int):
"""Save results to tfrecord."""
features = {}
features['frame_id'] = _int64_feature([frame_index])
features['groundtruth_label'] = _int64_feature(
groundtruth_label_id.numpy().flatten().tolist())
features['predictions'] = _float_feature(
predictions.numpy().flatten().tolist())
image_string = tf.io.encode_png(
tf.squeeze(tf.cast(input_frame * 255., tf.uint8), axis=[0, 1]))
features['image'] = _bytes_feature(image_string.numpy())
# Input/Output states at time T
for k, v in output_states.items():
dtype = v[0].dtype
if dtype == tf.int32:
features['input/' + k] = _int64_feature(
input_states[k].numpy().flatten().tolist())
features['output/' + k] = _int64_feature(
output_states[k].numpy().flatten().tolist())
elif dtype == tf.float32:
features['input/' + k] = _float_feature(
input_states[k].numpy().flatten().tolist())
features['output/' + k] = _float_feature(
output_states[k].numpy().flatten().tolist())
else:
raise ValueError(f'Unrecognized dtype: {dtype}')
tfe = _build_tf_example(features)
record_file = '{}/movinet_stream_{:06d}.tfrecords'.format(
output_dataset_dir, file_index)
logging.info('Saving to %s.', record_file)
with tf.io.TFRecordWriter(record_file) as writer:
writer.write(tfe)
def get_dataset() -> tf.data.Dataset:
"""Gets dataset source."""
config = video_classification_configs.video_classification_kinetics600()
temporal_stride = FLAGS.temporal_stride
num_frames = FLAGS.num_frames
image_size = FLAGS.image_size
feature_shape = (num_frames, image_size, image_size, 3)
config.task.validation_data.global_batch_size = 1
config.task.validation_data.feature_shape = feature_shape
config.task.validation_data.temporal_stride = temporal_stride
config.task.train_data.min_image_size = int(1.125 * image_size)
config.task.validation_data.dtype = 'float32'
config.task.validation_data.drop_remainder = False
task = video_classification.VideoClassificationTask(config.task)
valid_dataset = task.build_inputs(config.task.validation_data)
valid_dataset = valid_dataset.map(lambda x, y: (x['image'], y))
valid_dataset = valid_dataset.prefetch(32)
return valid_dataset
def stateful_representative_dataset_generator(
model: tf.keras.Model,
dataset_iter: Any,
init_states: Mapping[str, tf.Tensor],
save_dataset_to_tfrecords: bool = False,
max_saved_files: int = 100,
output_dataset_dir: Optional[str] = None,
num_samples_per_video: int = 3,
num_calibration_videos: int = 100):
"""Generates sample input data with states.
Args:
model: the inference keras model.
dataset_iter: the dataset source.
init_states: the initial states for the model.
save_dataset_to_tfrecords: whether to save the representative dataset to
tfrecords on disk.
max_saved_files: the max number of saved tfrecords files.
output_dataset_dir: the directory to store the saved tfrecords.
num_samples_per_video: number of randomly sampled frames per video.
num_calibration_videos: number of calibration videos to run.
Yields:
A dictionary of model inputs.
"""
counter = 0
for i in range(num_calibration_videos):
if i % 100 == 0:
logging.info('Reading representative dataset id %d.', i)
example_input, example_label = next(dataset_iter)
groundtruth_label_id = tf.argmax(example_label, axis=-1)
input_states = init_states
# split video into frames along the temporal dimension.
frames = tf.split(example_input, example_input.shape[1], axis=1)
random_indices = np.random.randint(
low=1, high=len(frames), size=num_samples_per_video)
# always include the first frame
random_indices[0] = 0
random_indices = set(random_indices)
for frame_index, frame in enumerate(frames):
predictions, output_states = model({'image': frame, **input_states})
if frame_index in random_indices:
if save_dataset_to_tfrecords and counter < max_saved_files:
save_to_tfrecord(
input_frame=frame,
input_states=input_states,
frame_index=frame_index,
predictions=predictions,
output_states=output_states,
groundtruth_label_id=groundtruth_label_id,
output_dataset_dir=output_dataset_dir,
file_index=counter)
yield {'image': frame, **input_states}
counter += 1
# update states for the next inference step
input_states = output_states
def get_tflite_converter(
saved_model_dir: str,
quantization_mode: str,
representative_dataset: Optional[Callable[..., Any]] = None
) -> tf.lite.TFLiteConverter:
"""Gets tflite converter."""
converter = tf.lite.TFLiteConverter.from_saved_model(
saved_model_dir=saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
if quantization_mode == 'float16':
logging.info('Using float16 quantization.')
converter.target_spec.supported_types = [tf.float16]
elif quantization_mode == 'int8':
logging.info('Using full integer quantization.')
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
elif quantization_mode == 'int_float_fallback':
logging.info('Using integer quantization with floating-point fallback.')
converter.representative_dataset = representative_dataset
else:
logging.info('Using dynamic range quantization.')
return converter
def quantize_movinet(dataset_fn):
"""Quantizes Movinet."""
valid_dataset = dataset_fn()
dataset_iter = iter(valid_dataset)
# Load model
encoder = hub.KerasLayer(FLAGS.saved_model_with_states_dir, trainable=False)
inputs = tf.keras.layers.Input(
shape=[1, FLAGS.image_size, FLAGS.image_size, 3],
dtype=tf.float32,
name='image')
# Define the state inputs: a dict that maps state names to tensors.
init_states_fn = encoder.resolved_object.signatures['init_states']
state_shapes = {
name: ([s if s > 0 else None for s in state.shape], state.dtype)
for name, state in init_states_fn(
tf.constant([1, 1, FLAGS.image_size, FLAGS.image_size, 3])).items()
}
states_input = {
name: tf.keras.Input(shape[1:], dtype=dtype, name=name)
for name, (shape, dtype) in state_shapes.items()
}
# The inputs to the model are the states and the video
inputs = {**states_input, 'image': inputs}
outputs = encoder(inputs)
model = tf.keras.Model(inputs, outputs, name='movinet_stream')
input_shape = tf.constant(
[1, FLAGS.num_frames, FLAGS.image_size, FLAGS.image_size, 3])
init_states = init_states_fn(input_shape)
# Configure the representative dataset function.
representative_dataset = functools.partial(
stateful_representative_dataset_generator,
model=model,
dataset_iter=dataset_iter,
init_states=init_states,
save_dataset_to_tfrecords=FLAGS.save_dataset_to_tfrecords,
max_saved_files=FLAGS.max_saved_files,
output_dataset_dir=FLAGS.output_dataset_dir,
num_samples_per_video=FLAGS.num_samples_per_video,
num_calibration_videos=FLAGS.num_calibration_videos)
converter = get_tflite_converter(
saved_model_dir=FLAGS.saved_model_dir,
quantization_mode=FLAGS.quantization_mode,
representative_dataset=representative_dataset)
logging.info('Converting...')
tflite_buffer = converter.convert()
return tflite_buffer
def main(_):
tflite_buffer = quantize_movinet(dataset_fn=get_dataset)
with open(FLAGS.output_tflite, 'wb') as f:
f.write(tflite_buffer)
logging.info('tflite model written to %s', FLAGS.output_tflite)
if __name__ == '__main__':
flags.mark_flag_as_required('saved_model_dir')
flags.mark_flag_as_required('saved_model_with_states_dir')
app.run(main)
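To sanity-check the converted model, one can load it with the TFLite interpreter and run a single inference on zero-filled inputs. This is a hedged sketch: it assumes the flag's default output path, and a streaming MoViNet expects the image plus all of its state tensors as inputs.
```python
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='/tmp/output.tflite')  # default --output_tflite
interpreter.allocate_tensors()
for detail in interpreter.get_input_details():
  # Zero-filled placeholders for the image and every state tensor.
  interpreter.set_tensor(detail['index'],
                         np.zeros(detail['shape'], dtype=detail['dtype']))
interpreter.invoke()
output = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])
print(output.shape)
```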
# Quantization Aware Training Project for Computer Vision Models
[TOC]
⚠️ Disclaimer: All datasets hyperlinked from this page are not owned or
distributed by Google. These datasets are made available by third parties.
Please review the terms and conditions made available by the third parties
before using the data.
## Overview
This project includes quantization aware training code for computer vision
models. These examples show how to apply the Model Optimization Toolkit's
[quantization aware training API](https://www.tensorflow.org/model_optimization/guide/quantization/training).
Note: Currently, we support a limited number of ML tasks and models (e.g., image
classification and semantic segmentation). We will keep adding support for more
ML tasks and models in upcoming releases.
## How to train a model
```
EXPERIMENT=xxx # Change this for your run, for example, 'mobilenet_imagenet_qat'
CONFIG_FILE=xxx # Change this for your run, for example, path of imagenet_mobilenetv2_qat_gpu.yaml
MODEL_DIR=xxx # Change this for your run, for example, /tmp/model_dir
$ python3 train.py \
--experiment=${EXPERIMENT} \
--config_file=${CONFIG_FILE} \
--model_dir=${MODEL_DIR} \
--mode=train_and_eval
```
## Model Accuracy
<figure align="center">
<img width=70% src=https://storage.googleapis.com/tf_model_garden/models/qat/images/readme-qat-classification-plot.png>
<figcaption>Comparison of Imagenet top-1 accuracy for the classification models</figcaption>
</figure>
Note: The Top-1 model accuracy is measured on the validation set of [ImageNet](https://www.image-net.org/).
### Pre-trained Models
|Model |Resolution|Top-1 Accuracy (FP32)|Top-1 Accuracy (Int8/PTQ)|Top-1 Accuracy (Int8/QAT)|Config |Download |
|----------------------|----------|---------------------|-------------------------|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
|MobileNetV2 |224x224 |72.782% |72.392% |72.792% |[config](https://github.com/tensorflow/models/blob/master/official/projects/qat/vision/configs/experiments/image_classification/imagenet_mobilenetv2_qat_gpu.yaml) |[TFLite(Int8/QAT)](https://storage.googleapis.com/tf_model_garden/vision/mobilenet/v2_1.0_int8/mobilenet_v2_1.00_224_int8.tflite) |
|ResNet50 |224x224 |76.710% |76.420% |77.200% |[config](https://github.com/tensorflow/models/blob/master/official/projects/qat/vision/configs/experiments/image_classification/imagenet_resnet50_qat_gpu.yaml) |[TFLite(Int8/QAT)](https://storage.googleapis.com/tf_model_garden/vision/resnet50_imagenet/resnet_50_224_int8.tflite) |
|MobileNetV3.5 MultiAVG|224x224 |75.212% |74.122% |75.130% |[config](https://github.com/tensorflow/models/blob/master/official/projects/qat/vision/configs/experiments/image_classification/imagenet_mobilenetv3.5_qat_gpu.yaml)|[TFLite(Int8/QAT)](https://storage.googleapis.com/tf_model_garden/vision/mobilenet/v3.5multiavg_1.0_int8/mobilenet_v3.5multiavg_1.00_224_int8.tflite)|
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Configs package definition."""
from official.projects.qat.vision.configs import image_classification
from official.projects.qat.vision.configs import semantic_segmentation
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Image classification configuration definition."""
import dataclasses
from typing import Optional
from official.modeling import hyperparams
@dataclasses.dataclass
class Quantization(hyperparams.Config):
"""Quantization parameters.
Attributes:
pretrained_original_checkpoint: A string indicate pretrained checkpoint
location.
change_num_bits: A `bool` indicates whether to manually allocate num_bits.
num_bits_weight: An `int` number of bits for weight. Default to 8.
num_bits_activation: An `int` number of bits for activation. Default to 8.
"""
pretrained_original_checkpoint: Optional[str] = None
change_num_bits: bool = False
num_bits_weight: int = 8
num_bits_activation: int = 8
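For reference, a hypothetical in-Python construction of this config; the checkpoint path is a placeholder, and in practice these values come from YAML experiment files such as the ones below.
```python
# Hypothetical usage only; the checkpoint path is a placeholder.
quantization = Quantization(
    pretrained_original_checkpoint='gs://your-bucket/path/to/ckpt',
    change_num_bits=True,   # manually allocate bit widths
    num_bits_weight=4,      # illustrative non-default value
    num_bits_activation=8)
```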
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float32'
loss_scale: 'dynamic'
task:
model:
num_classes: 1001
input_size: [224, 224, 3]
backbone:
type: 'mobilenet'
mobilenet:
model_id: 'MobileNetV2'
filter_size_scale: 1.0
dropout_rate: 0.1
losses:
l2_weight_decay: 0.0000001
one_hot: true
label_smoothing: 0.1
train_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/train*'
is_training: true
global_batch_size: 512 # 64 * 8
dtype: 'float32'
validation_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*'
is_training: false
global_batch_size: 512 # 64 * 8
dtype: 'float32'
drop_remainder: false
quantization:
pretrained_original_checkpoint: 'gs://**/mobilenetv2_gpu/22984194/ckpt-625500'
trainer:
# With the settings below, QAT reaches a top-1 accuracy of 0.7279 after 43 hours on 8 GPUs.
train_steps: 250200
validation_steps: 98
validation_interval: 2502
steps_per_loop: 2502
summary_interval: 2502
checkpoint_interval: 2502
optimizer_config:
learning_rate:
type: 'exponential'
exponential:
decay_rate: 0.9
decay_steps: 1251
initial_learning_rate: 0.0001
name: 'ExponentialDecay'
offset: 0
staircase: true
warmup:
type: 'linear'
linear:
warmup_steps: 0
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float32'
loss_scale: 'dynamic'
task:
model:
num_classes: 1001
input_size: [224, 224, 3]
backbone:
type: 'mobilenet'
mobilenet:
model_id: 'MobileNetV2'
filter_size_scale: 1.0
dropout_rate: 0.0 # changed from 0.2 to 0.0
losses:
l2_weight_decay: 0.0000001
one_hot: true
label_smoothing: 0.1
train_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/train*'
is_training: true
global_batch_size: 256
dtype: 'float32'
validation_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*'
is_training: false
global_batch_size: 256
dtype: 'float32'
drop_remainder: false
quantization:
pretrained_original_checkpoint: 'gs://**/mobilenetv2_gpu/22984194/ckpt-625500'
trainer:
# With the settings below, QAT reaches a top-1 accuracy of 0.7251 at 420336 steps after
# 1 day 19 hours of training on 8 GPUs, which is higher than the PTQ result for MobileNetV2.
train_steps: 1000800 # 200 epochs
validation_steps: 196 # NUM_EXAMPLES (50000) // global_batch_size (256)
validation_interval: 5004 # 1 epoch
steps_per_loop: 5004 # NUM_EXAMPLES (1281167) // global_batch_size (256)
summary_interval: 5004 # 1 epoch
checkpoint_interval: 5004 # 1 epoch
max_to_keep: 200
optimizer_config:
learning_rate:
type: 'exponential'
exponential:
initial_learning_rate: 0.0001
decay_steps: 1251 # steps_per_epoch // 4
decay_rate: 0.96
staircase: true
warmup:
type: 'linear'
linear:
warmup_steps: 0
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float32'
loss_scale: 'dynamic'
task:
model:
num_classes: 1001
input_size: [224, 224, 3]
backbone:
type: 'mobilenet'
mobilenet:
model_id: 'MobileNetV2'
filter_size_scale: 1.0
dropout_rate: 0.0 # changed from 0.2 to 0.0
losses:
l2_weight_decay: 0.0000001
one_hot: true
label_smoothing: 0.1
train_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/train*'
is_training: true
global_batch_size: 512
dtype: 'float32'
validation_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*'
is_training: false
global_batch_size: 512
dtype: 'float32'
drop_remainder: false
quantization:
pretrained_original_checkpoint: 'gs://**/mobilenetv2_gpu/22984194/ckpt-625500'
trainer:
# With the settings below, QAT reaches a top-1 accuracy of 0.7266 at 312750 steps after
# 1 day 22 hours of training on 8 GPUs, which is higher than the PTQ result for MobileNetV2.
train_steps: 500400 # 200 epochs
validation_steps: 98 # NUM_EXAMPLES (50000) // global_batch_size (512)
validation_interval: 2502 # 1 epoch
steps_per_loop: 2502 # NUM_EXAMPLES (1281167) // global_batch_size (512)
summary_interval: 2502 # 1 epoch
checkpoint_interval: 2502 # 1 epoch
max_to_keep: 200
optimizer_config:
learning_rate:
type: 'exponential'
exponential:
initial_learning_rate: 0.0002
decay_steps: 1251 # steps_per_epoch // 2
decay_rate: 0.96
staircase: true
warmup:
type: 'linear'
linear:
warmup_steps: 0
runtime:
distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float32'
loss_scale: 'dynamic'
task:
model:
num_classes: 1001
input_size: [224, 224, 3]
backbone:
type: 'mobilenet'
mobilenet:
model_id: 'MobileNetMultiAVG'
filter_size_scale: 1.0
dropout_rate: 0.3
losses:
l2_weight_decay: 0.000001
one_hot: true
label_smoothing: 0.1
train_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/train*'
is_training: true
global_batch_size: 512
dtype: 'float32'
validation_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*'
is_training: false
global_batch_size: 512
dtype: 'float32'
drop_remainder: false
quantization:
pretrained_original_checkpoint: 'gs://**/tf2_mhave_nobias_bn_aug05/28334857/ckpt-156000'
trainer:
# With the settings below, QAT reaches a top-1 accuracy of 0.7513 after 30 hours on 8 GPUs.
train_steps: 250200
validation_steps: 98
validation_interval: 2502
steps_per_loop: 2502
summary_interval: 2502
checkpoint_interval: 2502
optimizer_config:
learning_rate:
type: 'exponential'
exponential:
decay_rate: 0.9
decay_steps: 1251
initial_learning_rate: 0.0004
name: 'ExponentialDecay'
offset: 0
staircase: true
warmup:
type: 'linear'
linear:
warmup_steps: 0
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'float32'
task:
model:
num_classes: 1001
input_size: [224, 224, 3]
backbone:
type: 'mobilenet'
mobilenet:
model_id: 'MobileNetV3Large'
filter_size_scale: 1.0
dropout_rate: 0.3
losses:
l2_weight_decay: 1.0e-06 # 1/10 of original value.
one_hot: true
label_smoothing: 0.1
train_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/train*'
is_training: true
global_batch_size: 4096
dtype: 'float32'
aug_rand_hflip: true
aug_type:
autoaug:
augmentation_name: v0
cutout_const: 100
translate_const: 250
type: autoaug
drop_remainder: true
validation_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*'
is_training: false
global_batch_size: 4096
dtype: 'float32'
drop_remainder: false
aug_rand_hflip: true
quantization:
pretrained_original_checkpoint: 'gs://**/mobilenetv3_baseline_31/ckpt-156000'
trainer:
# With the settings below, QAT reaches a top-1 accuracy of 0.7443 after ~2 hours on a 4x4 DF.
train_steps: 62400
validation_steps: 13
validation_interval: 312
steps_per_loop: 312
summary_interval: 312
checkpoint_interval: 312
optimizer_config:
learning_rate:
cosine:
alpha: 0.0
decay_steps: 62400
initial_learning_rate: 0.0003 # 1/10 of original lr.
name: CosineDecay
offset: 0
type: cosine
optimizer:
adamw:
amsgrad: false
beta_1: 0.9
beta_2: 0.999
epsilon: 1.0e-07
gradient_clip_norm: 1.0
weight_decay_rate: 0.0
type: adamw
warmup:
type: 'linear'
linear:
warmup_steps: 0