Unverified Commit 0225b135 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 7479dbb8 4c571a3c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for image_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import image_classification as exp_cfg
class ImageClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('resnet_imagenet',),
('resnet_rs_imagenet',),
('revnet_imagenet',),
('mobilenet_imagenet',),
)
def test_image_classification_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.ImageClassificationTask)
self.assertIsInstance(config.task.model,
exp_cfg.ImageClassificationModel)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""R-CNN(-RS) configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones
# pylint: disable=missing-class-docstring
@dataclasses.dataclass
class Parser(hyperparams.Config):
num_channels: int = 3
match_threshold: float = 0.5
unmatched_threshold: float = 0.5
aug_rand_hflip: bool = False
aug_scale_min: float = 1.0
aug_scale_max: float = 1.0
skip_crowd_during_training: bool = True
max_num_instances: int = 100
rpn_match_threshold: float = 0.7
rpn_unmatched_threshold: float = 0.3
rpn_batch_size_per_im: int = 256
rpn_fg_fraction: float = 0.5
mask_crop_size: int = 112
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""Input config for training."""
input_path: str = ''
global_batch_size: int = 0
is_training: bool = False
dtype: str = 'bfloat16'
decoder: common.DataDecoder = common.DataDecoder()
parser: Parser = Parser()
shuffle_buffer_size: int = 10000
file_type: str = 'tfrecord'
drop_remainder: bool = True
# Number of examples in the dataset; used to create the annotation file.
num_examples: int = -1
@dataclasses.dataclass
class Anchor(hyperparams.Config):
num_scales: int = 1
aspect_ratios: List[float] = dataclasses.field(
default_factory=lambda: [0.5, 1.0, 2.0])
anchor_size: float = 8.0
@dataclasses.dataclass
class RPNHead(hyperparams.Config):
num_convs: int = 1
num_filters: int = 256
use_separable_conv: bool = False
@dataclasses.dataclass
class DetectionHead(hyperparams.Config):
num_convs: int = 4
num_filters: int = 256
use_separable_conv: bool = False
num_fcs: int = 1
fc_dims: int = 1024
class_agnostic_bbox_pred: bool = False # Has to be True for Cascade RCNN.
# If additional IoUs are passed in 'cascade_iou_thresholds'
# then ensemble the class probabilities from all heads.
cascade_class_ensemble: bool = False
@dataclasses.dataclass
class ROIGenerator(hyperparams.Config):
pre_nms_top_k: int = 2000
pre_nms_score_threshold: float = 0.0
pre_nms_min_size_threshold: float = 0.0
nms_iou_threshold: float = 0.7
num_proposals: int = 1000
test_pre_nms_top_k: int = 1000
test_pre_nms_score_threshold: float = 0.0
test_pre_nms_min_size_threshold: float = 0.0
test_nms_iou_threshold: float = 0.7
test_num_proposals: int = 1000
use_batched_nms: bool = False
@dataclasses.dataclass
class ROISampler(hyperparams.Config):
mix_gt_boxes: bool = True
num_sampled_rois: int = 512
foreground_fraction: float = 0.25
foreground_iou_threshold: float = 0.5
background_iou_high_threshold: float = 0.5
background_iou_low_threshold: float = 0.0
# IoU thresholds for additional FRCNN heads in Cascade mode.
# `foreground_iou_threshold` is the first threshold.
cascade_iou_thresholds: Optional[List[float]] = None
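# Example (illustrative sketch, not part of the original file): a Cascade
# R-CNN(-RS) setup pairs a class-agnostic head with extra sampler IoU
# thresholds, as `cascadercnn_spinenet_coco` below does:
#
#   detection_head = DetectionHead(
#       class_agnostic_bbox_pred=True, cascade_class_ensemble=True)
#   roi_sampler = ROISampler(cascade_iou_thresholds=[0.6, 0.7])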
@dataclasses.dataclass
class ROIAligner(hyperparams.Config):
crop_size: int = 7
sample_offset: float = 0.5
@dataclasses.dataclass
class DetectionGenerator(hyperparams.Config):
apply_nms: bool = True
pre_nms_top_k: int = 5000
pre_nms_score_threshold: float = 0.05
nms_iou_threshold: float = 0.5
max_num_detections: int = 100
nms_version: str = 'v2' # `v2`, `v1`, `batched`
use_cpu_nms: bool = False
soft_nms_sigma: Optional[float] = None # Only works when nms_version='v1'.
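# Example (illustrative sketch): Soft-NMS only takes effect with the v1
# generator; the sigma value here is a placeholder.
#
#   detection_generator = DetectionGenerator(
#       nms_version='v1', soft_nms_sigma=0.5)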
@dataclasses.dataclass
class MaskHead(hyperparams.Config):
upsample_factor: int = 2
num_convs: int = 4
num_filters: int = 256
use_separable_conv: bool = False
class_agnostic: bool = False
@dataclasses.dataclass
class MaskSampler(hyperparams.Config):
num_sampled_masks: int = 128
@dataclasses.dataclass
class MaskROIAligner(hyperparams.Config):
crop_size: int = 14
sample_offset: float = 0.5
@dataclasses.dataclass
class MaskRCNN(hyperparams.Config):
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
min_level: int = 2
max_level: int = 6
anchor: Anchor = Anchor()
include_mask: bool = True
backbone: backbones.Backbone = backbones.Backbone(
type='resnet', resnet=backbones.ResNet())
decoder: decoders.Decoder = decoders.Decoder(
type='fpn', fpn=decoders.FPN())
rpn_head: RPNHead = RPNHead()
detection_head: DetectionHead = DetectionHead()
roi_generator: ROIGenerator = ROIGenerator()
roi_sampler: ROISampler = ROISampler()
roi_aligner: ROIAligner = ROIAligner()
detection_generator: DetectionGenerator = DetectionGenerator()
mask_head: Optional[MaskHead] = MaskHead()
mask_sampler: Optional[MaskSampler] = MaskSampler()
mask_roi_aligner: Optional[MaskROIAligner] = MaskROIAligner()
norm_activation: common.NormActivation = common.NormActivation(
norm_momentum=0.997,
norm_epsilon=0.0001,
use_sync_bn=True)
@dataclasses.dataclass
class Losses(hyperparams.Config):
loss_weight: float = 1.0
rpn_huber_loss_delta: float = 1. / 9.
frcnn_huber_loss_delta: float = 1.
l2_weight_decay: float = 0.0
rpn_score_weight: float = 1.0
rpn_box_weight: float = 1.0
frcnn_class_weight: float = 1.0
frcnn_box_weight: float = 1.0
mask_weight: float = 1.0
@dataclasses.dataclass
class MaskRCNNTask(cfg.TaskConfig):
model: MaskRCNN = MaskRCNN()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False,
drop_remainder=False)
losses: Losses = Losses()
init_checkpoint: Optional[str] = None
init_checkpoint_modules: Union[
str, List[str]] = 'all' # all, backbone, and/or decoder
annotation_file: Optional[str] = None
per_category_metrics: bool = False
# If set, we only use masks for the specified class IDs.
allowed_mask_class_ids: Optional[List[int]] = None
# If set, the COCO metrics will be computed.
use_coco_metrics: bool = True
# If set, the Waymo Open Dataset evaluator will be used.
use_wod_metrics: bool = False
COCO_INPUT_PATH_BASE = 'coco'
@exp_factory.register_config_factory('fasterrcnn_resnetfpn_coco')
def fasterrcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Faster R-CNN."""
steps_per_epoch = 500
coco_val_samples = 5000
train_batch_size = 64
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=MaskRCNNTask(
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
init_checkpoint_modules='backbone',
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
num_classes=91,
input_size=[1024, 1024, 3],
include_mask=False,
mask_head=None,
mask_sampler=None,
mask_roi_aligner=None),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=22500,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [15000, 20000],
'values': [0.12, 0.012, 0.0012],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 500,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
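# Usage sketch (not part of this file): registered experiments are retrieved
# by name and fields can be overridden before validation; the batch size
# override below is purely illustrative.
#
#   from official.core import exp_factory
#   config = exp_factory.get_exp_config('fasterrcnn_resnetfpn_coco')
#   config.task.train_data.global_batch_size = 32
#   config.validate()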
@exp_factory.register_config_factory('maskrcnn_resnetfpn_coco')
def maskrcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Mask R-CNN."""
steps_per_epoch = 500
coco_val_samples = 5000
train_batch_size = 64
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(
mixed_precision_dtype='bfloat16', enable_xla=True),
task=MaskRCNNTask(
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
init_checkpoint_modules='backbone',
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
num_classes=91, input_size=[1024, 1024, 3], include_mask=True),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=22500,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [15000, 20000],
'values': [0.12, 0.012, 0.0012],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 500,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('maskrcnn_spinenet_coco')
def maskrcnn_spinenet_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Mask R-CNN with SpineNet backbone."""
steps_per_epoch = 463
coco_val_samples = 5000
train_batch_size = 256
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=MaskRCNNTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
backbone=backbones.Backbone(
type='spinenet',
spinenet=backbones.SpineNet(
model_id='49',
min_level=3,
max_level=7,
)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(use_sync_bn=True),
num_classes=91,
input_size=[640, 640, 3],
min_level=3,
max_level=7,
include_mask=True),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.5, aug_scale_max=2.0)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=steps_per_epoch * 350,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
steps_per_epoch * 320, steps_per_epoch * 340
],
'values': [0.32, 0.032, 0.0032],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet.min_level',
'task.model.max_level == task.model.backbone.spinenet.max_level',
])
return config
@exp_factory.register_config_factory('cascadercnn_spinenet_coco')
def cascadercnn_spinenet_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Cascade RCNN-RS with SpineNet backbone."""
steps_per_epoch = 463
coco_val_samples = 5000
train_batch_size = 256
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=MaskRCNNTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
backbone=backbones.Backbone(
type='spinenet',
spinenet=backbones.SpineNet(
model_id='49',
min_level=3,
max_level=7,
)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
roi_sampler=ROISampler(cascade_iou_thresholds=[0.6, 0.7]),
detection_head=DetectionHead(
class_agnostic_bbox_pred=True, cascade_class_ensemble=True),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(
use_sync_bn=True, activation='swish'),
num_classes=91,
input_size=[640, 640, 3],
min_level=3,
max_level=7,
include_mask=True),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.5)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=steps_per_epoch * 500,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
steps_per_epoch * 475, steps_per_epoch * 490
],
'values': [0.32, 0.032, 0.0032],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet.min_level',
'task.model.max_level == task.model.backbone.spinenet.max_level',
])
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for maskrcnn."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import maskrcnn as exp_cfg
class MaskRCNNConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('fasterrcnn_resnetfpn_coco',),
('maskrcnn_resnetfpn_coco',),
('maskrcnn_spinenet_coco',),
('cascadercnn_spinenet_coco',),
)
def test_maskrcnn_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.MaskRCNNTask)
self.assertIsInstance(config.task.model, exp_cfg.MaskRCNN)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
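# Note: the misspelled 'inconsistncy' below matches the error message actually
# raised by config.validate(), so the regex must keep the misspelling.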
with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""RetinaNet configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones
# pylint: disable=missing-class-docstring
# Keep for backward compatibility.
@dataclasses.dataclass
class TfExampleDecoder(common.TfExampleDecoder):
"""A simple TF Example decoder config."""
# Keep for backward compatibility.
@dataclasses.dataclass
class TfExampleDecoderLabelMap(common.TfExampleDecoderLabelMap):
"""TF Example decoder with label map config."""
# Keep for backward compatibility.
@dataclasses.dataclass
class DataDecoder(common.DataDecoder):
"""Data decoder config."""
@dataclasses.dataclass
class Parser(hyperparams.Config):
num_channels: int = 3
match_threshold: float = 0.5
unmatched_threshold: float = 0.5
aug_rand_hflip: bool = False
aug_scale_min: float = 1.0
aug_scale_max: float = 1.0
skip_crowd_during_training: bool = True
max_num_instances: int = 100
# Can choose AutoAugment and RandAugment.
aug_type: Optional[common.Augmentation] = None
# Keep for backward compatibility. Not used.
aug_policy: Optional[str] = None
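# Example (illustrative sketch): enabling RandAugment through `aug_type`,
# assuming `common.Augmentation` selects the policy via its `type` field as in
# configs/common.py:
#
#   parser = Parser(
#       aug_rand_hflip=True,
#       aug_type=common.Augmentation(type='randaug'))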
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""Input config for training."""
input_path: str = ''
global_batch_size: int = 0
is_training: bool = False
dtype: str = 'bfloat16'
decoder: common.DataDecoder = common.DataDecoder()
parser: Parser = Parser()
shuffle_buffer_size: int = 10000
file_type: str = 'tfrecord'
@dataclasses.dataclass
class Anchor(hyperparams.Config):
num_scales: int = 3
aspect_ratios: List[float] = dataclasses.field(
default_factory=lambda: [0.5, 1.0, 2.0])
anchor_size: float = 4.0
@dataclasses.dataclass
class Losses(hyperparams.Config):
loss_weight: float = 1.0
focal_loss_alpha: float = 0.25
focal_loss_gamma: float = 1.5
huber_loss_delta: float = 0.1
box_loss_weight: int = 50
l2_weight_decay: float = 0.0
@dataclasses.dataclass
class AttributeHead(hyperparams.Config):
name: str = ''
type: str = 'regression'
size: int = 1
@dataclasses.dataclass
class RetinaNetHead(hyperparams.Config):
num_convs: int = 4
num_filters: int = 256
use_separable_conv: bool = False
attribute_heads: List[AttributeHead] = dataclasses.field(default_factory=list)
@dataclasses.dataclass
class DetectionGenerator(hyperparams.Config):
apply_nms: bool = True
pre_nms_top_k: int = 5000
pre_nms_score_threshold: float = 0.05
nms_iou_threshold: float = 0.5
max_num_detections: int = 100
nms_version: str = 'v2' # `v2`, `v1`, `batched`, or `tflite`.
use_cpu_nms: bool = False
soft_nms_sigma: Optional[float] = None # Only works when nms_version='v1'.
# When nms_version = `tflite`, values from tflite_post_processing need to be
# specified. They are compatible with the input arguments used by the TFLite
# custom NMS op and override the parameters above.
tflite_post_processing: common.TFLitePostProcessingConfig = common.TFLitePostProcessingConfig(
)
max_detections: int = 200
max_classes_per_detection: int = 5
# Regular NMS runs in a multi-class fashion and is slow. Setting this to False
# uses class-agnostic NMS, which is faster.
use_regular_nms: bool = False
nms_score_threshold: float = 0.1
@dataclasses.dataclass
class RetinaNet(hyperparams.Config):
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
min_level: int = 3
max_level: int = 7
anchor: Anchor = Anchor()
backbone: backbones.Backbone = backbones.Backbone(
type='resnet', resnet=backbones.ResNet())
decoder: decoders.Decoder = decoders.Decoder(
type='fpn', fpn=decoders.FPN())
head: RetinaNetHead = RetinaNetHead()
detection_generator: DetectionGenerator = DetectionGenerator()
norm_activation: common.NormActivation = common.NormActivation()
@dataclasses.dataclass
class ExportConfig(hyperparams.Config):
output_normalized_coordinates: bool = False
cast_num_detections_to_float: bool = False
cast_detection_classes_to_float: bool = False
@dataclasses.dataclass
class RetinaNetTask(cfg.TaskConfig):
model: RetinaNet = RetinaNet()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False)
losses: Losses = Losses()
init_checkpoint: Optional[str] = None
init_checkpoint_modules: Union[
str, List[str]] = 'all' # all, backbone, and/or decoder
annotation_file: Optional[str] = None
per_category_metrics: bool = False
export_config: ExportConfig = ExportConfig()
# If set, the COCO metrics will be computed.
use_coco_metrics: bool = True
# If set, the Waymo Open Dataset evaluator will be used.
use_wod_metrics: bool = False
@exp_factory.register_config_factory('retinanet')
def retinanet() -> cfg.ExperimentConfig:
"""RetinaNet general config."""
return cfg.ExperimentConfig(
task=RetinaNetTask(),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
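# Note: the general config above leaves num_classes=0 and input_size empty; a
# minimal sketch of the overrides needed before it can train (values are
# illustrative only):
#
#   config = exp_factory.get_exp_config('retinanet')
#   config.task.model.num_classes = 91
#   config.task.model.input_size = [640, 640, 3]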
COCO_INPUT_PATH_BASE = 'coco'
COCO_TRAIN_EXAMPLES = 118287
COCO_VAL_EXAMPLES = 5000
@exp_factory.register_config_factory('retinanet_resnetfpn_coco')
def retinanet_resnetfpn_coco() -> cfg.ExperimentConfig:
"""COCO object detection with RetinaNet."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=RetinaNetTask(
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
init_checkpoint_modules='backbone',
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=RetinaNet(
num_classes=91,
input_size=[640, 640, 3],
norm_activation=common.NormActivation(use_sync_bn=False),
min_level=3,
max_level=7),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.2)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
train_steps=72 * steps_per_epoch,
validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
57 * steps_per_epoch, 67 * steps_per_epoch
],
'values': [
0.32 * train_batch_size / 256.0,
0.032 * train_batch_size / 256.0,
0.0032 * train_batch_size / 256.0
],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 500,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('retinanet_spinenet_coco')
def retinanet_spinenet_coco() -> cfg.ExperimentConfig:
"""COCO object detection with RetinaNet using SpineNet backbone."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
input_size = 640
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'),
task=RetinaNetTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=RetinaNet(
backbone=backbones.Backbone(
type='spinenet',
spinenet=backbones.SpineNet(
model_id='49',
stochastic_depth_drop_rate=0.2,
min_level=3,
max_level=7)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(
use_sync_bn=True, activation='swish'),
num_classes=91,
input_size=[input_size, input_size, 3],
min_level=3,
max_level=7),
losses=Losses(l2_weight_decay=4e-5),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
train_steps=500 * steps_per_epoch,
validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
475 * steps_per_epoch, 490 * steps_per_epoch
],
'values': [
0.32 * train_batch_size / 256.0,
0.032 * train_batch_size / 256.0,
0.0032 * train_batch_size / 256.0
],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet.min_level',
'task.model.max_level == task.model.backbone.spinenet.max_level',
])
return config
@exp_factory.register_config_factory('retinanet_mobile_coco')
def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
"""COCO object detection with mobile RetinaNet."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
input_size = 384
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'),
task=RetinaNetTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=RetinaNet(
backbone=backbones.Backbone(
type='spinenet_mobile',
spinenet_mobile=backbones.SpineNetMobile(
model_id='49',
stochastic_depth_drop_rate=0.2,
min_level=3,
max_level=7,
use_keras_upsampling_2d=False)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
head=RetinaNetHead(num_filters=48, use_separable_conv=True),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(
use_sync_bn=True, activation='swish'),
num_classes=91,
input_size=[input_size, input_size, 3],
min_level=3,
max_level=7),
losses=Losses(l2_weight_decay=3e-5),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
train_steps=600 * steps_per_epoch,
validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
575 * steps_per_epoch, 590 * steps_per_epoch
],
'values': [
0.32 * train_batch_size / 256.0,
0.032 * train_batch_size / 256.0,
0.0032 * train_batch_size / 256.0
],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
])
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for retinanet."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import retinanet as exp_cfg
class RetinaNetConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('retinanet_resnetfpn_coco',),
('retinanet_spinenet_coco',),
('retinanet_mobile_coco',),
)
def test_retinanet_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.RetinaNetTask)
self.assertIsInstance(config.task.model, exp_cfg.RetinaNet)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
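# Note: 'inconsistncy' (sic) mirrors the actual error message raised by
# config.validate(); do not "fix" the regex.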
with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Semantic segmentation configuration definition."""
import dataclasses
import math
import os
from typing import List, Optional, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""Input config for training."""
output_size: List[int] = dataclasses.field(default_factory=list)
# If crop_size is specified, the image is first resized to output_size and
# then a crop of size crop_size is taken.
crop_size: List[int] = dataclasses.field(default_factory=list)
input_path: str = ''
global_batch_size: int = 0
is_training: bool = True
dtype: str = 'float32'
shuffle_buffer_size: int = 1000
cycle_length: int = 10
# If resize_eval_groundtruth is set to False, original image sizes are used
# for eval. In that case, groundtruth_padded_size must also be specified so
# that images of variable size can be batched.
resize_eval_groundtruth: bool = True
groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
aug_scale_min: float = 1.0
aug_scale_max: float = 1.0
aug_rand_hflip: bool = True
preserve_aspect_ratio: bool = True
aug_policy: Optional[str] = None
drop_remainder: bool = True
file_type: str = 'tfrecord'
decoder: Optional[common.DataDecoder] = common.DataDecoder()
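# Example (illustrative sketch): evaluating at original image resolution
# requires a padded size so variable-size groundtruth can be batched:
#
#   val_data = DataConfig(
#       is_training=False,
#       resize_eval_groundtruth=False,
#       groundtruth_padded_size=[512, 512])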
@dataclasses.dataclass
class SegmentationHead(hyperparams.Config):
"""Segmentation head config."""
level: int = 3
num_convs: int = 2
num_filters: int = 256
use_depthwise_convolution: bool = False
prediction_kernel_size: int = 1
upsample_factor: int = 1
feature_fusion: Optional[
str] = None # None, deeplabv3plus, panoptic_fpn_fusion or pyramid_fusion
# deeplabv3plus feature fusion params
low_level: Union[int, str] = 2
low_level_num_filters: int = 48
# panoptic_fpn_fusion params
decoder_min_level: Optional[Union[int, str]] = None
decoder_max_level: Optional[Union[int, str]] = None
@dataclasses.dataclass
class MaskScoringHead(hyperparams.Config):
"""Mask Scoring head config."""
num_convs: int = 4
num_filters: int = 128
fc_input_size: List[int] = dataclasses.field(default_factory=list)
num_fcs: int = 2
fc_dims: int = 1024
@dataclasses.dataclass
class SemanticSegmentationModel(hyperparams.Config):
"""Semantic segmentation model config."""
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
min_level: int = 3
max_level: int = 6
head: SegmentationHead = SegmentationHead()
backbone: backbones.Backbone = backbones.Backbone(
type='resnet', resnet=backbones.ResNet())
decoder: decoders.Decoder = decoders.Decoder(type='identity')
mask_scoring_head: Optional[MaskScoringHead] = None
norm_activation: common.NormActivation = common.NormActivation()
@dataclasses.dataclass
class Losses(hyperparams.Config):
loss_weight: float = 1.0
label_smoothing: float = 0.0
ignore_label: int = 255
class_weights: List[float] = dataclasses.field(default_factory=list)
l2_weight_decay: float = 0.0
use_groundtruth_dimension: bool = True
top_k_percent_pixels: float = 1.0
@dataclasses.dataclass
class Evaluation(hyperparams.Config):
report_per_class_iou: bool = True
report_train_mean_iou: bool = True # Turning this off can speed up training.
@dataclasses.dataclass
class SemanticSegmentationTask(cfg.TaskConfig):
"""The model config."""
model: SemanticSegmentationModel = SemanticSegmentationModel()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False)
losses: Losses = Losses()
evaluation: Evaluation = Evaluation()
train_input_partition_dims: List[int] = dataclasses.field(
default_factory=list)
eval_input_partition_dims: List[int] = dataclasses.field(
default_factory=list)
init_checkpoint: Optional[str] = None
init_checkpoint_modules: Union[
str, List[str]] = 'all' # all, backbone, and/or decoder
@exp_factory.register_config_factory('semantic_segmentation')
def semantic_segmentation() -> cfg.ExperimentConfig:
"""Semantic segmentation general."""
return cfg.ExperimentConfig(
task=SemanticSegmentationTask(),
trainer=cfg.TrainerConfig(),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
# PASCAL VOC 2012 Dataset
PASCAL_TRAIN_EXAMPLES = 10582
PASCAL_VAL_EXAMPLES = 1449
PASCAL_INPUT_PATH_BASE = 'gs://**/pascal_voc_seg'
@exp_factory.register_config_factory('seg_deeplabv3_pascal')
def seg_deeplabv3_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal voc with resnet deeplabv3."""
train_batch_size = 16
eval_batch_size = 8
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = [12, 24, 36]  # [6, 12, 18] is the standard choice for output_stride == 16.
multigrid = [1, 2, 4]
stem_type = 'v1'
level = int(math.log2(output_stride))  # output_stride 16 -> level 4.
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='dilated_resnet', dilated_resnet=backbones.DilatedResNet(
model_id=101, output_stride=output_stride,
multigrid=multigrid, stem_type=stem_type)),
decoder=decoders.Decoder(
type='aspp', aspp=decoders.ASPP(
level=level, dilation_rates=aspp_dilation_rates)),
head=SegmentationHead(level=level, num_convs=0),
norm_activation=common.NormActivation(
activation='swish',
norm_momentum=0.9997,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
# TODO(arashwan): test changing size to 513 to match deeplab.
output_size=[512, 512],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
output_size=[512, 512],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
# resnet101
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=45 * steps_per_epoch,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007,
'decay_steps': 45 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('seg_deeplabv3plus_pascal')
def seg_deeplabv3plus_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal voc with resnet deeplabv3+."""
train_batch_size = 16
eval_batch_size = 8
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = [6, 12, 18]
multigrid = [1, 2, 4]
stem_type = 'v1'
level = int(math.log2(output_stride))
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='dilated_resnet', dilated_resnet=backbones.DilatedResNet(
model_id=101, output_stride=output_stride,
stem_type=stem_type, multigrid=multigrid)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level, dilation_rates=aspp_dilation_rates)),
head=SegmentationHead(
level=level,
num_convs=2,
feature_fusion='deeplabv3plus',
low_level=2,
low_level_num_filters=48),
norm_activation=common.NormActivation(
activation='swish',
norm_momentum=0.9997,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
output_size=[512, 512],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
output_size=[512, 512],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
# resnet101
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=45 * steps_per_epoch,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007,
'decay_steps': 45 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('seg_resnetfpn_pascal')
def seg_resnetfpn_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal voc with resnet-fpn."""
train_batch_size = 256
eval_batch_size = 32
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[512, 512, 3],
min_level=3,
max_level=7,
backbone=backbones.Backbone(
type='resnet', resnet=backbones.ResNet(model_id=50)),
decoder=decoders.Decoder(type='fpn', fpn=decoders.FPN()),
head=SegmentationHead(level=3, num_convs=3),
norm_activation=common.NormActivation(
activation='swish',
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.2,
aug_scale_max=1.5),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=450 * steps_per_epoch,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007,
'decay_steps': 450 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('mnv2_deeplabv3_pascal')
def mnv2_deeplabv3_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal with mobilenetv2 deeplabv3."""
train_batch_size = 16
eval_batch_size = 16
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = []
level = int(math.log2(output_stride))
pool_kernel_size = []
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='mobilenet',
mobilenet=backbones.MobileNet(
model_id='MobileNetV2', output_stride=output_stride)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level,
dilation_rates=aspp_dilation_rates,
pool_kernel_size=pool_kernel_size)),
head=SegmentationHead(level=level, num_convs=0),
norm_activation=common.NormActivation(
activation='relu',
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=4e-5),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
output_size=[512, 512],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
output_size=[512, 512],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
# mobilenetv2
init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
init_checkpoint_modules=['backbone', 'decoder']),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=30000,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
best_checkpoint_eval_metric='mean_iou',
best_checkpoint_export_subdir='best_ckpt',
best_checkpoint_metric_comp='higher',
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007 * train_batch_size / 16,
'decay_steps': 30000,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
# Cityscapes Dataset (Download and process the dataset yourself)
CITYSCAPES_TRAIN_EXAMPLES = 2975
CITYSCAPES_VAL_EXAMPLES = 500
CITYSCAPES_INPUT_PATH_BASE = 'cityscapes'
@exp_factory.register_config_factory('seg_deeplabv3plus_cityscapes')
def seg_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
"""Image segmentation on cityscapes with resnet deeplabv3+."""
train_batch_size = 16
eval_batch_size = 16
steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = [6, 12, 18]
multigrid = [1, 2, 4]
stem_type = 'v1'
level = int(math.log2(output_stride))
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
# Cityscapes uses only 19 semantic classes for train/evaluation.
# The void (background) class is ignored in train and evaluation.
num_classes=19,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='dilated_resnet', dilated_resnet=backbones.DilatedResNet(
model_id=101, output_stride=output_stride,
stem_type=stem_type, multigrid=multigrid)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level, dilation_rates=aspp_dilation_rates,
pool_kernel_size=[512, 1024])),
head=SegmentationHead(
level=level,
num_convs=2,
feature_fusion='deeplabv3plus',
low_level=2,
low_level_num_filters=48),
norm_activation=common.NormActivation(
activation='swish',
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
'train_fine**'),
crop_size=[512, 1024],
output_size=[1024, 2048],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
output_size=[1024, 2048],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=True,
drop_remainder=False),
# resnet101
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=500 * steps_per_epoch,
validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.01,
'decay_steps': 500 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('mnv2_deeplabv3_cityscapes')
def mnv2_deeplabv3_cityscapes() -> cfg.ExperimentConfig:
"""Image segmentation on cityscapes with mobilenetv2 deeplabv3."""
train_batch_size = 16
eval_batch_size = 16
steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = []
pool_kernel_size = [512, 1024]
level = int(math.log2(output_stride))
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
# Cityscapes uses only 19 semantic classes for train/evaluation.
# The void (background) class is ignored in train and evaluation.
num_classes=19,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='mobilenet',
mobilenet=backbones.MobileNet(
model_id='MobileNetV2', output_stride=output_stride)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level,
dilation_rates=aspp_dilation_rates,
pool_kernel_size=pool_kernel_size)),
head=SegmentationHead(level=level, num_convs=0),
norm_activation=common.NormActivation(
activation='relu',
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=4e-5),
train_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
'train_fine**'),
crop_size=[512, 1024],
output_size=[1024, 2048],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
output_size=[1024, 2048],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=True,
drop_remainder=False),
# COCO pre-trained MobileNetV2 checkpoint.
init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=100000,
validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
best_checkpoint_eval_metric='mean_iou',
best_checkpoint_export_subdir='best_ckpt',
best_checkpoint_metric_comp='higher',
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.01,
'decay_steps': 100000,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('mnv2_deeplabv3plus_cityscapes')
def mnv2_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
"""Image segmentation on cityscapes with mobilenetv2 deeplabv3plus."""
config = mnv2_deeplabv3_cityscapes()
config.task.model.head = SegmentationHead(
level=4,
num_convs=2,
feature_fusion='deeplabv3plus',
use_depthwise_convolution=True,
low_level='2/depthwise',
low_level_num_filters=48)
config.task.model.backbone.mobilenet.output_intermediate_endpoints = True
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for semantic_segmentation."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import semantic_segmentation as exp_cfg
class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(('seg_deeplabv3_pascal',),
('seg_deeplabv3plus_pascal',))
def test_semantic_segmentation_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.SemanticSegmentationTask)
self.assertIsInstance(config.task.model,
exp_cfg.SemanticSegmentationModel)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import backbones_3d
from official.vision.configs import common
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""The base configuration for building datasets."""
name: Optional[str] = None
file_type: Optional[str] = 'tfrecord'
compressed_input: bool = False
split: str = 'train'
variant_name: Optional[str] = None
feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
temporal_stride: int = 1
random_stride_range: int = 0
num_test_clips: int = 1
num_test_crops: int = 1
num_classes: int = -1
num_examples: int = -1
global_batch_size: int = 128
data_format: str = 'channels_last'
dtype: str = 'float32'
one_hot: bool = True
shuffle_buffer_size: int = 64
cache: bool = False
input_path: str = ''
is_training: bool = True
cycle_length: int = 10
drop_remainder: bool = True
min_image_size: int = 256
is_multilabel: bool = False
output_audio: bool = False
audio_feature: str = ''
audio_feature_shape: Tuple[int, ...] = (-1,)
aug_min_aspect_ratio: float = 0.5
aug_max_aspect_ratio: float = 2.0
aug_min_area_ratio: float = 0.49
aug_max_area_ratio: float = 1.0
aug_type: Optional[str] = None # 'autoaug', 'randaug', or None
image_field_key: str = 'image/encoded'
label_field_key: str = 'clip/label/index'
def kinetics400(is_training):
"""Generated Kinectics 400 dataset configs."""
return DataConfig(
name='kinetics400',
num_classes=400,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=215570 if is_training else 17706,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics600(is_training):
"""Generated Kinectics 600 dataset configs."""
return DataConfig(
name='kinetics600',
num_classes=600,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=366016 if is_training else 27780,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700(is_training):
"""Generated Kinectics 600 dataset configs."""
return DataConfig(
name='kinetics700',
num_classes=700,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=522883 if is_training else 33441,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700_2020(is_training):
"""Generated Kinectics 600 dataset configs."""
return DataConfig(
name='kinetics700',
num_classes=700,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=535982 if is_training else 33640,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
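# Usage sketch: these factories feed the registered experiment configs
# below, e.g.
#   train_dataset = kinetics400(is_training=True)
#   validation_dataset = kinetics400(is_training=False)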
@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
"""The model config."""
model_type: str = 'video_classification'
backbone: backbones_3d.Backbone3D = backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50())
norm_activation: common.NormActivation = common.NormActivation(
use_sync_bn=False)
dropout_rate: float = 0.2
aggregate_endpoints: bool = False
require_endpoints: Optional[Tuple[str, ...]] = None
@dataclasses.dataclass
class Losses(hyperparams.Config):
one_hot: bool = True
label_smoothing: float = 0.0
l2_weight_decay: float = 0.0
@dataclasses.dataclass
class Metrics(hyperparams.Config):
use_per_class_recall: bool = False
@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
"""The task config."""
model: VideoClassificationModel = VideoClassificationModel()
train_data: DataConfig = DataConfig(is_training=True, drop_remainder=True)
validation_data: DataConfig = DataConfig(
is_training=False, drop_remainder=False)
losses: Losses = Losses()
metrics: Metrics = Metrics()
init_checkpoint: Optional[str] = None
init_checkpoint_modules: str = 'all' # all or backbone
# Spatial Partitioning fields.
train_input_partition_dims: Optional[Tuple[int, ...]] = None
eval_input_partition_dims: Optional[Tuple[int, ...]] = None
def add_trainer(experiment: cfg.ExperimentConfig,
train_batch_size: int,
eval_batch_size: int,
learning_rate: float = 1.6,
train_epochs: int = 44,
warmup_epochs: int = 5):
"""Add and config a trainer to the experiment config."""
if experiment.task.train_data.num_examples <= 0:
raise ValueError('Wrong train dataset size {!r}'.format(
experiment.task.train_data))
if experiment.task.validation_data.num_examples <= 0:
raise ValueError('Wrong validation dataset size {!r}'.format(
experiment.task.validation_data))
experiment.task.train_data.global_batch_size = train_batch_size
experiment.task.validation_data.global_batch_size = eval_batch_size
steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
experiment.trainer = cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=train_epochs * steps_per_epoch,
validation_steps=experiment.task.validation_data.num_examples //
eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9,
'nesterov': True,
}
},
'learning_rate': {
'type': 'cosine',
'cosine': {
'initial_learning_rate': learning_rate,
'decay_steps': train_epochs * steps_per_epoch,
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': warmup_epochs * steps_per_epoch,
'warmup_learning_rate': 0
}
}
}))
return experiment
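# Usage sketch, mirroring the registered factories below: the task's
# datasets must carry positive `num_examples`, since steps per epoch and
# validation steps are derived from them, e.g.
#   add_trainer(config, train_batch_size=1024, eval_batch_size=64)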
@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
"""Video classification general."""
return cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=VideoClassificationTask(),
trainer=cfg.TrainerConfig(),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
"""Video classification on UCF-101 with resnet."""
train_dataset = DataConfig(
name='ucf101',
num_classes=101,
is_training=True,
split='train',
drop_remainder=True,
num_examples=9537,
temporal_stride=2,
feature_shape=(32, 224, 224, 3))
train_dataset.tfds_name = 'ucf101'
train_dataset.tfds_split = 'train'
validation_dataset = DataConfig(
name='ucf101',
num_classes=101,
      is_training=False,
split='test',
drop_remainder=False,
num_examples=3783,
temporal_stride=2,
feature_shape=(32, 224, 224, 3))
validation_dataset.tfds_name = 'ucf101'
validation_dataset.tfds_split = 'test'
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(
config,
train_batch_size=64,
eval_batch_size=16,
learning_rate=0.8,
train_epochs=100)
return config
@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 400 with resnet."""
train_dataset = kinetics400(is_training=True)
validation_dataset = kinetics400(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 600 with resnet."""
train_dataset = kinetics600(is_training=True)
validation_dataset = kinetics600(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 700 with resnet."""
train_dataset = kinetics700(is_training=True)
validation_dataset = kinetics700(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 700 2020 with resnet."""
train_dataset = kinetics700_2020(is_training=True)
validation_dataset = kinetics700_2020(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import video_classification as exp_cfg
class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(('video_classification',),
('video_classification_kinetics600',))
def test_video_classification_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.VideoClassificationTask)
self.assertIsInstance(config.task.model, exp_cfg.VideoClassificationModel)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Convert raw COCO dataset to TFRecord format.
This script follows the label map decoder format and supports detection
boxes, instance masks and captions.
Example usage:
python create_coco_tf_record.py --logtostderr \
--image_dir="${TRAIN_IMAGE_DIR}" \
--image_info_file="${TRAIN_IMAGE_INFO_FILE}" \
--object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
--output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
--num_shards=100
"""
import collections
import json
import logging
import os
from absl import app # pylint:disable=unused-import
from absl import flags
import numpy as np
from pycocotools import mask
import tensorflow as tf
from official.vision.data import tfrecord_lib
flags.DEFINE_boolean(
    'include_masks', False, 'Whether to include instance segmentation masks '
    '(PNG encoded) in the result. Default: False.')
flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
flags.DEFINE_string(
'image_info_file', '', 'File containing image information. '
    'tf.Examples in the output files correspond to the image '
'info entries in this file. If this file is not provided '
'object_annotations_file is used if present. Otherwise, '
'caption_annotations_file is used to get image info.')
flags.DEFINE_string(
'object_annotations_file', '', 'File containing object '
'annotations - boxes and instance masks.')
flags.DEFINE_string('caption_annotations_file', '', 'File containing image '
'captions.')
flags.DEFINE_string('panoptic_annotations_file', '', 'File containing panoptic '
'annotations.')
flags.DEFINE_string('panoptic_masks_dir', '',
'Directory containing panoptic masks annotations.')
flags.DEFINE_boolean(
    'include_panoptic_masks', False, 'Whether to include category and '
    'instance masks in the result. These are required to run the PQ '
    'evaluator. Default: False.')
flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file')
flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.')
FLAGS = flags.FLAGS
logger = tf.get_logger()
logger.setLevel(logging.INFO)
_VOID_LABEL = 0
_VOID_INSTANCE_ID = 0
_THING_CLASS_ID = 1
_STUFF_CLASSES_OFFSET = 90
def coco_segmentation_to_mask_png(segmentation, height, width, is_crowd):
"""Encode a COCO mask segmentation as PNG string."""
run_len_encoding = mask.frPyObjects(segmentation, height, width)
binary_mask = mask.decode(run_len_encoding)
if not is_crowd:
binary_mask = np.amax(binary_mask, axis=2)
return tfrecord_lib.encode_mask_as_png(binary_mask)
def generate_coco_panoptics_masks(segments_info, mask_path,
include_panoptic_masks,
is_category_thing):
"""Creates masks for panoptic segmentation task.
Args:
segments_info: a list of dicts, where each dict has keys: [u'id',
u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for
each segment in the panoptic mask.
mask_path: path to the panoptic mask.
include_panoptic_masks: bool, when set to True, category and instance
masks are included in the outputs. Set this to True, when using
the Panoptic Quality evaluator.
    is_category_thing: a dict with category ids as keys and 0/1 as values,
      representing "stuff" and "thing" classes respectively.
Returns:
    A dict with keys: [u'semantic_segmentation_mask', u'category_mask',
    u'instance_mask']. The dict contains 'category_mask' and 'instance_mask'
    only if `include_panoptic_masks` is set to True.
"""
rgb_mask = tfrecord_lib.read_image(mask_path)
r, g, b = np.split(rgb_mask, 3, axis=-1)
# decode rgb encoded panoptic mask to get segments ids
# refer https://cocodataset.org/#format-data
segments_encoded_mask = (r + g * 256 + b * (256**2)).squeeze()
semantic_segmentation_mask = np.ones_like(
segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL
if include_panoptic_masks:
category_mask = np.ones_like(
segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL
instance_mask = np.ones_like(
segments_encoded_mask, dtype=np.uint8) * _VOID_INSTANCE_ID
for idx, segment in enumerate(segments_info):
segment_id = segment['id']
category_id = segment['category_id']
if is_category_thing[category_id]:
encoded_category_id = _THING_CLASS_ID
instance_id = idx + 1
else:
encoded_category_id = category_id - _STUFF_CLASSES_OFFSET
instance_id = _VOID_INSTANCE_ID
segment_mask = (segments_encoded_mask == segment_id)
semantic_segmentation_mask[segment_mask] = encoded_category_id
if include_panoptic_masks:
category_mask[segment_mask] = category_id
instance_mask[segment_mask] = instance_id
outputs = {
'semantic_segmentation_mask': tfrecord_lib.encode_mask_as_png(
semantic_segmentation_mask)
}
if include_panoptic_masks:
outputs.update({
'category_mask': tfrecord_lib.encode_mask_as_png(category_mask),
'instance_mask': tfrecord_lib.encode_mask_as_png(instance_mask)
})
return outputs
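# Worked example of the id decoding above: a pixel with RGB (17, 9, 0)
# encodes segment id 17 + 9 * 256 + 0 * 256**2 = 2321. If that segment's
# category is a "thing", the pixel receives _THING_CLASS_ID and a nonzero
# instance id; otherwise its category id is shifted down by
# _STUFF_CLASSES_OFFSET and its instance id stays _VOID_INSTANCE_ID.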
def coco_annotations_to_lists(bbox_annotations, id_to_name_map,
image_height, image_width, include_masks):
"""Converts COCO annotations to feature lists."""
data = dict((k, list()) for k in
['xmin', 'xmax', 'ymin', 'ymax', 'is_crowd',
'category_id', 'category_names', 'area'])
if include_masks:
data['encoded_mask_png'] = []
num_annotations_skipped = 0
for object_annotations in bbox_annotations:
(x, y, width, height) = tuple(object_annotations['bbox'])
if width <= 0 or height <= 0:
num_annotations_skipped += 1
continue
if x + width > image_width or y + height > image_height:
num_annotations_skipped += 1
continue
data['xmin'].append(float(x) / image_width)
data['xmax'].append(float(x + width) / image_width)
data['ymin'].append(float(y) / image_height)
data['ymax'].append(float(y + height) / image_height)
data['is_crowd'].append(object_annotations['iscrowd'])
category_id = int(object_annotations['category_id'])
data['category_id'].append(category_id)
data['category_names'].append(id_to_name_map[category_id].encode('utf8'))
data['area'].append(object_annotations['area'])
if include_masks:
data['encoded_mask_png'].append(
coco_segmentation_to_mask_png(object_annotations['segmentation'],
image_height, image_width,
object_annotations['iscrowd'])
)
return data, num_annotations_skipped
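# Worked example of the normalization above: a COCO box
# [x=20, y=40, width=60, height=80] on a 200 (wide) x 400 (tall) image
# yields xmin=0.1, xmax=0.4, ymin=0.1, ymax=0.3.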
def bbox_annotations_to_feature_dict(
bbox_annotations, image_height, image_width, id_to_name_map, include_masks):
"""Convert COCO annotations to an encoded feature dict."""
data, num_skipped = coco_annotations_to_lists(
bbox_annotations, id_to_name_map, image_height, image_width,
include_masks)
feature_dict = {
'image/object/bbox/xmin':
tfrecord_lib.convert_to_feature(data['xmin']),
'image/object/bbox/xmax':
tfrecord_lib.convert_to_feature(data['xmax']),
'image/object/bbox/ymin':
tfrecord_lib.convert_to_feature(data['ymin']),
'image/object/bbox/ymax':
tfrecord_lib.convert_to_feature(data['ymax']),
'image/object/class/text':
tfrecord_lib.convert_to_feature(data['category_names']),
'image/object/class/label':
tfrecord_lib.convert_to_feature(data['category_id']),
'image/object/is_crowd':
tfrecord_lib.convert_to_feature(data['is_crowd']),
'image/object/area':
tfrecord_lib.convert_to_feature(data['area']),
}
if include_masks:
feature_dict['image/object/mask'] = (
tfrecord_lib.convert_to_feature(data['encoded_mask_png']))
return feature_dict, num_skipped
def encode_caption_annotations(caption_annotations):
captions = []
for caption_annotation in caption_annotations:
captions.append(caption_annotation['caption'].encode('utf8'))
return captions
def create_tf_example(image,
image_dirs,
panoptic_masks_dir=None,
bbox_annotations=None,
id_to_name_map=None,
caption_annotations=None,
panoptic_annotation=None,
is_category_thing=None,
include_panoptic_masks=False,
include_masks=False):
"""Converts image and annotations to a tf.Example proto.
Args:
image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
u'width', u'date_captured', u'flickr_url', u'id']
image_dirs: list of directories containing the image files.
panoptic_masks_dir: `str` of the panoptic masks directory.
bbox_annotations:
list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
coordinates in the official COCO dataset are given as [x, y, width,
height] tuples using absolute coordinates where x, y represent the
top-left (0-indexed) corner. This function converts to the format
      expected by the Tensorflow Object Detection API (which is
[ymin, xmin, ymax, xmax] with coordinates normalized relative to image
size).
id_to_name_map: a dict mapping category IDs to string names.
    caption_annotations:
      list of dicts with keys: [u'id', u'image_id', u'caption'].
panoptic_annotation: dict with keys: [u'image_id', u'file_name',
u'segments_info']. Where the value for segments_info is a list of dicts,
with each dict containing information for a single segment in the mask.
    is_category_thing: a dict with category ids as keys and booleans as
      values, indicating whether each category is a "thing" class.
include_panoptic_masks: `bool`, whether to include panoptic masks.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
Returns:
example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
does not exist, or is not unique across image directories.
"""
image_height = image['height']
image_width = image['width']
filename = image['file_name']
image_id = image['id']
if len(image_dirs) > 1:
full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs]
full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
if not full_existing_paths:
raise ValueError(
'{} does not exist across image directories.'.format(filename))
if len(full_existing_paths) > 1:
raise ValueError(
'{} is not unique across image directories'.format(filename))
full_path, = full_existing_paths
# If there is only one image directory, it's not worth checking for existence,
# since trying to open the file will raise an informative error message if it
# does not exist.
else:
image_dir, = image_dirs
full_path = os.path.join(image_dir, filename)
with tf.io.gfile.GFile(full_path, 'rb') as fid:
encoded_jpg = fid.read()
feature_dict = tfrecord_lib.image_info_to_feature_dict(
image_height, image_width, filename, image_id, encoded_jpg, 'jpg')
num_annotations_skipped = 0
if bbox_annotations:
box_feature_dict, num_skipped = bbox_annotations_to_feature_dict(
bbox_annotations, image_height, image_width, id_to_name_map,
include_masks)
num_annotations_skipped += num_skipped
feature_dict.update(box_feature_dict)
if caption_annotations:
encoded_captions = encode_caption_annotations(caption_annotations)
feature_dict.update(
{'image/caption': tfrecord_lib.convert_to_feature(encoded_captions)})
if panoptic_annotation:
segments_info = panoptic_annotation['segments_info']
panoptic_mask_filename = os.path.join(
panoptic_masks_dir,
panoptic_annotation['file_name'])
encoded_panoptic_masks = generate_coco_panoptics_masks(
segments_info, panoptic_mask_filename, include_panoptic_masks,
is_category_thing)
feature_dict.update(
{'image/segmentation/class/encoded': tfrecord_lib.convert_to_feature(
encoded_panoptic_masks['semantic_segmentation_mask'])})
if include_panoptic_masks:
feature_dict.update({
'image/panoptic/category_mask': tfrecord_lib.convert_to_feature(
encoded_panoptic_masks['category_mask']),
'image/panoptic/instance_mask': tfrecord_lib.convert_to_feature(
encoded_panoptic_masks['instance_mask'])
})
example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
return example, num_annotations_skipped
def _load_object_annotations(object_annotations_file):
"""Loads object annotation JSON file."""
with tf.io.gfile.GFile(object_annotations_file, 'r') as fid:
obj_annotations = json.load(fid)
images = obj_annotations['images']
id_to_name_map = dict((element['id'], element['name']) for element in
obj_annotations['categories'])
img_to_obj_annotation = collections.defaultdict(list)
logging.info('Building bounding box index.')
for annotation in obj_annotations['annotations']:
image_id = annotation['image_id']
img_to_obj_annotation[image_id].append(annotation)
missing_annotation_count = 0
for image in images:
image_id = image['id']
if image_id not in img_to_obj_annotation:
missing_annotation_count += 1
logging.info('%d images are missing bboxes.', missing_annotation_count)
return img_to_obj_annotation, id_to_name_map
def _load_caption_annotations(caption_annotations_file):
"""Loads caption annotation JSON file."""
with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
caption_annotations = json.load(fid)
img_to_caption_annotation = collections.defaultdict(list)
logging.info('Building caption index.')
for annotation in caption_annotations['annotations']:
image_id = annotation['image_id']
img_to_caption_annotation[image_id].append(annotation)
missing_annotation_count = 0
images = caption_annotations['images']
for image in images:
image_id = image['id']
if image_id not in img_to_caption_annotation:
missing_annotation_count += 1
logging.info('%d images are missing captions.', missing_annotation_count)
return img_to_caption_annotation
def _load_panoptic_annotations(panoptic_annotations_file):
"""Loads panoptic annotation from file."""
with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid:
panoptic_annotations = json.load(fid)
img_to_panoptic_annotation = dict()
logging.info('Building panoptic index.')
for annotation in panoptic_annotations['annotations']:
image_id = annotation['image_id']
img_to_panoptic_annotation[image_id] = annotation
is_category_thing = dict()
for category_info in panoptic_annotations['categories']:
is_category_thing[category_info['id']] = category_info['isthing'] == 1
missing_annotation_count = 0
images = panoptic_annotations['images']
for image in images:
image_id = image['id']
if image_id not in img_to_panoptic_annotation:
missing_annotation_count += 1
logging.info(
'%d images are missing panoptic annotations.', missing_annotation_count)
return img_to_panoptic_annotation, is_category_thing
def _load_images_info(images_info_file):
with tf.io.gfile.GFile(images_info_file, 'r') as fid:
info_dict = json.load(fid)
return info_dict['images']
def generate_annotations(images, image_dirs,
panoptic_masks_dir=None,
img_to_obj_annotation=None,
img_to_caption_annotation=None,
img_to_panoptic_annotation=None,
is_category_thing=None,
id_to_name_map=None,
include_panoptic_masks=False,
include_masks=False):
"""Generator for COCO annotations."""
for image in images:
object_annotation = (img_to_obj_annotation.get(image['id'], None) if
img_to_obj_annotation else None)
    caption_annotation = (img_to_caption_annotation.get(image['id'], None) if
img_to_caption_annotation else None)
panoptic_annotation = (img_to_panoptic_annotation.get(image['id'], None) if
img_to_panoptic_annotation else None)
yield (image, image_dirs, panoptic_masks_dir, object_annotation,
           id_to_name_map, caption_annotation, panoptic_annotation,
is_category_thing, include_panoptic_masks, include_masks)
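# Note: the tuple order yielded above matches create_tf_example's
# positional signature, so write_tf_record_dataset (whose default is
# unpack_arguments=True) can starmap each tuple directly into
# create_tf_example.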
def _create_tf_record_from_coco_annotations(images_info_file,
image_dirs,
output_path,
num_shards,
object_annotations_file=None,
caption_annotations_file=None,
panoptic_masks_dir=None,
panoptic_annotations_file=None,
include_panoptic_masks=False,
include_masks=False):
"""Loads COCO annotation json files and converts to tf.Record format.
Args:
    images_info_file: JSON file containing image info. The number of
      tf.Examples in the output TFRecord files is exactly equal to the number
      of image info entries in this file. This can be any of the
      train/val/test annotation JSON files, e.g.
      'image_info_test-dev2017.json', 'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
image_dirs: List of directories containing the image files.
output_path: Path to output tf.Record file.
num_shards: Number of output files to create.
object_annotations_file: JSON file containing bounding box annotations.
caption_annotations_file: JSON file containing caption annotations.
panoptic_masks_dir: Directory containing panoptic masks.
panoptic_annotations_file: JSON file containing panoptic annotations.
    include_panoptic_masks: Whether to include 'category_mask'
      and 'instance_mask', which are required by the panoptic quality
      evaluator.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
"""
logging.info('writing to output path: %s', output_path)
images = _load_images_info(images_info_file)
img_to_obj_annotation = None
img_to_caption_annotation = None
id_to_name_map = None
img_to_panoptic_annotation = None
is_category_thing = None
if object_annotations_file:
img_to_obj_annotation, id_to_name_map = (
_load_object_annotations(object_annotations_file))
if caption_annotations_file:
img_to_caption_annotation = (
_load_caption_annotations(caption_annotations_file))
if panoptic_annotations_file:
img_to_panoptic_annotation, is_category_thing = (
_load_panoptic_annotations(panoptic_annotations_file))
coco_annotations_iter = generate_annotations(
images=images,
image_dirs=image_dirs,
panoptic_masks_dir=panoptic_masks_dir,
img_to_obj_annotation=img_to_obj_annotation,
img_to_caption_annotation=img_to_caption_annotation,
img_to_panoptic_annotation=img_to_panoptic_annotation,
is_category_thing=is_category_thing,
id_to_name_map=id_to_name_map,
include_panoptic_masks=include_panoptic_masks,
include_masks=include_masks)
num_skipped = tfrecord_lib.write_tf_record_dataset(
output_path, coco_annotations_iter, create_tf_example, num_shards)
logging.info('Finished writing, skipped %d annotations.', num_skipped)
def main(_):
assert FLAGS.image_dir, '`image_dir` missing.'
assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
FLAGS.caption_annotations_file), ('All annotation files are '
'missing.')
if FLAGS.image_info_file:
images_info_file = FLAGS.image_info_file
elif FLAGS.object_annotations_file:
images_info_file = FLAGS.object_annotations_file
else:
images_info_file = FLAGS.caption_annotations_file
directory = os.path.dirname(FLAGS.output_file_prefix)
if not tf.io.gfile.isdir(directory):
tf.io.gfile.makedirs(directory)
_create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
FLAGS.output_file_prefix,
FLAGS.num_shards,
FLAGS.object_annotations_file,
FLAGS.caption_annotations_file,
FLAGS.panoptic_masks_dir,
FLAGS.panoptic_annotations_file,
FLAGS.include_panoptic_masks,
FLAGS.include_masks)
if __name__ == '__main__':
app.run(main)
#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.
tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
base_image_dir="/tmp/coco_images"
output_dir="/tmp/coco_few_shot"
while getopts ":i:o:" o; do
case "${o}" in
o) output_dir=${OPTARG} ;;
i) base_image_dir=${OPTARG} ;;
*) echo "Usage: ${0} [-i <base_image_dir>] [-o <output_dir>]" 1>&2; exit 1 ;;
esac
done
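# Example invocation (the script filename here is a placeholder):
#   bash process_coco_few_shot.sh -i /tmp/coco_images -o /tmp/coco_few_shot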
cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
-P "${tmp_dir}" -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
"http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"
python process_coco_few_shot_json_files.py \
--logtostderr --workdir="${tmp_dir}"
for seed in {0..9}; do
for shots in 1 3 5 10 30; do
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
--object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
--num_shards=4
done
done
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/datasplit/5k.json" \
--object_annotations_file="${tmp_dir}/datasplit/5k.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/5k" \
--num_shards=10
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
--object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/trainvalno5k_base" \
--num_shards=200
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/datasplit/5k_base.json" \
--object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/5k_base" \
--num_shards=10
rm -rf "${tmp_dir}"
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import collections
import itertools
import json
import logging
import os
from absl import app
from absl import flags
import tensorflow as tf
logger = tf.get_logger()
logger.setLevel(logging.INFO)
flags.DEFINE_string('workdir', None, 'Working directory.')
FLAGS = flags.FLAGS
CATEGORIES = ['airplane', 'apple', 'backpack', 'banana', 'baseball bat',
'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird',
'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake',
'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch',
'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier',
'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife',
'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven',
'parking meter', 'person', 'pizza', 'potted plant',
'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
'sink', 'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball',
'stop sign', 'suitcase', 'surfboard', 'teddy bear',
'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush',
'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
'wine glass', 'zebra']
SEEDS = list(range(10))
SHOTS = [1, 3, 5, 10, 30]
FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
for _category in CATEGORIES:
FILE_SUFFIXES[(_seed, _shots)].append(
'{}full_box_{}shot_{}_trainval.json'.format(
# http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
#
# datasplit/
# trainvalno5k.json
# 5k.json
# full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
# seed{1-9}/
# full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#
# This means that the JSON files for seed0 are located in the root
# directory rather than in a `seed?/` subdirectory, hence the
# conditional expression below.
'' if _seed == 0 else 'seed{}/'.format(_seed),
_shots,
_category))
# Base class IDs, as defined in
# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65
BASE_CLASS_IDS = [8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 73, 74, 75,
76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
def main(unused_argv):
workdir = FLAGS.workdir
# Filter novel class annotations from the training and validation sets.
for name in ('trainvalno5k', '5k'):
file_path = os.path.join(workdir, 'datasplit', '{}.json'.format(name))
with tf.io.gfile.GFile(file_path, 'r') as f:
json_dict = json.load(f)
json_dict['annotations'] = [a for a in json_dict['annotations']
if a['category_id'] in BASE_CLASS_IDS]
output_path = os.path.join(
workdir, 'datasplit', '{}_base.json'.format(name))
with tf.io.gfile.GFile(output_path, 'w') as f:
json.dump(json_dict, f)
for seed, shots in itertools.product(SEEDS, SHOTS):
# Retrieve all examples for a given seed and shots setting.
file_paths = [os.path.join(workdir, suffix)
for suffix in FILE_SUFFIXES[(seed, shots)]]
json_dicts = []
for file_path in file_paths:
with tf.io.gfile.GFile(file_path, 'r') as f:
json_dicts.append(json.load(f))
# Make sure that all JSON files for a given seed and shots setting have the
# same metadata. We count on this to fuse them later on.
metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
'categories': d['categories']} for d in json_dicts]
if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
raise RuntimeError(
'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licenses, or categories fields')
# Retrieve images across all JSON files.
images = sum((d['images'] for d in json_dicts), [])
# Remove duplicate image entries.
images = list({image['id']: image for image in images}.values())
output_dict = {
'info': json_dicts[0]['info'],
'licenses': json_dicts[0]['licenses'],
'categories': json_dicts[0]['categories'],
'images': images,
'annotations': sum((d['annotations'] for d in json_dicts), [])
}
output_path = os.path.join(workdir,
'{}shot_seed{}.json'.format(shots, seed))
with tf.io.gfile.GFile(output_path, 'w') as f:
json.dump(output_dict, f)
logger.info('Processed %d shots (seed %d) and saved to %s',
shots, seed, output_path)
if __name__ == '__main__':
flags.mark_flag_as_required('workdir')
app.run(main)
#!/bin/bash
sudo apt update
sudo apt install unzip aria2 -y
DATA_DIR=$1
aria2c -j 8 -Z \
http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
http://images.cocodataset.org/zips/train2017.zip \
http://images.cocodataset.org/zips/val2017.zip \
--dir=$DATA_DIR;
unzip $DATA_DIR/"*".zip -d $DATA_DIR;
mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips;
unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR
unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR
python3 official/vision/beta/data/create_coco_tf_record.py \
--logtostderr \
--image_dir="$DATA_DIR/val2017" \
--object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
--output_file_prefix="$DATA_DIR/tfrecords/val" \
--panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
--panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
--num_shards=8 \
--include_masks \
--include_panoptic_masks
python3 official/vision/beta/data/create_coco_tf_record.py \
--logtostderr \
--image_dir="$DATA_DIR/train2017" \
--object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
--output_file_prefix="$DATA_DIR/tfrecords/train" \
--panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
--panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
--num_shards=32 \
--include_masks \
--include_panoptic_masks
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating TFRecord datasets."""
import hashlib
import io
import itertools
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
import multiprocessing as mp
def convert_to_feature(value, value_type=None):
"""Converts the given python object to a tf.train.Feature.
Args:
value: int, float, bytes or a list of them.
value_type: optional, if specified, forces the feature to be of the given
type. Otherwise, type is inferred automatically. Can be one of
['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list']
Returns:
feature: A tf.train.Feature object.
"""
if value_type is None:
element = value[0] if isinstance(value, list) else value
if isinstance(element, bytes):
value_type = 'bytes'
elif isinstance(element, (int, np.integer)):
value_type = 'int64'
elif isinstance(element, (float, np.floating)):
value_type = 'float'
else:
raise ValueError('Cannot convert type {} to feature'.
format(type(element)))
if isinstance(value, list):
value_type = value_type + '_list'
if value_type == 'int64':
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
elif value_type == 'int64_list':
value = np.asarray(value).astype(np.int64).reshape(-1)
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
elif value_type == 'float':
return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
elif value_type == 'float_list':
value = np.asarray(value).astype(np.float32).reshape(-1)
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
elif value_type == 'bytes':
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
elif value_type == 'bytes_list':
return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
else:
raise ValueError('Unknown value_type parameter - {}'.format(value_type))
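# Usage sketch (type is inferred from the value unless value_type is given):
#   convert_to_feature(7)          # -> int64 Feature
#   convert_to_feature(0.5)        # -> float Feature
#   convert_to_feature(b'abc')     # -> bytes Feature
#   convert_to_feature([1, 2, 3])  # -> int64_list Feature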
def image_info_to_feature_dict(height, width, filename, image_id,
encoded_str, encoded_format):
"""Convert image information to a dict of features."""
key = hashlib.sha256(encoded_str).hexdigest()
return {
'image/height': convert_to_feature(height),
'image/width': convert_to_feature(width),
'image/filename': convert_to_feature(filename.encode('utf8')),
'image/source_id': convert_to_feature(str(image_id).encode('utf8')),
'image/key/sha256': convert_to_feature(key.encode('utf8')),
'image/encoded': convert_to_feature(encoded_str),
'image/format': convert_to_feature(encoded_format.encode('utf8')),
}
def read_image(image_path):
pil_image = Image.open(image_path)
return np.asarray(pil_image)
def encode_mask_as_png(mask):
pil_image = Image.fromarray(mask)
output_io = io.BytesIO()
pil_image.save(output_io, format='PNG')
return output_io.getvalue()
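# Round-trip sketch (hypothetical 4x4 mask; assumes the numpy/TF imports
# above):
#   png_bytes = encode_mask_as_png(np.zeros((4, 4), dtype=np.uint8))
#   decoded = tf.io.decode_png(png_bytes)  # shape (4, 4, 1), all zeros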
def write_tf_record_dataset(output_path, annotation_iterator,
process_func, num_shards,
use_multiprocessing=True, unpack_arguments=True):
"""Iterates over annotations, processes them and writes into TFRecords.
Args:
output_path: The prefix path to create TF record files.
annotation_iterator: An iterator of tuples containing details about the
dataset.
process_func: A function which takes the elements from the tuples of
annotation_iterator as arguments and returns a tuple of (tf.train.Example,
int). The integer indicates the number of annotations that were skipped.
num_shards: int, the number of shards to write for the dataset.
use_multiprocessing:
Whether or not to use multiple processes to write TF Records.
unpack_arguments:
Whether to unpack the tuples from annotation_iterator as individual
arguments to the process func or to pass the returned value as it is.
Returns:
num_skipped: The total number of skipped annotations.
"""
writers = [
tf.io.TFRecordWriter(
output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
for i in range(num_shards)
]
total_num_annotations_skipped = 0
if use_multiprocessing:
pool = mp.Pool()
if unpack_arguments:
tf_example_iterator = pool.starmap(process_func, annotation_iterator)
else:
tf_example_iterator = pool.imap(process_func, annotation_iterator)
else:
if unpack_arguments:
tf_example_iterator = itertools.starmap(process_func, annotation_iterator)
else:
tf_example_iterator = map(process_func, annotation_iterator)
for idx, (tf_example, num_annotations_skipped) in enumerate(
tf_example_iterator):
if idx % 100 == 0:
logging.info('On image %d', idx)
total_num_annotations_skipped += num_annotations_skipped
writers[idx % num_shards].write(tf_example.SerializeToString())
if use_multiprocessing:
pool.close()
pool.join()
for writer in writers:
writer.close()
logging.info('Finished writing, skipped %d annotations.',
total_num_annotations_skipped)
return total_num_annotations_skipped
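# Usage sketch, mirroring the unit test in tfrecord_lib_test.py:
#   annotations = [(convert_to_feature(i),) for i in range(17)]
#   write_tf_record_dataset('/tmp/train', annotations, process_fn,
#                           num_shards=3, use_multiprocessing=False)
# where the (hypothetical) process_fn receives each unpacked tuple and
# returns a (tf.train.Example, num_annotations_skipped) pair.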
def check_and_make_dir(directory):
"""Creates the directory if it doesn't exist."""
if not tf.io.gfile.isdir(directory):
tf.io.gfile.makedirs(directory)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_lib."""
import os
from absl import flags
from absl.testing import parameterized
import tensorflow as tf
from official.vision.data import tfrecord_lib
FLAGS = flags.FLAGS
def process_sample(x):
d = {'x': x}
return tf.train.Example(features=tf.train.Features(feature=d)), 0
def parse_function(example_proto):
feature_description = {
'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
}
return tf.io.parse_single_example(example_proto, feature_description)
class TfrecordLibTest(parameterized.TestCase):
def test_write_tf_record_dataset(self):
data = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)]
path = os.path.join(FLAGS.test_tmpdir, 'train')
tfrecord_lib.write_tf_record_dataset(
path, data, process_sample, 3, use_multiprocessing=False)
tfrecord_files = tf.io.gfile.glob(path + '*')
self.assertLen(tfrecord_files, 3)
dataset = tf.data.TFRecordDataset(tfrecord_files)
dataset = dataset.map(parse_function)
read_values = set(d['x'] for d in dataset.as_numpy_iterator())
self.assertSetEqual(read_values, set(range(17)))
def test_convert_to_feature_float(self):
proto = tfrecord_lib.convert_to_feature(0.0)
self.assertEqual(proto.float_list.value[0], 0.0)
def test_convert_to_feature_int(self):
proto = tfrecord_lib.convert_to_feature(0)
self.assertEqual(proto.int64_list.value[0], 0)
def test_convert_to_feature_bytes(self):
proto = tfrecord_lib.convert_to_feature(b'123')
self.assertEqual(proto.bytes_list.value[0], b'123')
def test_convert_to_feature_float_list(self):
proto = tfrecord_lib.convert_to_feature([0.0, 1.0])
self.assertSequenceAlmostEqual(proto.float_list.value, [0.0, 1.0])
def test_convert_to_feature_int_list(self):
proto = tfrecord_lib.convert_to_feature([0, 1])
self.assertSequenceAlmostEqual(proto.int64_list.value, [0, 1])
def test_convert_to_feature_bytes_list(self):
proto = tfrecord_lib.convert_to_feature([b'123', b'456'])
    self.assertSequenceEqual(proto.bytes_list.value, [b'123', b'456'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
from typing import Any, Dict, List, Optional
# Import libraries
import tensorflow as tf
from official.vision.configs import common
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import augment
from official.vision.ops import preprocess_ops
MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
DEFAULT_IMAGE_FIELD_KEY = 'image/encoded'
DEFAULT_LABEL_FIELD_KEY = 'image/class/label'
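# These are the standard ImageNet per-channel statistics rescaled from
# [0, 1] to [0, 255]; preprocess_ops.normalize_image below uses them to
# compute (image - MEAN_RGB) / STDDEV_RGB, so e.g. a red-channel value of
# 123.675 (= 0.485 * 255) normalizes to exactly 0.0.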
class Decoder(decoder.Decoder):
"""A tf.Example decoder for classification task."""
def __init__(self,
image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
is_multilabel: bool = False,
keys_to_features: Optional[Dict[str, Any]] = None):
if not keys_to_features:
keys_to_features = {
image_field_key:
tf.io.FixedLenFeature((), tf.string, default_value=''),
}
if is_multilabel:
keys_to_features.update(
{label_field_key: tf.io.VarLenFeature(dtype=tf.int64)})
else:
keys_to_features.update({
label_field_key:
tf.io.FixedLenFeature((), tf.int64, default_value=-1)
})
self._keys_to_features = keys_to_features
def decode(self, serialized_example):
return tf.io.parse_single_example(
serialized_example, self._keys_to_features)
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size: List[int],
               num_classes: int,
image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
decode_jpeg_only: bool = True,
aug_rand_hflip: bool = True,
aug_type: Optional[common.Augmentation] = None,
color_jitter: float = 0.,
random_erasing: Optional[common.RandomErasing] = None,
is_multilabel: bool = False,
dtype: str = 'float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      num_classes: `int`, number of classes.
image_field_key: `str`, the key name to encoded image in tf.Example.
label_field_key: `str`, the key name to label in tf.Example.
decode_jpeg_only: `bool`, if True, only JPEG format is decoded, this is
faster than decoding other types. Default is True.
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
aug_type: An optional Augmentation object to choose from AutoAugment and
RandAugment.
color_jitter: Magnitude of color jitter. If > 0, the value is used to
generate random scale factor for brightness, contrast and saturation.
See `preprocess_ops.color_jitter` for more details.
random_erasing: if not None, augment input image by random erasing. See
`augment.RandomErasing` for more details.
is_multilabel: A `bool`, whether or not each example has multiple labels.
dtype: `str`, cast output image in dtype. It can be 'float32', 'float16',
or 'bfloat16'.
"""
self._output_size = output_size
self._aug_rand_hflip = aug_rand_hflip
self._num_classes = num_classes
self._image_field_key = image_field_key
if dtype == 'float32':
self._dtype = tf.float32
elif dtype == 'float16':
self._dtype = tf.float16
elif dtype == 'bfloat16':
self._dtype = tf.bfloat16
else:
raise ValueError('dtype {!r} is not supported!'.format(dtype))
if aug_type:
if aug_type.type == 'autoaug':
self._augmenter = augment.AutoAugment(
augmentation_name=aug_type.autoaug.augmentation_name,
cutout_const=aug_type.autoaug.cutout_const,
translate_const=aug_type.autoaug.translate_const)
elif aug_type.type == 'randaug':
self._augmenter = augment.RandAugment(
num_layers=aug_type.randaug.num_layers,
magnitude=aug_type.randaug.magnitude,
cutout_const=aug_type.randaug.cutout_const,
translate_const=aug_type.randaug.translate_const,
prob_to_apply=aug_type.randaug.prob_to_apply,
exclude_ops=aug_type.randaug.exclude_ops)
else:
raise ValueError('Augmentation policy {} not supported.'.format(
aug_type.type))
else:
self._augmenter = None
self._label_field_key = label_field_key
self._color_jitter = color_jitter
if random_erasing:
self._random_erasing = augment.RandomErasing(
probability=random_erasing.probability,
min_area=random_erasing.min_area,
max_area=random_erasing.max_area,
min_aspect=random_erasing.min_aspect,
max_aspect=random_erasing.max_aspect,
min_count=random_erasing.min_count,
max_count=random_erasing.max_count,
trials=random_erasing.trials)
else:
self._random_erasing = None
self._is_multilabel = is_multilabel
self._decode_jpeg_only = decode_jpeg_only
def _parse_train_data(self, decoded_tensors):
"""Parses data for training."""
image = self._parse_train_image(decoded_tensors)
label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
if self._is_multilabel:
if isinstance(label, tf.sparse.SparseTensor):
label = tf.sparse.to_dense(label)
label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
return image, label
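# Multilabel sketch: with num_classes=5, a densified label tensor [1, 3]
# becomes tf.reduce_sum(tf.one_hot([1, 3], 5), axis=0) == [0., 1., 0., 1., 0.].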
def _parse_eval_data(self, decoded_tensors):
"""Parses data for evaluation."""
image = self._parse_eval_image(decoded_tensors)
label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
if self._is_multilabel:
if isinstance(label, tf.sparse.SparseTensor):
label = tf.sparse.to_dense(label)
label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
return image, label
def _parse_train_image(self, decoded_tensors):
"""Parses image data for training."""
image_bytes = decoded_tensors[self._image_field_key]
if self._decode_jpeg_only:
image_shape = tf.image.extract_jpeg_shape(image_bytes)
# Crops image.
cropped_image = preprocess_ops.random_crop_image_v2(
image_bytes, image_shape)
image = tf.cond(
tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
lambda: cropped_image)
else:
# Decodes image.
image = tf.io.decode_image(image_bytes, channels=3)
image.set_shape([None, None, 3])
# Crops image.
cropped_image = preprocess_ops.random_crop_image(image)
image = tf.cond(
tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
lambda: preprocess_ops.center_crop_image(image),
lambda: cropped_image)
if self._aug_rand_hflip:
image = tf.image.random_flip_left_right(image)
# Color jitter.
if self._color_jitter > 0:
image = preprocess_ops.color_jitter(image, self._color_jitter,
self._color_jitter,
self._color_jitter)
# Resizes image.
image = tf.image.resize(
image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
image.set_shape([self._output_size[0], self._output_size[1], 3])
# Apply autoaug or randaug.
if self._augmenter is not None:
image = self._augmenter.distort(image)
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image,
offset=MEAN_RGB,
scale=STDDEV_RGB)
# Random erasing after the image has been normalized
if self._random_erasing is not None:
image = self._random_erasing.distort(image)
# Convert image to self._dtype.
image = tf.image.convert_image_dtype(image, self._dtype)
return image
def _parse_eval_image(self, decoded_tensors):
"""Parses image data for evaluation."""
image_bytes = decoded_tensors[self._image_field_key]
if self._decode_jpeg_only:
image_shape = tf.image.extract_jpeg_shape(image_bytes)
# Center crops.
image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
else:
# Decodes image.
image = tf.io.decode_image(image_bytes, channels=3)
image.set_shape([None, None, 3])
# Center crops.
image = preprocess_ops.center_crop_image(image)
image = tf.image.resize(
image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
image.set_shape([self._output_size[0], self._output_size[1], 3])
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image,
offset=MEAN_RGB,
scale=STDDEV_RGB)
# Convert image to self._dtype.
image = tf.image.convert_image_dtype(image, self._dtype)
return image
@classmethod
def inference_fn(cls,
image: tf.Tensor,
input_image_size: List[int],
num_channels: int = 3) -> tf.Tensor:
"""Builds image model inputs for serving."""
image = tf.cast(image, dtype=tf.float32)
image = preprocess_ops.center_crop_image(image)
image = tf.image.resize(
image, input_image_size, method=tf.image.ResizeMethod.BILINEAR)
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(
image, offset=MEAN_RGB, scale=STDDEV_RGB)
image.set_shape(input_image_size + [num_channels])
return image
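
  # A minimal serving sketch (illustration only): routing a raw uint8 image
  # through `inference_fn` before calling a trained model. `model`, the
  # `Parser` class name, and the 224x224 input size are hypothetical.
  #
  #   @tf.function(input_signature=[
  #       tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8)])
  #   def serve(image: tf.Tensor) -> tf.Tensor:
  #     image = Parser.inference_fn(image, input_image_size=[224, 224])
  #     return model(image[tf.newaxis, ...], training=False)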
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic decoder interface."""
import abc
class Decoder(object):
"""Decodes the raw data into tensors."""
__metaclass__ = abc.ABCMeta
@abc.abstractmethod
def decode(self, serialized_example):
"""Decodes the serialized example into tensors.
Args:
serialized_example: a serialized string tensor that encodes the data.
Returns:
decoded_tensors: a dict of Tensors.
"""
pass
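
# A minimal sketch of a concrete decoder, for illustration only. It assumes
# tf.Example records with hypothetical 'image/encoded' and 'image/class/label'
# feature keys; the keys and the `import tensorflow as tf` dependency are
# assumptions, not part of this interface.
#
#   import tensorflow as tf
#
#   class TfExampleDecoder(Decoder):
#     """Decodes serialized tf.Example protos into a tensor dict."""
#
#     def decode(self, serialized_example):
#       features = {
#           'image/encoded': tf.io.FixedLenFeature((), tf.string),
#           'image/class/label': tf.io.FixedLenFeature((), tf.int64),
#       }
#       return tf.io.parse_single_example(serialized_example, features)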
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Dataset reader for vision model garden."""
from typing import Any, Callable, Optional, Tuple
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
def calculate_batch_sizes(total_batch_size: int,
pseudo_label_ratio: float) -> Tuple[int, int]:
"""Calculates labeled and pseudo-labeled dataset batch sizes.
Returns (labeled_batch_size, pseudo_labeled_batch_size) given a
total batch size and pseudo-label data ratio.
Args:
total_batch_size: The total batch size for all data.
pseudo_label_ratio: A non-negative float ratio of pseudo-labeled
to labeled data in a batch.
Returns:
(labeled_batch_size, pseudo_labeled_batch_size) as ints.
Raises:
ValueError: If total_batch_size is negative.
ValueError: If pseudo_label_ratio is negative.
"""
if total_batch_size < 0:
raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size))
if pseudo_label_ratio < 0.0:
raise ValueError(
'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio))
ratio_factor = pseudo_label_ratio / (1.0 + pseudo_label_ratio)
pseudo_labeled_batch_size = int(round(total_batch_size * ratio_factor))
labeled_batch_size = total_batch_size - pseudo_labeled_batch_size
return labeled_batch_size, pseudo_labeled_batch_size
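
# Worked example (values are illustrative): with total_batch_size=16 and
# pseudo_label_ratio=3.0, ratio_factor = 3.0 / (1.0 + 3.0) = 0.75, so
#   calculate_batch_sizes(16, 3.0) == (4, 12)
# i.e. each global batch holds 4 labeled and 12 pseudo-labeled examples.
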
class CombinationDatasetInputReader(input_reader.InputReader):
"""Combination dataset input reader."""

  def __init__(self,
params: cfg.DataConfig,
dataset_fn=tf.data.TFRecordDataset,
pseudo_label_dataset_fn=tf.data.TFRecordDataset,
decoder_fn: Optional[Callable[..., Any]] = None,
sample_fn: Optional[Callable[..., Any]] = None,
parser_fn: Optional[Callable[..., Any]] = None,
transform_and_batch_fn: Optional[Callable[
[tf.data.Dataset, Optional[tf.distribute.InputContext]],
tf.data.Dataset]] = None,
postprocess_fn: Optional[Callable[..., Any]] = None):
"""Initializes an CombinationDatasetInputReader instance.
This class mixes a labeled and pseudo-labeled dataset. The params
must contain "pseudo_label_data.input_path" to specify the
pseudo-label dataset files and "pseudo_label_data.data_ratio"
to specify a per-batch mixing ratio of pseudo-label examples to
labeled dataset examples.
Args:
params: A config_definitions.DataConfig object.
dataset_fn: A `tf.data.Dataset` that consumes the input files. For
example, it can be `tf.data.TFRecordDataset`.
pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the input
files. For example, it can be `tf.data.TFRecordDataset`.
decoder_fn: An optional `callable` that takes the serialized data string
and decodes them into the raw tensor dictionary.
sample_fn: An optional `callable` that takes a `tf.data.Dataset` object as
input and outputs the transformed dataset. It performs sampling on the
decoded raw tensors dict before the parser_fn.
parser_fn: An optional `callable` that takes the decoded raw tensors dict
and parse them into a dictionary of tensors that can be consumed by the
model. It will be executed after decoder_fn.
transform_and_batch_fn: An optional `callable` that takes a
`tf.data.Dataset` object and an optional `tf.distribute.InputContext` as
input, and returns a `tf.data.Dataset` object. It will be executed after
`parser_fn` to transform and batch the dataset; if None, after
`parser_fn` is executed, the dataset will be batched into per-replica
batch size.
postprocess_fn: A optional `callable` that processes batched tensors. It
will be executed after batching.
Raises:
ValueError: If drop_remainder is False.
"""
super().__init__(params=params,
dataset_fn=dataset_fn,
decoder_fn=decoder_fn,
sample_fn=sample_fn,
parser_fn=parser_fn,
transform_and_batch_fn=transform_and_batch_fn,
postprocess_fn=postprocess_fn)
self._pseudo_label_file_pattern = params.pseudo_label_data.input_path
self._pseudo_label_dataset_fn = pseudo_label_dataset_fn
self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio
self._pseudo_label_matched_files = input_reader.match_files(
self._pseudo_label_file_pattern)
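    # Note: read() zips the labeled and pseudo-labeled datasets and
    # concatenates their per-replica batches, so both pipelines must produce
    # full, fixed-size batches every step; hence drop_remainder must be True.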
if not self._drop_remainder:
raise ValueError(
'Must use drop_remainder=True with CombinationDatasetInputReader')

  def read(
      self,
      input_context: Optional[tf.distribute.InputContext] = None
  ) -> tf.data.Dataset:
    """Generates a tf.data.Dataset object."""
    labeled_batch_size, pl_batch_size = calculate_batch_sizes(
        self._global_batch_size, self._pseudo_label_data_ratio)
    if not labeled_batch_size or not pl_batch_size:
      raise ValueError(
          'Invalid batch_size: {} and pseudo_label_data_ratio: {}, '
          'resulting in a 0 batch size for one of the datasets.'.format(
              self._global_batch_size, self._pseudo_label_data_ratio))

    def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size,
                                       input_context, tfds_builder):
      dataset = self._read_data_source(matched_files, dataset_fn,
                                       input_context, tfds_builder)
      return self._decode_and_parse_dataset(dataset, batch_size, input_context)

    labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._matched_files,
        dataset_fn=self._dataset_fn,
        batch_size=labeled_batch_size,
        input_context=input_context,
        tfds_builder=self._tfds_builder)
    pseudo_labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._pseudo_label_matched_files,
        dataset_fn=self._pseudo_label_dataset_fn,
        batch_size=pl_batch_size,
        input_context=input_context,
        tfds_builder=False)

    def concat_fn(d1, d2):
      return tf.nest.map_structure(
          lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2)

    dataset_concat = tf.data.Dataset.zip(
        (labeled_dataset, pseudo_labeled_dataset))
    dataset_concat = dataset_concat.map(
        concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    def maybe_map_fn(dataset, fn):
      return dataset if fn is None else dataset.map(
          fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn)
    dataset_concat = self._maybe_apply_data_service(dataset_concat,
                                                    input_context)
    if self._deterministic is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self._deterministic
      dataset_concat = dataset_concat.with_options(options)
    return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE)
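
# A minimal usage sketch (illustration only): `my_decode_fn` and `my_parse_fn`
# are hypothetical, and `params` is a cfg.DataConfig with drop_remainder=True
# whose `pseudo_label_data` sub-config provides `input_path` and `data_ratio`
# as described above.
#
#   reader = CombinationDatasetInputReader(
#       params=params,
#       decoder_fn=my_decode_fn,
#       parser_fn=my_parse_fn)
#   dataset = reader.read(input_context=None)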