Commit c8e6faf7 authored by A. Unique TensorFlower's avatar A. Unique TensorFlower
Browse files

Internal change

PiperOrigin-RevId: 431756117
parent 13a5e4fb
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for semantic_segmentation."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import semantic_segmentation as exp_cfg
class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase):
  """Sanity checks for registered semantic segmentation experiment configs."""

  @parameterized.parameters(('seg_deeplabv3_pascal',),
                            ('seg_deeplabv3plus_pascal',))
  def test_semantic_segmentation_configs(self, config_name):
    """Config resolves, carries the expected types, and validates."""
    experiment = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(experiment, cfg.ExperimentConfig)
    self.assertIsInstance(experiment.task, exp_cfg.SemanticSegmentationTask)
    self.assertIsInstance(experiment.task.model,
                          exp_cfg.SemanticSegmentationModel)
    self.assertIsInstance(experiment.task.train_data, exp_cfg.DataConfig)
    experiment.validate()
    # The 'task.train_data.is_training != None' restriction must now trip.
    experiment.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      experiment.validate()
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import backbones_3d
from official.vision.configs import common
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """The base configuration for building datasets."""
  # Dataset identity and storage format.
  name: Optional[str] = None
  file_type: Optional[str] = 'tfrecord'
  compressed_input: bool = False
  split: str = 'train'
  variant_name: Optional[str] = None
  # Clip shape; eval configs below use more frames, e.g. (250, 224, 224, 3).
  feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
  temporal_stride: int = 1
  # Presumably randomizes temporal stride when > 0 — confirm with dataloader.
  random_stride_range: int = 0
  num_test_clips: int = 1
  num_test_crops: int = 1
  num_classes: int = -1  # -1 means unset; per-dataset factories fill this in.
  num_examples: int = -1  # -1 means unset; add_trainer requires > 0.
  global_batch_size: int = 128
  data_format: str = 'channels_last'
  dtype: str = 'float32'
  one_hot: bool = True
  shuffle_buffer_size: int = 64
  cache: bool = False
  input_path: str = ''
  is_training: bool = True
  cycle_length: int = 10
  drop_remainder: bool = True
  min_image_size: int = 256
  is_multilabel: bool = False
  # Optional audio stream settings.
  output_audio: bool = False
  audio_feature: str = ''
  audio_feature_shape: Tuple[int, ...] = (-1,)
  # Random aspect-ratio / area crop augmentation bounds.
  aug_min_aspect_ratio: float = 0.5
  aug_max_aspect_ratio: float = 2.0
  aug_min_area_ratio: float = 0.49
  aug_max_area_ratio: float = 1.0
  aug_type: Optional[str] = None  # 'autoaug', 'randaug', or None
  # tf.Example feature keys for the image and label.
  image_field_key: str = 'image/encoded'
  label_field_key: str = 'clip/label/index'
def kinetics400(is_training):
  """Builds the Kinetics-400 dataset config for train or eval."""
  split = 'train' if is_training else 'valid'
  num_examples = 215570 if is_training else 17706
  feature_shape = (64, 224, 224, 3) if is_training else (250, 224, 224, 3)
  return DataConfig(
      name='kinetics400',
      num_classes=400,
      is_training=is_training,
      split=split,
      drop_remainder=is_training,
      num_examples=num_examples,
      feature_shape=feature_shape)
def kinetics600(is_training):
  """Builds the Kinetics-600 dataset config for train or eval."""
  split = 'train' if is_training else 'valid'
  num_examples = 366016 if is_training else 27780
  feature_shape = (64, 224, 224, 3) if is_training else (250, 224, 224, 3)
  return DataConfig(
      name='kinetics600',
      num_classes=600,
      is_training=is_training,
      split=split,
      drop_remainder=is_training,
      num_examples=num_examples,
      feature_shape=feature_shape)
def kinetics700(is_training):
  """Generated Kinetics 700 dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=522883 if is_training else 33441,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700_2020(is_training):
  """Generated Kinetics 700-2020 dataset configs."""
  # NOTE(review): name is 'kinetics700', identical to kinetics700() above.
  # Presumably intentional (same label space, different release) — confirm
  # against the dataloader before relying on `name` to distinguish them.
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=535982 if is_training else 33640,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
  """The model config for video classification."""
  model_type: str = 'video_classification'
  # Use default_factory so every config instance gets its own mutable
  # sub-config rather than sharing a single class-level default object
  # (mutating one instance's default would otherwise leak into all others).
  backbone: backbones_3d.Backbone3D = dataclasses.field(
      default_factory=lambda: backbones_3d.Backbone3D(
          type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()))
  norm_activation: common.NormActivation = dataclasses.field(
      default_factory=lambda: common.NormActivation(use_sync_bn=False))
  dropout_rate: float = 0.2
  aggregate_endpoints: bool = False
  require_endpoints: Optional[Tuple[str, ...]] = None
@dataclasses.dataclass
class Losses(hyperparams.Config):
  """Loss settings for video classification."""
  one_hot: bool = True
  label_smoothing: float = 0.0
  l2_weight_decay: float = 0.0
@dataclasses.dataclass
class Metrics(hyperparams.Config):
  """Metric settings for video classification."""
  use_per_class_recall: bool = False
@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
  """The task config for video classification."""
  # default_factory avoids sharing one mutable default config object across
  # all task instances (a class-level default instance would be shared).
  model: VideoClassificationModel = dataclasses.field(
      default_factory=VideoClassificationModel)
  train_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(is_training=True, drop_remainder=True))
  validation_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(
          is_training=False, drop_remainder=False))
  losses: Losses = dataclasses.field(default_factory=Losses)
  metrics: Metrics = dataclasses.field(default_factory=Metrics)
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: str = 'all'  # all or backbone
  # Spatial Partitioning fields.
  train_input_partition_dims: Optional[Tuple[int, ...]] = None
  eval_input_partition_dims: Optional[Tuple[int, ...]] = None
def add_trainer(experiment: cfg.ExperimentConfig,
                train_batch_size: int,
                eval_batch_size: int,
                learning_rate: float = 1.6,
                train_epochs: int = 44,
                warmup_epochs: int = 5):
  """Add and config a trainer to the experiment config.

  Mutates `experiment` in place: sets the global batch sizes on both data
  configs and replaces `experiment.trainer` with an SGD + cosine-decay
  trainer whose intervals are expressed in whole epochs.

  Args:
    experiment: experiment config to mutate; its task's train and validation
      data configs must have positive `num_examples`.
    train_batch_size: global training batch size.
    eval_batch_size: global evaluation batch size.
    learning_rate: initial learning rate of the cosine schedule.
    train_epochs: total number of training epochs.
    warmup_epochs: number of linear warmup epochs.

  Returns:
    The same `experiment` object, for call-chaining convenience.

  Raises:
    ValueError: if either dataset config has a non-positive `num_examples`.
  """
  if experiment.task.train_data.num_examples <= 0:
    raise ValueError('Wrong train dataset size {!r}'.format(
        experiment.task.train_data))
  if experiment.task.validation_data.num_examples <= 0:
    raise ValueError('Wrong validation dataset size {!r}'.format(
        experiment.task.validation_data))
  experiment.task.train_data.global_batch_size = train_batch_size
  experiment.task.validation_data.global_batch_size = eval_batch_size
  # Epoch length in steps; all intervals below are one epoch.
  steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
  experiment.trainer = cfg.TrainerConfig(
      steps_per_loop=steps_per_epoch,
      summary_interval=steps_per_epoch,
      checkpoint_interval=steps_per_epoch,
      train_steps=train_epochs * steps_per_epoch,
      validation_steps=experiment.task.validation_data.num_examples //
      eval_batch_size,
      validation_interval=steps_per_epoch,
      optimizer_config=optimization.OptimizationConfig({
          'optimizer': {
              'type': 'sgd',
              'sgd': {
                  'momentum': 0.9,
                  'nesterov': True,
              }
          },
          'learning_rate': {
              'type': 'cosine',
              'cosine': {
                  'initial_learning_rate': learning_rate,
                  'decay_steps': train_epochs * steps_per_epoch,
              }
          },
          'warmup': {
              'type': 'linear',
              'linear': {
                  'warmup_steps': warmup_epochs * steps_per_epoch,
                  'warmup_learning_rate': 0
              }
          }
      }))
  return experiment
@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
  """Video classification general."""
  restrictions = [
      'task.train_data.is_training != None',
      'task.validation_data.is_training != None',
      'task.train_data.num_classes == task.validation_data.num_classes',
  ]
  return cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=VideoClassificationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=restrictions)
@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
  """Video classification on UCF-101 with resnet."""
  train_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='train',
      drop_remainder=True,
      num_examples=9537,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  train_dataset.tfds_name = 'ucf101'
  train_dataset.tfds_split = 'train'
  # NOTE(review): is_training=True on the validation ('test' split) dataset
  # looks like a copy-paste from the train config — confirm whether
  # eval-style sampling (is_training=False) is intended here.
  validation_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='test',
      drop_remainder=False,
      num_examples=3783,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  validation_dataset.tfds_name = 'ucf101'
  validation_dataset.tfds_split = 'test'
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates `config` in place (also returns it).
  add_trainer(
      config,
      train_batch_size=64,
      eval_batch_size=16,
      learning_rate=0.8,
      train_epochs=100)
  return config
@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-400 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics400(is_training=True),
      validation_data=kinetics400(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-600 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics600(is_training=True),
      validation_data=kinetics600(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-700 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics700(is_training=True),
      validation_data=kinetics700(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-700 2020 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics700_2020(is_training=True),
      validation_data=kinetics700_2020(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import video_classification as exp_cfg
class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):
  """Sanity checks for registered video classification experiment configs."""

  @parameterized.parameters(('video_classification',),
                            ('video_classification_kinetics600',))
  def test_video_classification_configs(self, config_name):
    """Config resolves, carries the expected types, and validates."""
    experiment = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(experiment, cfg.ExperimentConfig)
    self.assertIsInstance(experiment.task, exp_cfg.VideoClassificationTask)
    self.assertIsInstance(experiment.task.model,
                          exp_cfg.VideoClassificationModel)
    self.assertIsInstance(experiment.task.train_data, exp_cfg.DataConfig)
    experiment.validate()
    # The 'task.train_data.is_training != None' restriction must now trip.
    experiment.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      experiment.validate()
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Convert raw COCO dataset to TFRecord format.
This script follows the label map decoder format and supports detection
boxes, instance masks and captions.
Example usage:
python create_coco_tf_record.py --logtostderr \
--image_dir="${TRAIN_IMAGE_DIR}" \
--image_info_file="${TRAIN_IMAGE_INFO_FILE}" \
--object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
--output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
--num_shards=100
"""
import collections
import json
import logging
import os
from absl import app # pylint:disable=unused-import
from absl import flags
import numpy as np
from pycocotools import mask
import tensorflow as tf
import multiprocessing as mp
from official.vision.data import tfrecord_lib
# Command-line interface.
flags.DEFINE_boolean(
    'include_masks', False, 'Whether to include instance segmentations masks '
    '(PNG encoded) in the result. default: False.')
flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
flags.DEFINE_string(
    'image_info_file', '', 'File containing image information. '
    'Tf Examples in the output files correspond to the image '
    'info entries in this file. If this file is not provided '
    'object_annotations_file is used if present. Otherwise, '
    'caption_annotations_file is used to get image info.')
flags.DEFINE_string(
    'object_annotations_file', '', 'File containing object '
    'annotations - boxes and instance masks.')
flags.DEFINE_string('caption_annotations_file', '', 'File containing image '
                    'captions.')
flags.DEFINE_string('panoptic_annotations_file', '', 'File containing panoptic '
                    'annotations.')
flags.DEFINE_string('panoptic_masks_dir', '',
                    'Directory containing panoptic masks annotations.')
flags.DEFINE_boolean(
    'include_panoptic_masks', False, 'Whether to include category and '
    'instance masks in the result. These are required to run the PQ evaluator '
    'default: False.')
flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file')
flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.')

FLAGS = flags.FLAGS

# Emit INFO-level messages through TF's logger.
logger = tf.get_logger()
logger.setLevel(logging.INFO)

# Panoptic-mask encoding constants (used by generate_coco_panoptics_masks).
_VOID_LABEL = 0
_VOID_INSTANCE_ID = 0
# All "thing" classes collapse to this single semantic label.
_THING_CLASS_ID = 1
# Subtracted from "stuff" category ids to compact the semantic label space.
_STUFF_CLASSES_OFFSET = 90
def coco_segmentation_to_mask_png(segmentation, height, width, is_crowd):
  """Encode a COCO mask segmentation as PNG string."""
  rle = mask.frPyObjects(segmentation, height, width)
  decoded = mask.decode(rle)
  if not is_crowd:
    # Non-crowd segmentations decode with one channel per polygon; merge
    # them into a single binary mask.
    decoded = np.amax(decoded, axis=2)
  return tfrecord_lib.encode_mask_as_png(decoded)
def generate_coco_panoptics_masks(segments_info, mask_path,
                                  include_panoptic_masks,
                                  is_category_thing):
  """Creates masks for panoptic segmentation task.

  Args:
    segments_info: a list of dicts, where each dict has keys: [u'id',
      u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for
      each segment in the panoptic mask.
    mask_path: path to the panoptic mask.
    include_panoptic_masks: bool, when set to True, category and instance
      masks are included in the outputs. Set this to True, when using
      the Panoptic Quality evaluator.
    is_category_thing: a dict with category ids as keys and, 0/1 as values to
      represent "stuff" and "things" classes respectively.

  Returns:
    A dict with keys: [u'semantic_segmentation_mask', u'category_mask',
    u'instance_mask']. The dict contains 'category_mask' and 'instance_mask'
    only if `include_panoptic_masks` is set to True.
  """
  rgb_mask = tfrecord_lib.read_image(mask_path)
  r, g, b = np.split(rgb_mask, 3, axis=-1)

  # Decode the RGB-encoded panoptic mask into per-pixel segment ids.
  # Refer to https://cocodataset.org/#format-data
  segments_encoded_mask = (r + g * 256 + b * (256**2)).squeeze()

  # np.full_like states the "fill with constant" intent directly
  # (was np.ones_like(...) * constant).
  semantic_segmentation_mask = np.full_like(
      segments_encoded_mask, _VOID_LABEL, dtype=np.uint8)
  if include_panoptic_masks:
    category_mask = np.full_like(
        segments_encoded_mask, _VOID_LABEL, dtype=np.uint8)
    instance_mask = np.full_like(
        segments_encoded_mask, _VOID_INSTANCE_ID, dtype=np.uint8)

  for idx, segment in enumerate(segments_info):
    segment_id = segment['id']
    category_id = segment['category_id']

    if is_category_thing[category_id]:
      # All "things" share one semantic label; instances are numbered 1-based.
      encoded_category_id = _THING_CLASS_ID
      instance_id = idx + 1
    else:
      # "Stuff" classes are remapped to a compact label space and carry no
      # instance id.
      encoded_category_id = category_id - _STUFF_CLASSES_OFFSET
      instance_id = _VOID_INSTANCE_ID

    segment_mask = (segments_encoded_mask == segment_id)
    semantic_segmentation_mask[segment_mask] = encoded_category_id

    if include_panoptic_masks:
      category_mask[segment_mask] = category_id
      instance_mask[segment_mask] = instance_id

  outputs = {
      'semantic_segmentation_mask': tfrecord_lib.encode_mask_as_png(
          semantic_segmentation_mask)
  }

  if include_panoptic_masks:
    outputs.update({
        'category_mask': tfrecord_lib.encode_mask_as_png(category_mask),
        'instance_mask': tfrecord_lib.encode_mask_as_png(instance_mask)
    })
  return outputs
def coco_annotations_to_lists(bbox_annotations, id_to_name_map,
                              image_height, image_width, include_masks):
  """Converts COCO annotations to feature lists.

  Args:
    bbox_annotations: list of COCO object-annotation dicts with keys 'bbox'
      ([x, y, width, height] in absolute pixels), 'iscrowd', 'category_id',
      'area' and, when `include_masks` is True, 'segmentation'.
    id_to_name_map: dict mapping category id to category name.
    image_height: image height in pixels, used to normalize y coordinates.
    image_width: image width in pixels, used to normalize x coordinates.
    include_masks: whether to also encode instance masks as PNG strings.

  Returns:
    A tuple `(data, num_annotations_skipped)`: `data` maps feature names to
    parallel lists (normalized 'xmin'/'xmax'/'ymin'/'ymax', 'is_crowd',
    'category_id', 'category_names' as UTF-8 bytes, 'area', and optionally
    'encoded_mask_png'); `num_annotations_skipped` counts annotations dropped
    for degenerate or out-of-bounds boxes.
  """
  # Dict comprehension replaces the dict((k, list()) for ...) construction.
  data = {k: [] for k in
          ['xmin', 'xmax', 'ymin', 'ymax', 'is_crowd',
           'category_id', 'category_names', 'area']}
  if include_masks:
    data['encoded_mask_png'] = []

  num_annotations_skipped = 0

  for object_annotations in bbox_annotations:
    (x, y, width, height) = tuple(object_annotations['bbox'])

    # Skip degenerate boxes and boxes extending past the image border.
    if width <= 0 or height <= 0:
      num_annotations_skipped += 1
      continue
    if x + width > image_width or y + height > image_height:
      num_annotations_skipped += 1
      continue
    data['xmin'].append(float(x) / image_width)
    data['xmax'].append(float(x + width) / image_width)
    data['ymin'].append(float(y) / image_height)
    data['ymax'].append(float(y + height) / image_height)
    data['is_crowd'].append(object_annotations['iscrowd'])
    category_id = int(object_annotations['category_id'])
    data['category_id'].append(category_id)
    data['category_names'].append(id_to_name_map[category_id].encode('utf8'))
    data['area'].append(object_annotations['area'])

    if include_masks:
      data['encoded_mask_png'].append(
          coco_segmentation_to_mask_png(object_annotations['segmentation'],
                                        image_height, image_width,
                                        object_annotations['iscrowd'])
      )
  return data, num_annotations_skipped
def bbox_annotations_to_feature_dict(
    bbox_annotations, image_height, image_width, id_to_name_map, include_masks):
  """Convert COCO annotations to an encoded feature dict."""
  data, num_skipped = coco_annotations_to_lists(
      bbox_annotations, id_to_name_map, image_height, image_width,
      include_masks)
  # Map each output feature name to the data list it is built from.
  feature_sources = [
      ('image/object/bbox/xmin', 'xmin'),
      ('image/object/bbox/xmax', 'xmax'),
      ('image/object/bbox/ymin', 'ymin'),
      ('image/object/bbox/ymax', 'ymax'),
      ('image/object/class/text', 'category_names'),
      ('image/object/class/label', 'category_id'),
      ('image/object/is_crowd', 'is_crowd'),
      ('image/object/area', 'area'),
  ]
  feature_dict = {
      feature_name: tfrecord_lib.convert_to_feature(data[key])
      for feature_name, key in feature_sources
  }
  if include_masks:
    feature_dict['image/object/mask'] = (
        tfrecord_lib.convert_to_feature(data['encoded_mask_png']))
  return feature_dict, num_skipped
def encode_caption_annotations(caption_annotations):
  """Returns UTF-8 encoded caption strings from caption annotation dicts."""
  return [annotation['caption'].encode('utf8')
          for annotation in caption_annotations]
def create_tf_example(image,
                      image_dirs,
                      panoptic_masks_dir=None,
                      bbox_annotations=None,
                      id_to_name_map=None,
                      caption_annotations=None,
                      panoptic_annotation=None,
                      is_category_thing=None,
                      include_panoptic_masks=False,
                      include_masks=False):
  """Converts image and annotations to a tf.Example proto.

  Args:
    image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
      u'width', u'date_captured', u'flickr_url', u'id']
    image_dirs: list of directories containing the image files.
    panoptic_masks_dir: `str` of the panoptic masks directory.
    bbox_annotations:
      list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
      u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
      coordinates in the official COCO dataset are given as [x, y, width,
      height] tuples using absolute coordinates where x, y represent the
      top-left (0-indexed) corner. This function converts to the format
      expected by the Tensorflow Object Detection API (which is
      [ymin, xmin, ymax, xmax] with coordinates normalized relative to image
      size).
    id_to_name_map: a dict mapping category IDs to string names.
    caption_annotations:
      list of dict with keys: [u'id', u'image_id', u'str'].
    panoptic_annotation: dict with keys: [u'image_id', u'file_name',
      u'segments_info']. Where the value for segments_info is a list of dicts,
      with each dict containing information for a single segment in the mask.
    is_category_thing: a dict with category ids as keys and 0/1 values marking
      "stuff" and "things" classes respectively (passed through to
      `generate_coco_panoptics_masks`).
    include_panoptic_masks: `bool`, whether to include panoptic masks.
    include_masks: Whether to include instance segmentations masks
      (PNG encoded) in the result. default: False.

  Returns:
    example: The converted tf.Example
    num_annotations_skipped: Number of (invalid) annotations that were ignored.

  Raises:
    ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
      does not exist, or is not unique across image directories.
  """
  image_height = image['height']
  image_width = image['width']
  filename = image['file_name']
  image_id = image['id']

  # With several candidate directories, the image must exist in exactly one.
  if len(image_dirs) > 1:
    full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs]
    full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
    if not full_existing_paths:
      raise ValueError(
          '{} does not exist across image directories.'.format(filename))
    if len(full_existing_paths) > 1:
      raise ValueError(
          '{} is not unique across image directories'.format(filename))
    full_path, = full_existing_paths
  # If there is only one image directory, it's not worth checking for existence,
  # since trying to open the file will raise an informative error message if it
  # does not exist.
  else:
    image_dir, = image_dirs
    full_path = os.path.join(image_dir, filename)

  with tf.io.gfile.GFile(full_path, 'rb') as fid:
    encoded_jpg = fid.read()

  feature_dict = tfrecord_lib.image_info_to_feature_dict(
      image_height, image_width, filename, image_id, encoded_jpg, 'jpg')

  num_annotations_skipped = 0

  # Each annotation kind is optional; features accumulate into feature_dict.
  if bbox_annotations:
    box_feature_dict, num_skipped = bbox_annotations_to_feature_dict(
        bbox_annotations, image_height, image_width, id_to_name_map,
        include_masks)
    num_annotations_skipped += num_skipped
    feature_dict.update(box_feature_dict)

  if caption_annotations:
    encoded_captions = encode_caption_annotations(caption_annotations)
    feature_dict.update(
        {'image/caption': tfrecord_lib.convert_to_feature(encoded_captions)})

  if panoptic_annotation:
    segments_info = panoptic_annotation['segments_info']
    panoptic_mask_filename = os.path.join(
        panoptic_masks_dir,
        panoptic_annotation['file_name'])
    encoded_panoptic_masks = generate_coco_panoptics_masks(
        segments_info, panoptic_mask_filename, include_panoptic_masks,
        is_category_thing)
    feature_dict.update(
        {'image/segmentation/class/encoded': tfrecord_lib.convert_to_feature(
            encoded_panoptic_masks['semantic_segmentation_mask'])})

    if include_panoptic_masks:
      feature_dict.update({
          'image/panoptic/category_mask': tfrecord_lib.convert_to_feature(
              encoded_panoptic_masks['category_mask']),
          'image/panoptic/instance_mask': tfrecord_lib.convert_to_feature(
              encoded_panoptic_masks['instance_mask'])
      })

  example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
  return example, num_annotations_skipped
def _load_object_annotations(object_annotations_file):
  """Loads object annotation JSON file."""
  with tf.io.gfile.GFile(object_annotations_file, 'r') as fid:
    obj_annotations = json.load(fid)

  images = obj_annotations['images']
  id_to_name_map = {
      category['id']: category['name']
      for category in obj_annotations['categories']
  }

  # Index annotations by the image they belong to.
  img_to_obj_annotation = collections.defaultdict(list)
  logging.info('Building bounding box index.')
  for annotation in obj_annotations['annotations']:
    img_to_obj_annotation[annotation['image_id']].append(annotation)

  missing_annotation_count = sum(
      1 for image in images if image['id'] not in img_to_obj_annotation)
  logging.info('%d images are missing bboxes.', missing_annotation_count)

  return img_to_obj_annotation, id_to_name_map
def _load_caption_annotations(caption_annotations_file):
  """Loads caption annotation JSON file."""
  with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
    caption_annotations = json.load(fid)

  # Index captions by the image they belong to.
  img_to_caption_annotation = collections.defaultdict(list)
  logging.info('Building caption index.')
  for annotation in caption_annotations['annotations']:
    img_to_caption_annotation[annotation['image_id']].append(annotation)

  missing_annotation_count = sum(
      1 for image in caption_annotations['images']
      if image['id'] not in img_to_caption_annotation)
  logging.info('%d images are missing captions.', missing_annotation_count)

  return img_to_caption_annotation
def _load_panoptic_annotations(panoptic_annotations_file):
  """Loads panoptic annotation from file."""
  with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid:
    panoptic_annotations = json.load(fid)

  logging.info('Building panoptic index.')
  # One panoptic annotation per image id.
  img_to_panoptic_annotation = {
      annotation['image_id']: annotation
      for annotation in panoptic_annotations['annotations']
  }

  is_category_thing = {
      category_info['id']: category_info['isthing'] == 1
      for category_info in panoptic_annotations['categories']
  }

  missing_annotation_count = sum(
      1 for image in panoptic_annotations['images']
      if image['id'] not in img_to_panoptic_annotation)
  logging.info(
      '%d images are missing panoptic annotations.', missing_annotation_count)

  return img_to_panoptic_annotation, is_category_thing
def _load_images_info(images_info_file):
  """Returns the 'images' list from an annotation/image-info JSON file."""
  with tf.io.gfile.GFile(images_info_file, 'r') as fid:
    info_dict = json.load(fid)
  return info_dict['images']
def generate_annotations(images, image_dirs,
                         panoptic_masks_dir=None,
                         img_to_obj_annotation=None,
                         img_to_caption_annotation=None,
                         img_to_panoptic_annotation=None,
                         is_category_thing=None,
                         id_to_name_map=None,
                         include_panoptic_masks=False,
                         include_masks=False):
  """Generator of per-image argument tuples for `create_tf_example`."""

  def _lookup(index, image_id):
    # A missing index (None) and a missing image id both yield None.
    return index.get(image_id, None) if index else None

  for image in images:
    image_id = image['id']
    yield (image, image_dirs, panoptic_masks_dir,
           _lookup(img_to_obj_annotation, image_id),
           id_to_name_map,
           _lookup(img_to_caption_annotation, image_id),
           _lookup(img_to_panoptic_annotation, image_id),
           is_category_thing, include_panoptic_masks, include_masks)
def _create_tf_record_from_coco_annotations(images_info_file,
                                            image_dirs,
                                            output_path,
                                            num_shards,
                                            object_annotations_file=None,
                                            caption_annotations_file=None,
                                            panoptic_masks_dir=None,
                                            panoptic_annotations_file=None,
                                            include_panoptic_masks=False,
                                            include_masks=False):
  """Converts COCO annotation JSON files into sharded tf.Record files.

  Args:
    images_info_file: JSON file containing image info. Exactly one tf.Example
      is written per image-info entry. Any of the train/val/test annotation
      JSON files works, e.g. 'image_info_test-dev2017.json',
      'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
    image_dirs: List of directories containing the image files.
    output_path: Prefix path for the output tf.Record files.
    num_shards: Number of output shard files to create.
    object_annotations_file: JSON file containing bounding box annotations.
    caption_annotations_file: JSON file containing caption annotations.
    panoptic_masks_dir: Directory containing panoptic masks.
    panoptic_annotations_file: JSON file containing panoptic annotations.
    include_panoptic_masks: Whether to include 'category_mask' and
      'instance_mask', which are required by the panoptic quality evaluator.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
  """
  logging.info('writing to output path: %s', output_path)
  images = _load_images_info(images_info_file)

  # Each optional annotation source stays None unless its file was supplied.
  img_to_obj_annotation = None
  id_to_name_map = None
  if object_annotations_file:
    img_to_obj_annotation, id_to_name_map = _load_object_annotations(
        object_annotations_file)

  img_to_caption_annotation = None
  if caption_annotations_file:
    img_to_caption_annotation = _load_caption_annotations(
        caption_annotations_file)

  img_to_panoptic_annotation = None
  is_category_thing = None
  if panoptic_annotations_file:
    img_to_panoptic_annotation, is_category_thing = _load_panoptic_annotations(
        panoptic_annotations_file)

  coco_annotations_iter = generate_annotations(
      images=images,
      image_dirs=image_dirs,
      panoptic_masks_dir=panoptic_masks_dir,
      img_to_obj_annotation=img_to_obj_annotation,
      img_to_caption_annotation=img_to_caption_annotation,
      img_to_panoptic_annotation=img_to_panoptic_annotation,
      is_category_thing=is_category_thing,
      id_to_name_map=id_to_name_map,
      include_panoptic_masks=include_panoptic_masks,
      include_masks=include_masks)

  num_skipped = tfrecord_lib.write_tf_record_dataset(
      output_path, coco_annotations_iter, create_tf_example, num_shards)
  logging.info('Finished writing, skipped %d annotations.', num_skipped)
def main(_):
  """Entry point: validates flags and writes the COCO TFRecord dataset."""
  assert FLAGS.image_dir, '`image_dir` missing.'
  assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
          FLAGS.caption_annotations_file), ('All annotation files are '
                                            'missing.')
  # Prefer the dedicated image-info file; otherwise fall back to whichever
  # annotation file was supplied (they also contain the image info).
  images_info_file = (FLAGS.image_info_file or
                      FLAGS.object_annotations_file or
                      FLAGS.caption_annotations_file)

  directory = os.path.dirname(FLAGS.output_file_prefix)
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)

  _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
                                          FLAGS.output_file_prefix,
                                          FLAGS.num_shards,
                                          FLAGS.object_annotations_file,
                                          FLAGS.caption_annotations_file,
                                          FLAGS.panoptic_masks_dir,
                                          FLAGS.panoptic_annotations_file,
                                          FLAGS.include_panoptic_masks,
                                          FLAGS.include_masks)


if __name__ == '__main__':
  app.run(main)
#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.
#
# Usage: [-i <base_image_dir>] [-o <output_dir>]
#   -i  directory containing the COCO train2014/ and val2014/ image folders
#   -o  directory the TFRecord shards are written to

# Scratch directory for downloaded/processed JSON files; removed at the end.
tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
base_image_dir="/tmp/coco_images"
output_dir="/tmp/coco_few_shot"

# Parse optional -i/-o overrides for the defaults above.
while getopts ":i:o:" o; do
  case "${o}" in
    o) output_dir=${OPTARG} ;;
    i) base_image_dir=${OPTARG} ;;
    *) echo "Usage: ${0} [-i <base_image_dir>] [-o <output_dir>]" 1>&2; exit 1 ;;
  esac
done

# Mirror only the split JSON files we need from the cocosplit host.
cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
  -P "${tmp_dir}" -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
  "http://${cocosplit_url}/"
# Flatten wget's mirrored host/path directory structure into ${tmp_dir}.
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

# Fuse per-category JSON files into one file per (shots, seed) setting and
# produce the base-class-only splits.
python process_coco_few_shot_json_files.py \
  --logtostderr --workdir="${tmp_dir}"

# One TFRecord dataset per (shots, seed) few-shot setting.
for seed in {0..9}; do
  for shots in 1 3 5 10 30; do
    python create_coco_tf_record.py \
      --logtostderr \
      --image_dir="${base_image_dir}/train2014" \
      --image_dir="${base_image_dir}/val2014" \
      --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
      --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
      --caption_annotations_file="" \
      --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
      --num_shards=4
  done
done

# Full 5k validation split (all classes).
python create_coco_tf_record.py \
  --logtostderr \
  --image_dir="${base_image_dir}/train2014" \
  --image_dir="${base_image_dir}/val2014" \
  --image_info_file="${tmp_dir}/datasplit/5k.json" \
  --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
  --caption_annotations_file="" \
  --output_file_prefix="${output_dir}/5k" \
  --num_shards=10

# Base-class-only training split.
python create_coco_tf_record.py \
  --logtostderr \
  --image_dir="${base_image_dir}/train2014" \
  --image_dir="${base_image_dir}/val2014" \
  --image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
  --object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
  --caption_annotations_file="" \
  --output_file_prefix="${output_dir}/trainvalno5k_base" \
  --num_shards=200

# Base-class-only validation split.
python create_coco_tf_record.py \
  --logtostderr \
  --image_dir="${base_image_dir}/train2014" \
  --image_dir="${base_image_dir}/val2014" \
  --image_info_file="${tmp_dir}/datasplit/5k_base.json" \
  --object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \
  --caption_annotations_file="" \
  --output_file_prefix="${output_dir}/5k_base" \
  --num_shards=10

# Clean up scratch space.
rm -rf "${tmp_dir}"
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import collections
import itertools
import json
import logging
import os
from absl import app
from absl import flags
import tensorflow as tf
logger = tf.get_logger()
logger.setLevel(logging.INFO)
flags.DEFINE_string('workdir', None, 'Working directory.')
FLAGS = flags.FLAGS
# The 80 COCO object categories, in alphabetical order.
CATEGORIES = [
    'airplane', 'apple', 'backpack', 'banana', 'baseball bat',
    'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird',
    'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake',
    'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch',
    'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
    'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier',
    'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife',
    'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven',
    'parking meter', 'person', 'pizza', 'potted plant',
    'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
    'sink', 'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball',
    'stop sign', 'suitcase', 'surfboard', 'teddy bear',
    'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush',
    'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
    'wine glass', 'zebra',
]
SEEDS = list(range(10))
SHOTS = [1, 3, 5, 10, 30]

# http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
#
#   datasplit/
#     trainvalno5k.json
#     5k.json
#   full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#   seed{1-9}/
#     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#
# The JSON files for seed0 live in the root directory rather than in a
# `seed?/` subdirectory, hence the conditional prefix below.
FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  _prefix = '' if _seed == 0 else 'seed{}/'.format(_seed)
  FILE_SUFFIXES[(_seed, _shots)] = [
      '{}full_box_{}shot_{}_trainval.json'.format(_prefix, _shots, _category)
      for _category in CATEGORIES
  ]

# Base class IDs, as defined in
# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65
BASE_CLASS_IDS = [8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
                  35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51,
                  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 73, 74, 75,
                  76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
def main(unused_argv):
  """Builds base-class-filtered splits and fused per-setting few-shot files.

  Reads the cocosplit JSON files from FLAGS.workdir and writes:
    * `datasplit/{trainvalno5k,5k}_base.json` with only base-class annotations.
    * `{shots}shot_seed{seed}.json` fusing the per-category few-shot files.
  """
  workdir = FLAGS.workdir
  # Filter novel class annotations from the training and validation sets.
  for name in ('trainvalno5k', '5k'):
    file_path = os.path.join(workdir, 'datasplit', '{}.json'.format(name))
    with tf.io.gfile.GFile(file_path, 'r') as f:
      json_dict = json.load(f)
    json_dict['annotations'] = [a for a in json_dict['annotations']
                                if a['category_id'] in BASE_CLASS_IDS]
    output_path = os.path.join(
        workdir, 'datasplit', '{}_base.json'.format(name))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(json_dict, f)

  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [os.path.join(workdir, suffix)
                  for suffix in FILE_SUFFIXES[(seed, shots)]]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))

    # Make sure that all JSON files for a given seed and shots setting have the
    # same metadata. We count on this to fuse them later on.
    metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
                       'categories': d['categories']} for d in json_dicts]
    if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
      # Fixed typo in the error message: 'licences' -> 'licenses' (matches the
      # JSON key actually compared above).
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licenses, or categories fields')

    # Retrieve images across all JSON files.
    images = sum((d['images'] for d in json_dicts), [])
    # Remove duplicate image entries (dict keeps the last entry per id).
    images = list({image['id']: image for image in images}.values())
    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': sum((d['annotations'] for d in json_dicts), [])
    }

    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s',
                shots, seed, output_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('workdir')
  app.run(main)
#!/bin/bash
#
# Downloads COCO 2017 (images plus instance and panoptic annotations) into the
# directory given as $1 and converts both splits into TFRecord shards.
#
# Usage: <script> <data_dir>

sudo apt update
sudo apt install unzip aria2 -y

DATA_DIR=$1

# Download the four archives in parallel (8 connections).
aria2c -j 8 -Z \
  http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
  http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
  http://images.cocodataset.org/zips/train2017.zip \
  http://images.cocodataset.org/zips/val2017.zip \
  --dir=$DATA_DIR;

# Extract everything, then park the downloaded archives under $DATA_DIR/zips.
unzip $DATA_DIR/"*".zip -d $DATA_DIR;
mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips;

# The panoptic masks ship as nested zips inside the annotations archive.
unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR
unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR

# Validation split: 8 shards, with instance and panoptic masks.
python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/val2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/val" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
  --num_shards=8 \
  --include_masks \
  --include_panoptic_masks

# Training split: 32 shards, with instance and panoptic masks.
python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/train2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/train" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
  --num_shards=32 \
  --include_masks \
  --include_panoptic_masks
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating TFRecord datasets."""
import hashlib
import io
import itertools
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
import multiprocessing as mp
def convert_to_feature(value, value_type=None):
  """Converts the given python object to a tf.train.Feature.

  Args:
    value: int, float, bytes or a list of them.
    value_type: optional; when given, forces the feature to the named type
      instead of inferring it. One of
      ['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list'].

  Returns:
    feature: A tf.train.Feature object.

  Raises:
    ValueError: If the value's type cannot be inferred, or `value_type` names
      an unknown type.
  """
  if value_type is None:
    # Infer the element type from the value (or its first element).
    element = value[0] if isinstance(value, list) else value
    if isinstance(element, bytes):
      inferred = 'bytes'
    elif isinstance(element, (int, np.integer)):
      inferred = 'int64'
    elif isinstance(element, (float, np.floating)):
      inferred = 'float'
    else:
      raise ValueError('Cannot convert type {} to feature'.
                       format(type(element)))
    value_type = inferred + '_list' if isinstance(value, list) else inferred

  if value_type == 'int64':
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
  if value_type == 'int64_list':
    flat = np.asarray(value).astype(np.int64).reshape(-1)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=flat))
  if value_type == 'float':
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
  if value_type == 'float_list':
    flat = np.asarray(value).astype(np.float32).reshape(-1)
    return tf.train.Feature(float_list=tf.train.FloatList(value=flat))
  if value_type == 'bytes':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
  if value_type == 'bytes_list':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
  raise ValueError('Unknown value_type parameter - {}'.format(value_type))
def image_info_to_feature_dict(height, width, filename, image_id,
                               encoded_str, encoded_format):
  """Convert image information to a dict of features."""
  # SHA-256 of the encoded bytes acts as a stable per-image content key.
  sha_key = hashlib.sha256(encoded_str).hexdigest()
  features = {
      'image/height': convert_to_feature(height),
      'image/width': convert_to_feature(width),
      'image/filename': convert_to_feature(filename.encode('utf8')),
      'image/source_id': convert_to_feature(str(image_id).encode('utf8')),
      'image/key/sha256': convert_to_feature(sha_key.encode('utf8')),
      'image/encoded': convert_to_feature(encoded_str),
      'image/format': convert_to_feature(encoded_format.encode('utf8')),
  }
  return features
def read_image(image_path):
  """Reads the image at `image_path` into a numpy array."""
  return np.asarray(Image.open(image_path))
def encode_mask_as_png(mask):
  """PNG-encodes a numpy mask array and returns the encoded bytes."""
  buffer = io.BytesIO()
  Image.fromarray(mask).save(buffer, format='PNG')
  return buffer.getvalue()
def write_tf_record_dataset(output_path, annotation_iterator,
                            process_func, num_shards,
                            use_multiprocessing=True, unpack_arguments=True):
  """Iterates over annotations, processes them and writes into TFRecords.

  Fix over the previous version: the shard writers and the multiprocessing
  pool are now released in a `finally` block, so they no longer leak when
  `process_func` (or iteration) raises part-way through.

  Args:
    output_path: The prefix path to create TF record files.
    annotation_iterator: An iterator of tuples containing details about the
      dataset.
    process_func: A function which takes the elements from the tuples of
      annotation_iterator as arguments and returns a tuple of
      (tf.train.Example, int). The integer indicates the number of annotations
      that were skipped.
    num_shards: int, the number of shards to write for the dataset.
    use_multiprocessing: Whether or not to use multiple processes to write
      TF Records.
    unpack_arguments: Whether to unpack the tuples from annotation_iterator as
      individual arguments to the process func or to pass the returned value
      as it is.

  Returns:
    num_skipped: The total number of skipped annotations.
  """
  writers = [
      tf.io.TFRecordWriter(
          output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
      for i in range(num_shards)
  ]

  total_num_annotations_skipped = 0

  pool = None
  if use_multiprocessing:
    pool = mp.Pool()
    if unpack_arguments:
      tf_example_iterator = pool.starmap(process_func, annotation_iterator)
    else:
      tf_example_iterator = pool.imap(process_func, annotation_iterator)
  else:
    if unpack_arguments:
      tf_example_iterator = itertools.starmap(process_func, annotation_iterator)
    else:
      tf_example_iterator = map(process_func, annotation_iterator)

  try:
    for idx, (tf_example, num_annotations_skipped) in enumerate(
        tf_example_iterator):
      if idx % 100 == 0:
        logging.info('On image %d', idx)
      total_num_annotations_skipped += num_annotations_skipped
      # Examples are distributed round-robin over the shard writers.
      writers[idx % num_shards].write(tf_example.SerializeToString())
  finally:
    # Release worker processes and flush/close every shard file, even if
    # processing failed part-way through.
    if pool is not None:
      pool.close()
      pool.join()
    for writer in writers:
      writer.close()

  logging.info('Finished writing, skipped %d annotations.',
               total_num_annotations_skipped)
  return total_num_annotations_skipped
def check_and_make_dir(directory):
  """Creates `directory` (and parents) unless it already exists."""
  if tf.io.gfile.isdir(directory):
    return
  tf.io.gfile.makedirs(directory)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_lib."""
import os
from absl import flags
from absl.testing import parameterized
import tensorflow as tf
from official.vision.data import tfrecord_lib
FLAGS = flags.FLAGS
def process_sample(x):
  """Wraps one feature into a (tf.train.Example, num_skipped) pair."""
  features = tf.train.Features(feature={'x': x})
  return tf.train.Example(features=features), 0
def parse_function(example_proto):
  """Parses a serialized example holding a single int64 feature 'x'."""
  schema = {'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1)}
  return tf.io.parse_single_example(example_proto, schema)
class TfrecordLibTest(parameterized.TestCase):
  """Unit tests for the tfrecord_lib helpers."""

  def test_write_tf_record_dataset(self):
    annotations = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)]
    output_prefix = os.path.join(FLAGS.test_tmpdir, 'train')
    tfrecord_lib.write_tf_record_dataset(
        output_prefix, annotations, process_sample, 3,
        use_multiprocessing=False)

    # Every shard file must exist and together hold all 17 values.
    shard_files = tf.io.gfile.glob(output_prefix + '*')
    self.assertLen(shard_files, 3)

    dataset = tf.data.TFRecordDataset(shard_files).map(parse_function)
    read_values = set(record['x'] for record in dataset.as_numpy_iterator())
    self.assertSetEqual(read_values, set(range(17)))

  def test_convert_to_feature_float(self):
    feature = tfrecord_lib.convert_to_feature(0.0)
    self.assertEqual(feature.float_list.value[0], 0.0)

  def test_convert_to_feature_int(self):
    feature = tfrecord_lib.convert_to_feature(0)
    self.assertEqual(feature.int64_list.value[0], 0)

  def test_convert_to_feature_bytes(self):
    feature = tfrecord_lib.convert_to_feature(b'123')
    self.assertEqual(feature.bytes_list.value[0], b'123')

  def test_convert_to_feature_float_list(self):
    feature = tfrecord_lib.convert_to_feature([0.0, 1.0])
    self.assertSequenceAlmostEqual(feature.float_list.value, [0.0, 1.0])

  def test_convert_to_feature_int_list(self):
    feature = tfrecord_lib.convert_to_feature([0, 1])
    self.assertSequenceAlmostEqual(feature.int64_list.value, [0, 1])

  def test_convert_to_feature_bytes_list(self):
    feature = tfrecord_lib.convert_to_feature([b'123', b'456'])
    self.assertSequenceAlmostEqual(feature.bytes_list.value, [b'123', b'456'])


if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
from typing import Any, Dict, List, Optional
# Import libraries
import tensorflow as tf
from official.vision.configs import common
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import augment
from official.vision.ops import preprocess_ops
# Per-channel normalization constants, expressed on a [0, 255] pixel scale.
# NOTE(review): the 0.x factors match the commonly used ImageNet channel
# statistics — confirm if reusing with other datasets.
MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
# Default tf.Example feature keys for the encoded image and its label.
DEFAULT_IMAGE_FIELD_KEY = 'image/encoded'
DEFAULT_LABEL_FIELD_KEY = 'image/class/label'
class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               is_multilabel: bool = False,
               keys_to_features: Optional[Dict[str, Any]] = None):
    """Initializes the decoder.

    Args:
      image_field_key: tf.Example feature key of the encoded image bytes.
      label_field_key: tf.Example feature key of the label.
      is_multilabel: Whether each example carries multiple labels; if True the
        label is parsed as a variable-length int64 feature, otherwise as a
        scalar int64 with default -1.
      keys_to_features: Optional custom feature spec. When omitted (or empty),
        a default spec containing only the image key is used. The label spec
        is always added on top.
    """
    if not keys_to_features:
      keys_to_features = {
          image_field_key:
              tf.io.FixedLenFeature((), tf.string, default_value=''),
      }
    else:
      # Copy so the label entry below never mutates the caller's dict
      # (previously `.update(...)` modified the argument in place).
      keys_to_features = dict(keys_to_features)
    if is_multilabel:
      keys_to_features[label_field_key] = tf.io.VarLenFeature(dtype=tf.int64)
    else:
      keys_to_features[label_field_key] = tf.io.FixedLenFeature(
          (), tf.int64, default_value=-1)
    self._keys_to_features = keys_to_features

  def decode(self, serialized_example):
    """Parses one serialized tf.Example into a dict of tensors."""
    return tf.io.parse_single_example(
        serialized_example, self._keys_to_features)
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size: List[int],
               num_classes: float,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               decode_jpeg_only: bool = True,
               aug_rand_hflip: bool = True,
               aug_type: Optional[common.Augmentation] = None,
               color_jitter: float = 0.,
               random_erasing: Optional[common.RandomErasing] = None,
               is_multilabel: bool = False,
               dtype: str = 'float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      num_classes: `float`, number of classes.
      image_field_key: `str`, the key name to encoded image in tf.Example.
      label_field_key: `str`, the key name to label in tf.Example.
      decode_jpeg_only: `bool`, if True, only JPEG format is decoded, this is
        faster than decoding other types. Default is True.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      color_jitter: Magnitude of color jitter. If > 0, the value is used to
        generate random scale factor for brightness, contrast and saturation.
        See `preprocess_ops.color_jitter` for more details.
      random_erasing: if not None, augment input image by random erasing. See
        `augment.RandomErasing` for more details.
      is_multilabel: A `bool`, whether or not each example has multiple labels.
      dtype: `str`, cast output image in dtype. It can be 'float32', 'float16',
        or 'bfloat16'.
    """
    self._output_size = output_size
    self._aug_rand_hflip = aug_rand_hflip
    # NOTE(review): num_classes is annotated/documented as float but is used
    # as the one-hot depth below — presumably an int; confirm with callers.
    self._num_classes = num_classes
    self._image_field_key = image_field_key
    if dtype == 'float32':
      self._dtype = tf.float32
    elif dtype == 'float16':
      self._dtype = tf.float16
    elif dtype == 'bfloat16':
      self._dtype = tf.bfloat16
    else:
      raise ValueError('dtype {!r} is not supported!'.format(dtype))
    # Build the (optional) AutoAugment/RandAugment policy once at init time.
    if aug_type:
      if aug_type.type == 'autoaug':
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        self._augmenter = augment.RandAugment(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError('Augmentation policy {} not supported.'.format(
            aug_type.type))
    else:
      self._augmenter = None
    self._label_field_key = label_field_key
    self._color_jitter = color_jitter
    if random_erasing:
      self._random_erasing = augment.RandomErasing(
          probability=random_erasing.probability,
          min_area=random_erasing.min_area,
          max_area=random_erasing.max_area,
          min_aspect=random_erasing.min_aspect,
          max_aspect=random_erasing.max_aspect,
          min_count=random_erasing.min_count,
          max_count=random_erasing.max_count,
          trials=random_erasing.trials)
    else:
      self._random_erasing = None
    self._is_multilabel = is_multilabel
    self._decode_jpeg_only = decode_jpeg_only

  def _parse_train_data(self, decoded_tensors):
    """Parses data for training."""
    image = self._parse_train_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      # Densify the variable-length label and fold it into a multi-hot vector.
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_eval_data(self, decoded_tensors):
    """Parses data for evaluation."""
    image = self._parse_eval_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      # Same multi-hot conversion as in training.
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_train_image(self, decoded_tensors):
    """Parses image data for training."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image_v2(
          image_bytes, image_shape)
      # If the random crop degenerated to the full image, fall back to a
      # center crop instead.
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
          lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
          lambda: cropped_image)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image(image)
      # Same full-image fallback as the JPEG-only path above.
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
          lambda: preprocess_ops.center_crop_image(image),
          lambda: cropped_image)

    if self._aug_rand_hflip:
      image = tf.image.random_flip_left_right(image)

    # Color jitter.
    if self._color_jitter > 0:
      image = preprocess_ops.color_jitter(image, self._color_jitter,
                                          self._color_jitter,
                                          self._color_jitter)

    # Resizes image.
    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image,
                                           offset=MEAN_RGB,
                                           scale=STDDEV_RGB)

    # Random erasing after the image has been normalized
    if self._random_erasing is not None:
      image = self._random_erasing.distort(image)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  def _parse_eval_image(self, decoded_tensors):
    """Parses image data for evaluation."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Center crops.
      image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Center crops.
      image = preprocess_ops.center_crop_image(image)

    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image,
                                           offset=MEAN_RGB,
                                           scale=STDDEV_RGB)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  @classmethod
  def inference_fn(cls,
                   image: tf.Tensor,
                   input_image_size: List[int],
                   num_channels: int = 3) -> tf.Tensor:
    """Builds image model inputs for serving."""
    image = tf.cast(image, dtype=tf.float32)
    image = preprocess_ops.center_crop_image(image)
    image = tf.image.resize(
        image, input_image_size, method=tf.image.ResizeMethod.BILINEAR)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image.set_shape(input_image_size + [num_channels])
    return image
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic decoder interface."""
import abc
class Decoder(metaclass=abc.ABCMeta):
  """Decodes the raw data into tensors.

  Note: the previous version set the Python 2 `__metaclass__` attribute,
  which is ignored in Python 3, so `decode` was never actually enforced as
  abstract; `metaclass=abc.ABCMeta` restores the intended contract.
  """

  @abc.abstractmethod
  def decode(self, serialized_example):
    """Decodes the serialized example into tensors.

    Args:
      serialized_example: a serialized string tensor that encodes the data.

    Returns:
      decoded_tensors: a dict of Tensors.
    """
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Dataset reader for vision model garden."""
from typing import Any, Callable, Optional, Tuple
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
def calculate_batch_sizes(total_batch_size: int,
                          pseudo_label_ratio: float) -> Tuple[int, int]:
  """Splits a total batch size between labeled and pseudo-labeled data.

  The pseudo-labeled share of the batch is
  `pseudo_label_ratio / (1 + pseudo_label_ratio)` of the total, rounded to
  the nearest integer; the labeled share is the remainder.

  Args:
    total_batch_size: The total batch size for all data.
    pseudo_label_ratio: A non-negative float ratio of pseudo-labeled
      to labeled data in a batch.

  Returns:
    (labeled_batch_size, pseudo_labeled_batch_size) as ints.

  Raises:
    ValueError: If total_batch_size is negative.
    ValueError: If pseudo_label_ratio is negative.
  """
  if total_batch_size < 0:
    raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size))
  if pseudo_label_ratio < 0.0:
    raise ValueError(
        'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio))
  pseudo_fraction = pseudo_label_ratio / (1.0 + pseudo_label_ratio)
  num_pseudo_labeled = int(round(total_batch_size * pseudo_fraction))
  return total_batch_size - num_pseudo_labeled, num_pseudo_labeled
class CombinationDatasetInputReader(input_reader.InputReader):
  """Combination dataset input reader.

  Mixes a labeled dataset with a pseudo-labeled dataset: each sub-dataset is
  batched separately (sizes derived from `pseudo_label_data.data_ratio`) and
  the two batches are concatenated along the batch dimension.
  """

  def __init__(self,
               params: cfg.DataConfig,
               dataset_fn=tf.data.TFRecordDataset,
               pseudo_label_dataset_fn=tf.data.TFRecordDataset,
               decoder_fn: Optional[Callable[..., Any]] = None,
               sample_fn: Optional[Callable[..., Any]] = None,
               parser_fn: Optional[Callable[..., Any]] = None,
               transform_and_batch_fn: Optional[Callable[
                   [tf.data.Dataset, Optional[tf.distribute.InputContext]],
                   tf.data.Dataset]] = None,
               postprocess_fn: Optional[Callable[..., Any]] = None):
    """Initializes a CombinationDatasetInputReader instance.

    This class mixes a labeled and pseudo-labeled dataset. The params
    must contain "pseudo_label_data.input_path" to specify the
    pseudo-label dataset files and "pseudo_label_data.data_ratio"
    to specify a per-batch mixing ratio of pseudo-label examples to
    labeled dataset examples.

    Args:
      params: A config_definitions.DataConfig object.
      dataset_fn: A `tf.data.Dataset` that consumes the input files. For
        example, it can be `tf.data.TFRecordDataset`.
      pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the
        pseudo-label input files. For example, it can be
        `tf.data.TFRecordDataset`.
      decoder_fn: An optional `callable` that takes the serialized data string
        and decodes them into the raw tensor dictionary.
      sample_fn: An optional `callable` that takes a `tf.data.Dataset` object as
        input and outputs the transformed dataset. It performs sampling on the
        decoded raw tensors dict before the parser_fn.
      parser_fn: An optional `callable` that takes the decoded raw tensors dict
        and parse them into a dictionary of tensors that can be consumed by the
        model. It will be executed after decoder_fn.
      transform_and_batch_fn: An optional `callable` that takes a
        `tf.data.Dataset` object and an optional `tf.distribute.InputContext` as
        input, and returns a `tf.data.Dataset` object. It will be executed after
        `parser_fn` to transform and batch the dataset; if None, after
        `parser_fn` is executed, the dataset will be batched into per-replica
        batch size.
      postprocess_fn: An optional `callable` that processes batched tensors. It
        will be executed after batching.

    Raises:
      ValueError: If drop_remainder is False.
    """
    super().__init__(params=params,
                     dataset_fn=dataset_fn,
                     decoder_fn=decoder_fn,
                     sample_fn=sample_fn,
                     parser_fn=parser_fn,
                     transform_and_batch_fn=transform_and_batch_fn,
                     postprocess_fn=postprocess_fn)
    # Pseudo-label dataset configuration, read from params.pseudo_label_data.
    self._pseudo_label_file_pattern = params.pseudo_label_data.input_path
    self._pseudo_label_dataset_fn = pseudo_label_dataset_fn
    self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio
    self._pseudo_label_matched_files = input_reader.match_files(
        self._pseudo_label_file_pattern)
    # The two sub-datasets are batched separately and concatenated per batch,
    # so partial batches cannot be mixed; drop_remainder is therefore required.
    if not self._drop_remainder:
      raise ValueError(
          'Must use drop_remainder=True with CombinationDatasetInputReader')

  def read(
      self,
      input_context: Optional[tf.distribute.InputContext] = None
  ) -> tf.data.Dataset:
    """Generates a tf.data.Dataset object."""
    # Split the global batch size between the labeled and pseudo-labeled
    # datasets according to the configured mixing ratio.
    labeled_batch_size, pl_batch_size = calculate_batch_sizes(
        self._global_batch_size, self._pseudo_label_data_ratio)
    # NOTE(review): this only rejects labeled_batch_size == 0 with a non-zero
    # pl_batch_size; a pl_batch_size of 0 (data_ratio == 0) passes through --
    # confirm whether batching the pseudo-label dataset with size 0 is intended.
    if not labeled_batch_size and pl_batch_size:
      raise ValueError(
          'Invalid batch_size: {} and pseudo_label_data_ratio: {}, '
          'resulting in a 0 batch size for one of the datasets.'.format(
              self._global_batch_size, self._pseudo_label_data_ratio))

    def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size,
                                       input_context, tfds_builder):
      # Reads the source files, then decodes/parses and batches to batch_size.
      dataset = self._read_data_source(matched_files, dataset_fn, input_context,
                                       tfds_builder)
      return self._decode_and_parse_dataset(dataset, batch_size, input_context)

    labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._matched_files,
        dataset_fn=self._dataset_fn,
        batch_size=labeled_batch_size,
        input_context=input_context,
        tfds_builder=self._tfds_builder)
    # The pseudo-label dataset always comes from matched files, never TFDS.
    pseudo_labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._pseudo_label_matched_files,
        dataset_fn=self._pseudo_label_dataset_fn,
        batch_size=pl_batch_size,
        input_context=input_context,
        tfds_builder=False)

    def concat_fn(d1, d2):
      # Concatenates the two per-dataset batches leaf-by-leaf along the batch
      # dimension, producing one batch of the global batch size.
      return tf.nest.map_structure(
          lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2)

    dataset_concat = tf.data.Dataset.zip(
        (labeled_dataset, pseudo_labeled_dataset))
    dataset_concat = dataset_concat.map(
        concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    def maybe_map_fn(dataset, fn):
      # Applies `fn` via Dataset.map when provided; identity when fn is None.
      return dataset if fn is None else dataset.map(
          fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Postprocess runs on the already-concatenated (mixed) batches.
    dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn)
    dataset_concat = self._maybe_apply_data_service(dataset_concat,
                                                    input_context)
    if self._deterministic is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self._deterministic
      dataset_concat = dataset_concat.with_options(options)
    return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Factory for getting TF-Vision input readers."""
from official.common import dataset_fn as dataset_fn_util
from official.core import config_definitions as cfg
from official.core import input_reader as core_input_reader
from official.vision.dataloaders import input_reader as vision_input_reader
def input_reader_generator(params: cfg.DataConfig,
                           **kwargs) -> core_input_reader.InputReader:
  """Instantiates an input reader class according to the params.

  Training configs that carry a `pseudo_label_data` section get a
  `CombinationDatasetInputReader` that mixes labeled and pseudo-labeled
  examples; everything else gets the core `InputReader`.

  Args:
    params: A config_definitions.DataConfig object.
    **kwargs: Additional arguments passed to input reader initialization.

  Returns:
    An InputReader object.
  """
  uses_pseudo_labels = params.is_training and params.get(
      'pseudo_label_data', False)
  if not uses_pseudo_labels:
    return core_input_reader.InputReader(params, **kwargs)
  pseudo_dataset_fn = dataset_fn_util.pick_dataset_fn(
      params.pseudo_label_data.file_type)
  return vision_input_reader.CombinationDatasetInputReader(
      params,
      pseudo_label_dataset_fn=pseudo_dataset_fn,
      **kwargs)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Mask R-CNN."""
# Import libraries
import tensorflow as tf
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               include_mask=False,
               mask_crop_size=112,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added
        on each level. For instances, num_scales=2 adds one additional
        intermediate anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float`, forwarded to `anchor.RpnAnchorLabeler`;
        anchors scoring at or above it are labeled positive for RPN training.
      rpn_unmatched_threshold: `float`, forwarded to `anchor.RpnAnchorLabeler`;
        anchors scoring below it are labeled negative for RPN training.
      rpn_batch_size_per_im: `int`, forwarded to `anchor.RpnAnchorLabeler`;
        number of anchors sampled per image for RPN training.
      rpn_fg_fraction: `float`, forwarded to `anchor.RpnAnchorLabeler`;
        fraction of sampled anchors that should be foreground.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled with
        `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      include_mask: a bool to indicate whether parse mask groundtruth.
      mask_crop_size: the size which groundtruth mask is cropped to.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    # Target assigning.
    self._rpn_match_threshold = rpn_match_threshold
    self._rpn_unmatched_threshold = rpn_unmatched_threshold
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction
    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    # Mask.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size
    # Image output dtype.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
          in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
          image that is fed to the network. The tensor is padded with -1 to
          the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size. Only present
          when include_mask is True.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = preprocess_ops.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(
            image, boxes)
    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()
    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Fix: the mask-cropping block below used to run unconditionally and
      # referenced `masks`, which is undefined when include_mask is False
      # (NameError). It is now guarded by include_mask.
      # Transfer boxes to the original image space and do normalization.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)
    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        anchor_boxes, boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))
    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)
    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes':
            anchor_boxes,
        'image_info':
            image_info,
        'rpn_score_targets':
            rpn_score_targets,
        'rpn_box_targets':
            rpn_box_targets,
        'gt_boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                     self._max_num_instances,
                                                     -1),
        'gt_classes':
            preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                     self._max_num_instances,
                                                     -1),
    }
    if self._include_mask:
      labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
          masks, self._max_num_instances, -1)
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        image: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for training. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height, scaled_width],
            [y_scale, x_scale], [y_offset, x_offset]].
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, 4] representing anchor boxes at each
            level.
    """
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Resizes and crops image; no scale augmentation at eval time.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()
    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)
    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape)
    # Compute Anchor boxes.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    labels = {
        'image_info': image_info,
        'anchor_boxes': anchor_boxes,
    }
    # Groundtruths kept in the original image space for COCO-style evaluation.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    labels['groundtruths'] = groundtruths
    return image, labels
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic parser interface."""
import abc
class Parser(metaclass=abc.ABCMeta):
  """Parses data and produces tensors to be consumed by models.

  Abstract interface: subclasses implement `_parse_train_data` and
  `_parse_eval_data`; `parse_fn` dispatches between them by training mode.

  Note: the original code set the Python 2 style class attribute
  `__metaclass__ = abc.ABCMeta`, which Python 3 silently ignores, so the
  `@abc.abstractmethod` decorators were never enforced. Declaring the
  metaclass via the class keyword restores the intended abstract behavior.
  """

  @abc.abstractmethod
  def _parse_train_data(self, decoded_tensors):
    """Generates images and labels that are usable for model training.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  @abc.abstractmethod
  def _parse_eval_data(self, decoded_tensors):
    """Generates images and labels that are usable for model evaluation.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  def parse_fn(self, is_training):
    """Returns a parse fn that reads and parses raw tensors from the decoder.

    Args:
      is_training: a `bool` to indicate whether it is in training mode.

    Returns:
      parse: a `callable` that takes the serialized example and generate the
        images, labels tuple where labels is a dict of Tensors that contains
        labels.
    """
    def parse(decoded_tensors):
      """Parses the serialized example data."""
      if is_training:
        return self._parse_train_data(decoded_tensors)
      else:
        return self._parse_eval_data(decoded_tensors)
    return parse

  @classmethod
  def inference_fn(cls, inputs):
    """Parses inputs for predictions.

    Args:
      inputs: A Tensor, or dictionary of Tensors.

    Returns:
      processed_inputs: An input tensor to the model.
    """
    pass
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for RetinaNet.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
# Import libraries
from absl import logging
import tensorflow as tf
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import augment
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_type=None,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               use_autoaugment=False,
               autoaugment_policy_name='v0',
               skip_crowd_during_training=True,
               max_num_instances=100,
               dtype='bfloat16',
               mode=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instances, num_scales=2 adds one additional intermediate
        anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      match_threshold: `float` number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
        during training. Deprecated; prefer `aug_type`.
      autoaugment_policy_name: `string` that specifies the name of the
        AutoAugment policy that will be used during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled with
        `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
        prediction with groundtruths in the outputs. Stored but not read
        within this class.
    """
    self._mode = mode
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold
    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    # Data augmentation with AutoAugment or RandAugment.
    self._augmenter = None
    if aug_type is not None:
      if aug_type.type == 'autoaug':
        logging.info('Using AutoAugment.')
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        logging.info('Using RandAugment.')
        self._augmenter = augment.RandAugment.build_for_detection(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError(f'Augmentation policy {aug_type.type} not supported.')
    # Deprecated. Data Augmentation with AutoAugment. Stored but not read
    # within this class.
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name
    # Data type.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: the preprocessed image tensor, cast to `self._dtype`.
      labels: a dictionary with anchor-level classification/box targets and
        weights, the anchor boxes, and `image_info`.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtrtuhs = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      for k, v in attributes.items():
        attributes[k] = tf.gather(v, indices)
    # Gets original image.
    image = data['image']
    # Apply autoaug or randaug before normalization, on the raw image.
    if self._augmenter is not None:
      image, boxes = self._augmenter.distort_with_boxes(image, boxes)
    image_shape = tf.shape(input=image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)
    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                       2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()
    # Resizes and crops boxes. image_info rows 2 and 3 hold the applied
    # scale and offset respectively.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                 image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)
    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)
    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)
    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: the preprocessed image tensor, cast to `self._dtype`.
      labels: same targets as training plus a `groundtruths` dict (original
        image space boxes, source id, etc.) for evaluation.
    """
    groundtruths = {}
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Resizes and crops image; no scale augmentation at eval time.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                       2**self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()
    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                 image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)
    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)
    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)
    # Sets up groundtruth data for evaluation; boxes are re-derived from the
    # raw annotations so they stay in the original image space.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        # NOTE(review): tf.shape(...) here yields a 1-D tensor; the Mask R-CNN
        # parser in this codebase uses tf.shape(...)[0] (a scalar) for the same
        # key -- confirm whether a scalar was intended here as well.
        'num_detections': tf.shape(data['groundtruth_classes']),
        'image_info': image_info,
        'boxes': box_ops.denormalize_boxes(
            data['groundtruth_boxes'], image_shape),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    if 'groundtruth_attributes' in data:
      groundtruths['attributes'] = data['groundtruth_attributes']
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
        'groundtruths': groundtruths,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for segmentation datasets."""
import tensorflow as tf
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import preprocess_ops
class Decoder(decoder.Decoder):
  """Decodes serialized tf.Example protos for the segmentation task."""

  def __init__(self):
    # Scalar feature specs. Strings default to '' and int64s to 0 so that
    # parsing does not fail on an example missing one of the keys.
    def _bytes_feature():
      return tf.io.FixedLenFeature((), tf.string, default_value='')

    def _int64_feature():
      return tf.io.FixedLenFeature((), tf.int64, default_value=0)

    self._keys_to_features = {
        'image/encoded': _bytes_feature(),
        'image/height': _int64_feature(),
        'image/width': _int64_feature(),
        'image/segmentation/class/encoded': _bytes_feature(),
    }

  def decode(self, serialized_example):
    """Parses one serialized tf.Example into a dict of dense tensors."""
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors.
  """

  def __init__(self,
               output_size,
               crop_size=None,
               resize_eval_groundtruth=True,
               groundtruth_padded_size=None,
               ignore_label=255,
               aug_rand_hflip=False,
               preserve_aspect_ratio=True,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      crop_size: `Tensor` or `list` for [height, width] of the crop. If
        specified a training crop of size crop_size is returned. This is useful
        for cropping original images during training while evaluating on
        original image sizes.
      resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are
        resized to output_size.
      groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
        resize_eval_groundtruth is set to False, the groundtruth masks are
        padded to this size.
      ignore_label: `int` the pixel with ignore label will not used for training
        and evaluation.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved,
        otherwise, the image is resized to output_size.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.

    Raises:
      ValueError: if `resize_eval_groundtruth` is False and
        `groundtruth_padded_size` is not provided.
    """
    self._output_size = output_size
    self._crop_size = crop_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      # Note the trailing space below: adjacent string literals concatenate,
      # and the original message read "...needs to bespecified...".
      raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
                       'specified when resize_eval_groundtruth is False.')
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label
    self._preserve_aspect_ratio = preserve_aspect_ratio

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # dtype.
    self._dtype = dtype

  def _prepare_image_and_label(self, data):
    """Prepare normalized image and label."""
    image = tf.io.decode_image(data['image/encoded'], channels=3)
    label = tf.io.decode_image(data['image/segmentation/class/encoded'],
                               channels=1)
    height = data['image/height']
    width = data['image/width']
    # Give the decoded tensors static shapes: image is [H, W, 3], label is
    # kept as [1, height, width] (mask on the trailing spatial axes).
    image = tf.reshape(image, (height, width, 3))
    label = tf.reshape(label, (1, height, width))
    label = tf.cast(label, tf.float32)
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    if not self._preserve_aspect_ratio:
      # Resize directly to output_size, distorting the aspect ratio. The label
      # uses nearest-neighbor resizing so class ids are never interpolated.
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      image = tf.image.resize(image, self._output_size, method='bilinear')
      label = tf.image.resize(label, self._output_size, method='nearest')
      label = tf.reshape(label[:, :, -1], [1] + self._output_size)

    return image, label

  def _parse_train_data(self, data):
    """Parses data for training."""
    image, label = self._prepare_image_and_label(data)

    if self._crop_size:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      # If output_size is specified, resize image, and label to desired
      # output_size.
      if self._output_size:
        image = tf.image.resize(image, self._output_size, method='bilinear')
        label = tf.image.resize(label, self._output_size, method='nearest')

      # Concatenate image and label along channels so a single random_crop
      # keeps them spatially aligned; the extra [4] covers 3 image channels
      # plus 1 label channel.
      image_mask = tf.concat([image, label], axis=2)
      image_mask_crop = tf.image.random_crop(image_mask,
                                             self._crop_size + [4])
      image = image_mask_crop[:, :, :-1]
      label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, _, label = preprocess_ops.random_horizontal_flip(
          image, masks=label)

    train_image_size = self._crop_size if self._crop_size else self._output_size
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        train_image_size,
        train_image_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Pad label and make sure the padded region assigned to the ignore label.
    # The label is first offset by +1 and then padded with 0, so after the -1
    # shift below, genuinely padded pixels are exactly -1.
    label += 1
    label = tf.expand_dims(label, axis=3)
    label = preprocess_ops.resize_and_crop_masks(
        label, image_scale, train_image_size, offset)
    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    image, label = self._prepare_image_and_label(data)
    # The label is first offset by +1 and then padded with 0 (see
    # _parse_train_data for the matching -1 shift / ignore-label mapping).
    label += 1
    label = tf.expand_dims(label, axis=3)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image, self._output_size, self._output_size)

    if self._resize_eval_groundtruth:
      # Resizes eval masks to match input image sizes. In that case, mean IoU
      # is computed on output_size not the original size of the images.
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                   self._output_size, offset)
    else:
      # Keep the original-resolution groundtruth and pad it to a fixed size;
      # padded pixels become -1 below and map to the ignore label.
      label = tf.image.pad_to_bounding_box(
          label, 0, 0, self._groundtruth_padded_size[0],
          self._groundtruth_padded_size[1])

    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)

    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import tensorflow as tf
from official.vision.dataloaders import decoder
def _generate_source_id(image_bytes):
  """Derives a deterministic string id by hashing the encoded image bytes.

  The bucket count is capped at 2**22 - 1 because float32 carries only 23
  mantissa bits, so ids stay exactly representable if they are ever
  round-tripped through float32.
  """
  num_buckets = 2 ** 22 - 1
  bucket = tf.strings.to_hash_bucket_fast(image_bytes, num_buckets)
  return tf.strings.as_string(bucket)
class TfExampleDecoder(decoder.Decoder):
  """Tensorflow Example proto decoder."""

  def __init__(self,
               include_mask=False,
               regenerate_source_id=False,
               mask_binarize_threshold=None):
    """Initializes the decoder.

    Args:
      include_mask: `bool`, if True, also parse and decode per-instance PNG
        segmentation masks from 'image/object/mask'.
      regenerate_source_id: `bool`, if True, always derive `source_id` by
        hashing the encoded image bytes instead of reading
        'image/source_id' from the example.
      mask_binarize_threshold: optional `float`; when set, decoded masks are
        binarized to {0.0, 1.0} using this threshold.
    """
    self._include_mask = include_mask
    self._regenerate_source_id = regenerate_source_id
    # Features expected in every detection tf.Example. Per-object fields are
    # variable-length (one entry per object in the image).
    self._keys_to_features = {
        'image/encoded': tf.io.FixedLenFeature((), tf.string),
        'image/height': tf.io.FixedLenFeature((), tf.int64),
        'image/width': tf.io.FixedLenFeature((), tf.int64),
        'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
        'image/object/class/label': tf.io.VarLenFeature(tf.int64),
        'image/object/area': tf.io.VarLenFeature(tf.float32),
        'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
    }
    self._mask_binarize_threshold = mask_binarize_threshold
    if include_mask:
      self._keys_to_features.update({
          'image/object/mask': tf.io.VarLenFeature(tf.string),
      })
    if not regenerate_source_id:
      # Only request 'image/source_id' when it may be used; it has no default
      # value, so parsing would fail on examples missing the key.
      self._keys_to_features.update({
          'image/source_id': tf.io.FixedLenFeature((), tf.string),
      })

  def _decode_image(self, parsed_tensors):
    """Decodes the image and set its static shape."""
    image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
    image.set_shape([None, None, 3])
    return image

  def _decode_boxes(self, parsed_tensors):
    """Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    return tf.stack([ymin, xmin, ymax, xmax], axis=-1)

  def _decode_classes(self, parsed_tensors):
    """Returns the per-object class labels unchanged."""
    return parsed_tensors['image/object/class/label']

  def _decode_areas(self, parsed_tensors):
    """Returns object areas, computing them from the boxes when absent."""
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32)
    width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32)
    # Prefer the stored areas when any are present; otherwise derive them from
    # the box corners scaled by the image size (the height * width scaling
    # implies box coordinates are normalized to [0, 1] — assumption, confirm
    # against the dataset format).
    return tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
        lambda: parsed_tensors['image/object/area'],
        lambda: (xmax - xmin) * (ymax - ymin) * height * width)

  def _decode_masks(self, parsed_tensors):
    """Decode a set of PNG masks to the tf.float32 tensors."""
    def _decode_png_mask(png_bytes):
      # One PNG per object; squeeze the trailing channel axis to get [H, W].
      mask = tf.squeeze(
          tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
      mask = tf.cast(mask, dtype=tf.float32)
      mask.set_shape([None, None])
      return mask

    height = parsed_tensors['image/height']
    width = parsed_tensors['image/width']
    masks = parsed_tensors['image/object/mask']
    # With zero objects, return an empty [0, height, width] tensor instead of
    # mapping the decoder over an empty string tensor.
    return tf.cond(
        pred=tf.greater(tf.size(input=masks), 0),
        true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
        false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))

  def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
      serialized_example: a single serialized tf.Example string.

    Returns:
      decoded_tensors: a dictionary of tensors with the following fields:
        - source_id: a string scalar tensor.
        - image: a uint8 tensor of shape [None, None, 3].
        - height: an integer scalar tensor.
        - width: an integer scalar tensor.
        - groundtruth_classes: a int64 tensor of shape [None].
        - groundtruth_is_crowd: a bool tensor of shape [None].
        - groundtruth_area: a float32 tensor of shape [None].
        - groundtruth_boxes: a float32 tensor of shape [None, 4].
        - groundtruth_instance_masks: a float32 tensor of shape
            [None, None, None].
        - groundtruth_instance_masks_png: a string tensor of shape [None].
    """
    parsed_tensors = tf.io.parse_single_example(
        serialized=serialized_example, features=self._keys_to_features)
    # Densify every VarLen feature: '' for string tensors, 0 for numeric ones.
    for k in parsed_tensors:
      if isinstance(parsed_tensors[k], tf.SparseTensor):
        if parsed_tensors[k].dtype == tf.string:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value='')
        else:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value=0)

    if self._regenerate_source_id:
      source_id = _generate_source_id(parsed_tensors['image/encoded'])
    else:
      # Fall back to a hashed id when the stored source_id is an empty string.
      source_id = tf.cond(
          tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0),
          lambda: parsed_tensors['image/source_id'],
          lambda: _generate_source_id(parsed_tensors['image/encoded']))
    image = self._decode_image(parsed_tensors)
    boxes = self._decode_boxes(parsed_tensors)
    classes = self._decode_classes(parsed_tensors)
    areas = self._decode_areas(parsed_tensors)

    # Default is_crowd to all-False when the feature is absent or empty.
    is_crowds = tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
        lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
        lambda: tf.zeros_like(classes, dtype=tf.bool))
    if self._include_mask:
      masks = self._decode_masks(parsed_tensors)
      if self._mask_binarize_threshold is not None:
        masks = tf.cast(masks > self._mask_binarize_threshold, tf.float32)

    decoded_tensors = {
        'source_id': source_id,
        'image': image,
        'height': parsed_tensors['image/height'],
        'width': parsed_tensors['image/width'],
        'groundtruth_classes': classes,
        'groundtruth_is_crowd': is_crowds,
        'groundtruth_area': areas,
        'groundtruth_boxes': boxes,
    }
    if self._include_mask:
      decoded_tensors.update({
          'groundtruth_instance_masks': masks,
          'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
      })
    return decoded_tensors
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment