ModelZoo / ResNet50_tensorflow · Commit e4be7e00

Authored Mar 25, 2022 by Yeqing Li; committed by A. Unique TensorFlower, Mar 25, 2022

Removes unneeded content of the beta folder.

PiperOrigin-RevId: 437276665
Parent: f47405b5

Changes: 235 files in the commit; showing 20 changed files with 0 additions and 3782 deletions.
official/vision/beta/configs/semantic_segmentation.py        +0 −712
official/vision/beta/configs/semantic_segmentation_test.py   +0 −45
official/vision/beta/configs/video_classification.py         +0 −370
official/vision/beta/configs/video_classification_test.py    +0 −44
official/vision/beta/data/__init__.py                        +0 −14
official/vision/beta/data/create_coco_tf_record.py           +0 −554
official/vision/beta/data/process_coco_few_shot.sh           +0 −70
official/vision/beta/data/process_coco_few_shot_json_files.py +0 −144
official/vision/beta/data/process_coco_panoptic.sh           +0 −40
official/vision/beta/data/tfrecord_lib.py                    +0 −181
official/vision/beta/data/tfrecord_lib_test.py               +0 −93
official/vision/beta/dataloaders/__init__.py                 +0 −14
official/vision/beta/dataloaders/classification_input.py     +0 −273
official/vision/beta/dataloaders/decoder.py                  +0 −35
official/vision/beta/dataloaders/input_reader.py             +0 −178
official/vision/beta/dataloaders/input_reader_factory.py     +0 −43
official/vision/beta/dataloaders/maskrcnn_input.py           +0 −345
official/vision/beta/dataloaders/parser.py                   +0 −81
official/vision/beta/dataloaders/retinanet_input.py          +0 −328
official/vision/beta/dataloaders/segmentation_input.py       +0 −218
official/vision/beta/configs/semantic_segmentation.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Semantic segmentation configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

import numpy as np

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.beta.configs import common
from official.vision.beta.configs import decoders
from official.vision.beta.configs import backbones


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  output_size: List[int] = dataclasses.field(default_factory=list)
  # If crop_size is specified, the image is first resized to output_size and
  # a crop of size crop_size is then taken from it.
  crop_size: List[int] = dataclasses.field(default_factory=list)
  input_path: str = ''
  global_batch_size: int = 0
  is_training: bool = True
  dtype: str = 'float32'
  shuffle_buffer_size: int = 1000
  cycle_length: int = 10
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_groundtruth: bool = True
  groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_hflip: bool = True
  preserve_aspect_ratio: bool = True
  aug_policy: Optional[str] = None
  drop_remainder: bool = True
  file_type: str = 'tfrecord'
  decoder: Optional[common.DataDecoder] = common.DataDecoder()


@dataclasses.dataclass
class SegmentationHead(hyperparams.Config):
  """Segmentation head config."""
  level: int = 3
  num_convs: int = 2
  num_filters: int = 256
  use_depthwise_convolution: bool = False
  prediction_kernel_size: int = 1
  upsample_factor: int = 1
  # None, deeplabv3plus, panoptic_fpn_fusion or pyramid_fusion
  feature_fusion: Optional[str] = None
  # deeplabv3plus feature fusion params
  low_level: Union[int, str] = 2
  low_level_num_filters: int = 48
  # panoptic_fpn_fusion params
  decoder_min_level: Optional[Union[int, str]] = None
  decoder_max_level: Optional[Union[int, str]] = None


@dataclasses.dataclass
class MaskScoringHead(hyperparams.Config):
  """Mask Scoring head config."""
  num_convs: int = 4
  num_filters: int = 128
  fc_input_size: List[int] = dataclasses.field(default_factory=list)
  num_fcs: int = 2
  fc_dims: int = 1024


@dataclasses.dataclass
class SemanticSegmentationModel(hyperparams.Config):
  """Semantic segmentation model config."""
  num_classes: int = 0
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 6
  head: SegmentationHead = SegmentationHead()
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='identity')
  mask_scoring_head: Optional[MaskScoringHead] = None
  norm_activation: common.NormActivation = common.NormActivation()


@dataclasses.dataclass
class Losses(hyperparams.Config):
  loss_weight: float = 1.0
  label_smoothing: float = 0.0
  ignore_label: int = 255
  class_weights: List[float] = dataclasses.field(default_factory=list)
  l2_weight_decay: float = 0.0
  use_groundtruth_dimension: bool = True
  top_k_percent_pixels: float = 1.0


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  report_per_class_iou: bool = True
  report_train_mean_iou: bool = True  # Turning this off can speed up training.


@dataclasses.dataclass
class SemanticSegmentationTask(cfg.TaskConfig):
  """The model config."""
  model: SemanticSegmentationModel = SemanticSegmentationModel()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(is_training=False)
  losses: Losses = Losses()
  evaluation: Evaluation = Evaluation()
  train_input_partition_dims: List[int] = dataclasses.field(
      default_factory=list)
  eval_input_partition_dims: List[int] = dataclasses.field(
      default_factory=list)
  init_checkpoint: Optional[str] = None
  # all, backbone, and/or decoder
  init_checkpoint_modules: Union[str, List[str]] = 'all'


@exp_factory.register_config_factory('semantic_segmentation')
def semantic_segmentation() -> cfg.ExperimentConfig:
  """Semantic segmentation general."""
  return cfg.ExperimentConfig(
      task=SemanticSegmentationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])


# PASCAL VOC 2012 Dataset
PASCAL_TRAIN_EXAMPLES = 10582
PASCAL_VAL_EXAMPLES = 1449
PASCAL_INPUT_PATH_BASE = 'gs://**/pascal_voc_seg'


@exp_factory.register_config_factory('seg_deeplabv3_pascal')
def seg_deeplabv3_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 8
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [12, 24, 36]  # [6, 12, 18] if output_stride = 16
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      multigrid=multigrid,
                      stem_type=stem_type)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level, dilation_rates=aspp_dilation_rates)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.9997,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              # TODO(arashwan): test changing size to 513 to match deeplab.
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=45 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 45 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('seg_deeplabv3plus_pascal')
def seg_deeplabv3plus_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet deeplabv3+."""
  train_batch_size = 16
  eval_batch_size = 8
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      stem_type=stem_type,
                      multigrid=multigrid)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level, dilation_rates=aspp_dilation_rates)),
              head=SegmentationHead(
                  level=level,
                  num_convs=2,
                  feature_fusion='deeplabv3plus',
                  low_level=2,
                  low_level_num_filters=48),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.9997,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=45 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 45 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('seg_resnetfpn_pascal')
def seg_resnetfpn_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal voc with resnet-fpn."""
  train_batch_size = 256
  eval_batch_size = 32
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[512, 512, 3],
              min_level=3,
              max_level=7,
              backbone=backbones.Backbone(
                  type='resnet', resnet=backbones.ResNet(model_id=50)),
              decoder=decoders.Decoder(type='fpn', fpn=decoders.FPN()),
              head=SegmentationHead(level=3, num_convs=3),
              norm_activation=common.NormActivation(
                  activation='swish', use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.2,
              aug_scale_max=1.5),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
      ),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=450 * steps_per_epoch,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007,
                      'decay_steps': 450 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('mnv2_deeplabv3_pascal')
def mnv2_deeplabv3_pascal() -> cfg.ExperimentConfig:
  """Image segmentation on pascal with mobilenetv2 deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = []
  level = int(np.math.log2(output_stride))
  pool_kernel_size = []
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              num_classes=21,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV2', output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=pool_kernel_size)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='relu',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=4e-5),
          train_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
              output_size=[512, 512],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
              output_size=[512, 512],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=False,
              groundtruth_padded_size=[512, 512],
              drop_remainder=False),
          # mobilenetv2
          init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
          init_checkpoint_modules=['backbone', 'decoder']),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=30000,
          validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          best_checkpoint_eval_metric='mean_iou',
          best_checkpoint_export_subdir='best_ckpt',
          best_checkpoint_metric_comp='higher',
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.007 * train_batch_size / 16,
                      'decay_steps': 30000,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


# Cityscapes Dataset (Download and process the dataset yourself)
CITYSCAPES_TRAIN_EXAMPLES = 2975
CITYSCAPES_VAL_EXAMPLES = 500
CITYSCAPES_INPUT_PATH_BASE = 'cityscapes'


@exp_factory.register_config_factory('seg_deeplabv3plus_cityscapes')
def seg_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with resnet deeplabv3+."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              # Cityscapes uses only 19 semantic classes for train/evaluation.
              # The void (background) class is ignored in train and evaluation.
              num_classes=19,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=101,
                      output_stride=output_stride,
                      stem_type=stem_type,
                      multigrid=multigrid)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=[512, 1024])),
              head=SegmentationHead(
                  level=level,
                  num_convs=2,
                  feature_fusion='deeplabv3plus',
                  low_level=2,
                  low_level_num_filters=48),
              norm_activation=common.NormActivation(
                  activation='swish',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=1e-4),
          train_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'train_fine**'),
              crop_size=[512, 1024],
              output_size=[1024, 2048],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
              output_size=[1024, 2048],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=True,
              drop_remainder=False),
          # resnet101
          init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=500 * steps_per_epoch,
          validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.01,
                      'decay_steps': 500 * steps_per_epoch,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('mnv2_deeplabv3_cityscapes')
def mnv2_deeplabv3_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with mobilenetv2 deeplabv3."""
  train_batch_size = 16
  eval_batch_size = 16
  steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
  output_stride = 16
  aspp_dilation_rates = []
  pool_kernel_size = [512, 1024]
  level = int(np.math.log2(output_stride))
  config = cfg.ExperimentConfig(
      task=SemanticSegmentationTask(
          model=SemanticSegmentationModel(
              # Cityscapes uses only 19 semantic classes for train/evaluation.
              # The void (background) class is ignored in train and evaluation.
              num_classes=19,
              input_size=[None, None, 3],
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV2', output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      dilation_rates=aspp_dilation_rates,
                      pool_kernel_size=pool_kernel_size)),
              head=SegmentationHead(level=level, num_convs=0),
              norm_activation=common.NormActivation(
                  activation='relu',
                  norm_momentum=0.99,
                  norm_epsilon=1e-3,
                  use_sync_bn=True)),
          losses=Losses(l2_weight_decay=4e-5),
          train_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
                                      'train_fine**'),
              crop_size=[512, 1024],
              output_size=[1024, 2048],
              is_training=True,
              global_batch_size=train_batch_size,
              aug_scale_min=0.5,
              aug_scale_max=2.0),
          validation_data=DataConfig(
              input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
              output_size=[1024, 2048],
              is_training=False,
              global_batch_size=eval_batch_size,
              resize_eval_groundtruth=True,
              drop_remainder=False),
          # Coco pre-trained mobilenetv2 checkpoint
          init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
          init_checkpoint_modules='backbone'),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=100000,
          validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          best_checkpoint_eval_metric='mean_iou',
          best_checkpoint_export_subdir='best_ckpt',
          best_checkpoint_metric_comp='higher',
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9
                  }
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.01,
                      'decay_steps': 100000,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 5 * steps_per_epoch,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('mnv2_deeplabv3plus_cityscapes')
def mnv2_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
  """Image segmentation on cityscapes with mobilenetv2 deeplabv3plus."""
  config = mnv2_deeplabv3_cityscapes()
  config.task.model.head = SegmentationHead(
      level=4,
      num_convs=2,
      feature_fusion='deeplabv3plus',
      use_depthwise_convolution=True,
      low_level='2/depthwise',
      low_level_num_filters=48)
  config.task.model.backbone.mobilenet.output_intermediate_endpoints = True
  return config
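
For orientation, the factories registered above are looked up by name through exp_factory (the companion test below does the same); here is a minimal usage sketch, assuming a checkout where these beta modules are still importable. The override values are illustrative, not taken from this file.

# Hedged sketch: retrieve a registered experiment config and override fields.
# Assumes official.vision.beta.configs still exists at this revision.
from official.core import exp_factory
from official.vision.beta.configs import semantic_segmentation  # registers factories

config = exp_factory.get_exp_config('seg_deeplabv3_pascal')
config.task.train_data.global_batch_size = 32  # illustrative override
config.trainer.train_steps = 1000              # illustrative override
config.validate()  # enforces the restrictions declared at registration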
official/vision/beta/configs/semantic_segmentation_test.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for semantic_segmentation."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision import beta
from official.vision.beta.configs import semantic_segmentation as exp_cfg


class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('seg_deeplabv3_pascal',),
                            ('seg_deeplabv3plus_pascal',))
  def test_semantic_segmentation_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.SemanticSegmentationTask)
    self.assertIsInstance(config.task.model, exp_cfg.SemanticSegmentationModel)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/configs/video_classification.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.beta.configs import backbones_3d
from official.vision.beta.configs import common


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """The base configuration for building datasets."""
  name: Optional[str] = None
  file_type: Optional[str] = 'tfrecord'
  compressed_input: bool = False
  split: str = 'train'
  variant_name: Optional[str] = None
  feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
  temporal_stride: int = 1
  random_stride_range: int = 0
  num_test_clips: int = 1
  num_test_crops: int = 1
  num_classes: int = -1
  num_examples: int = -1
  global_batch_size: int = 128
  data_format: str = 'channels_last'
  dtype: str = 'float32'
  one_hot: bool = True
  shuffle_buffer_size: int = 64
  cache: bool = False
  input_path: str = ''
  is_training: bool = True
  cycle_length: int = 10
  drop_remainder: bool = True
  min_image_size: int = 256
  is_multilabel: bool = False
  output_audio: bool = False
  audio_feature: str = ''
  audio_feature_shape: Tuple[int, ...] = (-1,)
  aug_min_aspect_ratio: float = 0.5
  aug_max_aspect_ratio: float = 2.0
  aug_min_area_ratio: float = 0.49
  aug_max_area_ratio: float = 1.0
  aug_type: Optional[str] = None  # 'autoaug', 'randaug', or None
  image_field_key: str = 'image/encoded'
  label_field_key: str = 'clip/label/index'


def kinetics400(is_training):
  """Generates Kinetics 400 dataset configs."""
  return DataConfig(
      name='kinetics400',
      num_classes=400,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=215570 if is_training else 17706,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics600(is_training):
  """Generates Kinetics 600 dataset configs."""
  return DataConfig(
      name='kinetics600',
      num_classes=600,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=366016 if is_training else 27780,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics700(is_training):
  """Generates Kinetics 700 dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=522883 if is_training else 33441,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


def kinetics700_2020(is_training):
  """Generates Kinetics 700 (2020) dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=535982 if is_training else 33640,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))


@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
  """The model config."""
  model_type: str = 'video_classification'
  backbone: backbones_3d.Backbone3D = backbones_3d.Backbone3D(
      type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50())
  norm_activation: common.NormActivation = common.NormActivation(
      use_sync_bn=False)
  dropout_rate: float = 0.2
  aggregate_endpoints: bool = False
  require_endpoints: Optional[Tuple[str, ...]] = None


@dataclasses.dataclass
class Losses(hyperparams.Config):
  one_hot: bool = True
  label_smoothing: float = 0.0
  l2_weight_decay: float = 0.0


@dataclasses.dataclass
class Metrics(hyperparams.Config):
  use_per_class_recall: bool = False


@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
  """The task config."""
  model: VideoClassificationModel = VideoClassificationModel()
  train_data: DataConfig = DataConfig(is_training=True, drop_remainder=True)
  validation_data: DataConfig = DataConfig(
      is_training=False, drop_remainder=False)
  losses: Losses = Losses()
  metrics: Metrics = Metrics()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: str = 'all'  # all or backbone
  # Spatial Partitioning fields.
  train_input_partition_dims: Optional[Tuple[int, ...]] = None
  eval_input_partition_dims: Optional[Tuple[int, ...]] = None


def add_trainer(experiment: cfg.ExperimentConfig,
                train_batch_size: int,
                eval_batch_size: int,
                learning_rate: float = 1.6,
                train_epochs: int = 44,
                warmup_epochs: int = 5):
  """Adds and configures a trainer in the experiment config."""
  if experiment.task.train_data.num_examples <= 0:
    raise ValueError('Wrong train dataset size {!r}'.format(
        experiment.task.train_data))
  if experiment.task.validation_data.num_examples <= 0:
    raise ValueError('Wrong validation dataset size {!r}'.format(
        experiment.task.validation_data))
  experiment.task.train_data.global_batch_size = train_batch_size
  experiment.task.validation_data.global_batch_size = eval_batch_size
  steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
  experiment.trainer = cfg.TrainerConfig(
      steps_per_loop=steps_per_epoch,
      summary_interval=steps_per_epoch,
      checkpoint_interval=steps_per_epoch,
      train_steps=train_epochs * steps_per_epoch,
      validation_steps=experiment.task.validation_data.num_examples //
      eval_batch_size,
      validation_interval=steps_per_epoch,
      optimizer_config=optimization.OptimizationConfig({
          'optimizer': {
              'type': 'sgd',
              'sgd': {
                  'momentum': 0.9,
                  'nesterov': True,
              }
          },
          'learning_rate': {
              'type': 'cosine',
              'cosine': {
                  'initial_learning_rate': learning_rate,
                  'decay_steps': train_epochs * steps_per_epoch,
              }
          },
          'warmup': {
              'type': 'linear',
              'linear': {
                  'warmup_steps': warmup_epochs * steps_per_epoch,
                  'warmup_learning_rate': 0
              }
          }
      }))
  return experiment


@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
  """Video classification general."""
  return cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=VideoClassificationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])


@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
  """Video classification on UCF-101 with resnet."""
  train_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='train',
      drop_remainder=True,
      num_examples=9537,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  train_dataset.tfds_name = 'ucf101'
  train_dataset.tfds_split = 'train'
  validation_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='test',
      drop_remainder=False,
      num_examples=3783,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  validation_dataset.tfds_name = 'ucf101'
  validation_dataset.tfds_split = 'test'
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(
      config,
      train_batch_size=64,
      eval_batch_size=16,
      learning_rate=0.8,
      train_epochs=100)
  return config


@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 400 with resnet."""
  train_dataset = kinetics400(is_training=True)
  validation_dataset = kinetics400(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 600 with resnet."""
  train_dataset = kinetics600(is_training=True)
  validation_dataset = kinetics600(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 with resnet."""
  train_dataset = kinetics700(is_training=True)
  validation_dataset = kinetics700(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config


@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 2020 with resnet."""
  train_dataset = kinetics700_2020(is_training=True)
  validation_dataset = kinetics700_2020(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config
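
Since add_trainer derives the schedule from the dataset's num_examples, here is a short hedged sketch of composing it with one of the factories above; the batch sizes are illustrative, not values from this file.

# Hedged sketch: re-attach a trainer with different batch sizes.
# Assumes official.vision.beta.configs is still importable at this revision;
# batch sizes here are illustrative.
config = video_classification_kinetics400()
# steps_per_epoch becomes 215570 // 512; train_steps defaults to 44 epochs.
config = add_trainer(config, train_batch_size=512, eval_batch_size=64)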
official/vision/beta/configs/video_classification_test.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for video_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision import beta
from official.vision.beta.configs import video_classification as exp_cfg


class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('video_classification',),
                            ('video_classification_kinetics600',))
  def test_video_classification_configs(self, config_name):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.VideoClassificationTask)
    self.assertIsInstance(config.task.model, exp_cfg.VideoClassificationModel)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/data/__init__.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/beta/data/create_coco_tf_record.py (deleted, 100644 → 0)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r
"""Convert raw COCO dataset to TFRecord format.
This scripts follows the label map decoder format and supports detection
boxes, instance masks and captions.
Example usage:
python create_coco_tf_record.py --logtostderr \
--image_dir="${TRAIN_IMAGE_DIR}" \
--image_info_file="${TRAIN_IMAGE_INFO_FILE}" \
--object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
--output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
--num_shards=100
"""
import
collections
import
json
import
logging
import
os
from
absl
import
app
# pylint:disable=unused-import
from
absl
import
flags
import
numpy
as
np
from
pycocotools
import
mask
import
tensorflow
as
tf
import
multiprocessing
as
mp
from
official.vision.beta.data
import
tfrecord_lib
flags
.
DEFINE_boolean
(
'include_masks'
,
False
,
'Whether to include instance segmentations masks '
'(PNG encoded) in the result. default: False.'
)
flags
.
DEFINE_multi_string
(
'image_dir'
,
''
,
'Directory containing images.'
)
flags
.
DEFINE_string
(
'image_info_file'
,
''
,
'File containing image information. '
'Tf Examples in the output files correspond to the image '
'info entries in this file. If this file is not provided '
'object_annotations_file is used if present. Otherwise, '
'caption_annotations_file is used to get image info.'
)
flags
.
DEFINE_string
(
'object_annotations_file'
,
''
,
'File containing object '
'annotations - boxes and instance masks.'
)
flags
.
DEFINE_string
(
'caption_annotations_file'
,
''
,
'File containing image '
'captions.'
)
flags
.
DEFINE_string
(
'panoptic_annotations_file'
,
''
,
'File containing panoptic '
'annotations.'
)
flags
.
DEFINE_string
(
'panoptic_masks_dir'
,
''
,
'Directory containing panoptic masks annotations.'
)
flags
.
DEFINE_boolean
(
'include_panoptic_masks'
,
False
,
'Whether to include category and '
'instance masks in the result. These are required to run the PQ evaluator '
'default: False.'
)
flags
.
DEFINE_string
(
'output_file_prefix'
,
'/tmp/train'
,
'Path to output file'
)
flags
.
DEFINE_integer
(
'num_shards'
,
32
,
'Number of shards for output file.'
)
FLAGS
=
flags
.
FLAGS
logger
=
tf
.
get_logger
()
logger
.
setLevel
(
logging
.
INFO
)
_VOID_LABEL
=
0
_VOID_INSTANCE_ID
=
0
_THING_CLASS_ID
=
1
_STUFF_CLASSES_OFFSET
=
90
def
coco_segmentation_to_mask_png
(
segmentation
,
height
,
width
,
is_crowd
):
"""Encode a COCO mask segmentation as PNG string."""
run_len_encoding
=
mask
.
frPyObjects
(
segmentation
,
height
,
width
)
binary_mask
=
mask
.
decode
(
run_len_encoding
)
if
not
is_crowd
:
binary_mask
=
np
.
amax
(
binary_mask
,
axis
=
2
)
return
tfrecord_lib
.
encode_mask_as_png
(
binary_mask
)
def
generate_coco_panoptics_masks
(
segments_info
,
mask_path
,
include_panoptic_masks
,
is_category_thing
):
"""Creates masks for panoptic segmentation task.
Args:
segments_info: a list of dicts, where each dict has keys: [u'id',
u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for
each segment in the panoptic mask.
mask_path: path to the panoptic mask.
include_panoptic_masks: bool, when set to True, category and instance
masks are included in the outputs. Set this to True, when using
the Panoptic Quality evaluator.
is_category_thing: a dict with category ids as keys and, 0/1 as values to
represent "stuff" and "things" classes respectively.
Returns:
A dict with with keys: [u'semantic_segmentation_mask', u'category_mask',
u'instance_mask']. The dict contains 'category_mask' and 'instance_mask'
only if `include_panoptic_eval_masks` is set to True.
"""
rgb_mask
=
tfrecord_lib
.
read_image
(
mask_path
)
r
,
g
,
b
=
np
.
split
(
rgb_mask
,
3
,
axis
=-
1
)
# decode rgb encoded panoptic mask to get segments ids
# refer https://cocodataset.org/#format-data
segments_encoded_mask
=
(
r
+
g
*
256
+
b
*
(
256
**
2
)).
squeeze
()
semantic_segmentation_mask
=
np
.
ones_like
(
segments_encoded_mask
,
dtype
=
np
.
uint8
)
*
_VOID_LABEL
if
include_panoptic_masks
:
category_mask
=
np
.
ones_like
(
segments_encoded_mask
,
dtype
=
np
.
uint8
)
*
_VOID_LABEL
instance_mask
=
np
.
ones_like
(
segments_encoded_mask
,
dtype
=
np
.
uint8
)
*
_VOID_INSTANCE_ID
for
idx
,
segment
in
enumerate
(
segments_info
):
segment_id
=
segment
[
'id'
]
category_id
=
segment
[
'category_id'
]
if
is_category_thing
[
category_id
]:
encoded_category_id
=
_THING_CLASS_ID
instance_id
=
idx
+
1
else
:
encoded_category_id
=
category_id
-
_STUFF_CLASSES_OFFSET
instance_id
=
_VOID_INSTANCE_ID
segment_mask
=
(
segments_encoded_mask
==
segment_id
)
semantic_segmentation_mask
[
segment_mask
]
=
encoded_category_id
if
include_panoptic_masks
:
category_mask
[
segment_mask
]
=
category_id
instance_mask
[
segment_mask
]
=
instance_id
outputs
=
{
'semantic_segmentation_mask'
:
tfrecord_lib
.
encode_mask_as_png
(
semantic_segmentation_mask
)
}
if
include_panoptic_masks
:
outputs
.
update
({
'category_mask'
:
tfrecord_lib
.
encode_mask_as_png
(
category_mask
),
'instance_mask'
:
tfrecord_lib
.
encode_mask_as_png
(
instance_mask
)
})
return
outputs
def
coco_annotations_to_lists
(
bbox_annotations
,
id_to_name_map
,
image_height
,
image_width
,
include_masks
):
"""Converts COCO annotations to feature lists."""
data
=
dict
((
k
,
list
())
for
k
in
[
'xmin'
,
'xmax'
,
'ymin'
,
'ymax'
,
'is_crowd'
,
'category_id'
,
'category_names'
,
'area'
])
if
include_masks
:
data
[
'encoded_mask_png'
]
=
[]
num_annotations_skipped
=
0
for
object_annotations
in
bbox_annotations
:
(
x
,
y
,
width
,
height
)
=
tuple
(
object_annotations
[
'bbox'
])
if
width
<=
0
or
height
<=
0
:
num_annotations_skipped
+=
1
continue
if
x
+
width
>
image_width
or
y
+
height
>
image_height
:
num_annotations_skipped
+=
1
continue
data
[
'xmin'
].
append
(
float
(
x
)
/
image_width
)
data
[
'xmax'
].
append
(
float
(
x
+
width
)
/
image_width
)
data
[
'ymin'
].
append
(
float
(
y
)
/
image_height
)
data
[
'ymax'
].
append
(
float
(
y
+
height
)
/
image_height
)
data
[
'is_crowd'
].
append
(
object_annotations
[
'iscrowd'
])
category_id
=
int
(
object_annotations
[
'category_id'
])
data
[
'category_id'
].
append
(
category_id
)
data
[
'category_names'
].
append
(
id_to_name_map
[
category_id
].
encode
(
'utf8'
))
data
[
'area'
].
append
(
object_annotations
[
'area'
])
if
include_masks
:
data
[
'encoded_mask_png'
].
append
(
coco_segmentation_to_mask_png
(
object_annotations
[
'segmentation'
],
image_height
,
image_width
,
object_annotations
[
'iscrowd'
])
)
return
data
,
num_annotations_skipped
def
bbox_annotations_to_feature_dict
(
bbox_annotations
,
image_height
,
image_width
,
id_to_name_map
,
include_masks
):
"""Convert COCO annotations to an encoded feature dict."""
data
,
num_skipped
=
coco_annotations_to_lists
(
bbox_annotations
,
id_to_name_map
,
image_height
,
image_width
,
include_masks
)
feature_dict
=
{
'image/object/bbox/xmin'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'xmin'
]),
'image/object/bbox/xmax'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'xmax'
]),
'image/object/bbox/ymin'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'ymin'
]),
'image/object/bbox/ymax'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'ymax'
]),
'image/object/class/text'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'category_names'
]),
'image/object/class/label'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'category_id'
]),
'image/object/is_crowd'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'is_crowd'
]),
'image/object/area'
:
tfrecord_lib
.
convert_to_feature
(
data
[
'area'
]),
}
if
include_masks
:
feature_dict
[
'image/object/mask'
]
=
(
tfrecord_lib
.
convert_to_feature
(
data
[
'encoded_mask_png'
]))
return
feature_dict
,
num_skipped
def
encode_caption_annotations
(
caption_annotations
):
captions
=
[]
for
caption_annotation
in
caption_annotations
:
captions
.
append
(
caption_annotation
[
'caption'
].
encode
(
'utf8'
))
return
captions
def
create_tf_example
(
image
,
image_dirs
,
panoptic_masks_dir
=
None
,
bbox_annotations
=
None
,
id_to_name_map
=
None
,
caption_annotations
=
None
,
panoptic_annotation
=
None
,
is_category_thing
=
None
,
include_panoptic_masks
=
False
,
include_masks
=
False
):
"""Converts image and annotations to a tf.Example proto.
Args:
image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
u'width', u'date_captured', u'flickr_url', u'id']
image_dirs: list of directories containing the image files.
panoptic_masks_dir: `str` of the panoptic masks directory.
bbox_annotations:
list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
coordinates in the official COCO dataset are given as [x, y, width,
height] tuples using absolute coordinates where x, y represent the
top-left (0-indexed) corner. This function converts to the format
expected by the Tensorflow Object Detection API (which is which is
[ymin, xmin, ymax, xmax] with coordinates normalized relative to image
size).
id_to_name_map: a dict mapping category IDs to string names.
caption_annotations:
list of dict with keys: [u'id', u'image_id', u'str'].
panoptic_annotation: dict with keys: [u'image_id', u'file_name',
u'segments_info']. Where the value for segments_info is a list of dicts,
with each dict containing information for a single segment in the mask.
is_category_thing: `bool`, whether it is a category thing.
include_panoptic_masks: `bool`, whether to include panoptic masks.
include_masks: Whether to include instance segmentations masks
(PNG encoded) in the result. default: False.
Returns:
example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
does not exist, or is not unique across image directories.
"""
image_height
=
image
[
'height'
]
image_width
=
image
[
'width'
]
filename
=
image
[
'file_name'
]
image_id
=
image
[
'id'
]
if
len
(
image_dirs
)
>
1
:
full_paths
=
[
os
.
path
.
join
(
image_dir
,
filename
)
for
image_dir
in
image_dirs
]
full_existing_paths
=
[
p
for
p
in
full_paths
if
tf
.
io
.
gfile
.
exists
(
p
)]
if
not
full_existing_paths
:
raise
ValueError
(
'{} does not exist across image directories.'
.
format
(
filename
))
if
len
(
full_existing_paths
)
>
1
:
raise
ValueError
(
'{} is not unique across image directories'
.
format
(
filename
))
full_path
,
=
full_existing_paths
# If there is only one image directory, it's not worth checking for existence,
# since trying to open the file will raise an informative error message if it
# does not exist.
else
:
image_dir
,
=
image_dirs
full_path
=
os
.
path
.
join
(
image_dir
,
filename
)
with
tf
.
io
.
gfile
.
GFile
(
full_path
,
'rb'
)
as
fid
:
encoded_jpg
=
fid
.
read
()
feature_dict
=
tfrecord_lib
.
image_info_to_feature_dict
(
image_height
,
image_width
,
filename
,
image_id
,
encoded_jpg
,
'jpg'
)
num_annotations_skipped
=
0
if
bbox_annotations
:
box_feature_dict
,
num_skipped
=
bbox_annotations_to_feature_dict
(
bbox_annotations
,
image_height
,
image_width
,
id_to_name_map
,
include_masks
)
num_annotations_skipped
+=
num_skipped
feature_dict
.
update
(
box_feature_dict
)
if
caption_annotations
:
encoded_captions
=
encode_caption_annotations
(
caption_annotations
)
feature_dict
.
update
(
{
'image/caption'
:
tfrecord_lib
.
convert_to_feature
(
encoded_captions
)})
if
panoptic_annotation
:
segments_info
=
panoptic_annotation
[
'segments_info'
]
panoptic_mask_filename
=
os
.
path
.
join
(
panoptic_masks_dir
,
panoptic_annotation
[
'file_name'
])
encoded_panoptic_masks
=
generate_coco_panoptics_masks
(
segments_info
,
panoptic_mask_filename
,
include_panoptic_masks
,
is_category_thing
)
feature_dict
.
update
(
{
'image/segmentation/class/encoded'
:
tfrecord_lib
.
convert_to_feature
(
encoded_panoptic_masks
[
'semantic_segmentation_mask'
])})
if
include_panoptic_masks
:
feature_dict
.
update
({
'image/panoptic/category_mask'
:
tfrecord_lib
.
convert_to_feature
(
encoded_panoptic_masks
[
'category_mask'
]),
'image/panoptic/instance_mask'
:
tfrecord_lib
.
convert_to_feature
(
encoded_panoptic_masks
[
'instance_mask'
])
})
example
=
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
feature_dict
))
return
example
,
num_annotations_skipped
def
_load_object_annotations
(
object_annotations_file
):
"""Loads object annotation JSON file."""
with
tf
.
io
.
gfile
.
GFile
(
object_annotations_file
,
'r'
)
as
fid
:
obj_annotations
=
json
.
load
(
fid
)
images
=
obj_annotations
[
'images'
]
id_to_name_map
=
dict
((
element
[
'id'
],
element
[
'name'
])
for
element
in
obj_annotations
[
'categories'
])
img_to_obj_annotation
=
collections
.
defaultdict
(
list
)
logging
.
info
(
'Building bounding box index.'
)
for
annotation
in
obj_annotations
[
'annotations'
]:
image_id
=
annotation
[
'image_id'
]
img_to_obj_annotation
[
image_id
].
append
(
annotation
)
missing_annotation_count
=
0
for
image
in
images
:
image_id
=
image
[
'id'
]
if
image_id
not
in
img_to_obj_annotation
:
missing_annotation_count
+=
1
logging
.
info
(
'%d images are missing bboxes.'
,
missing_annotation_count
)
return
img_to_obj_annotation
,
id_to_name_map


def _load_caption_annotations(caption_annotations_file):
  """Loads caption annotation JSON file."""
  with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
    caption_annotations = json.load(fid)

  img_to_caption_annotation = collections.defaultdict(list)
  logging.info('Building caption index.')
  for annotation in caption_annotations['annotations']:
    image_id = annotation['image_id']
    img_to_caption_annotation[image_id].append(annotation)

  missing_annotation_count = 0
  images = caption_annotations['images']
  for image in images:
    image_id = image['id']
    if image_id not in img_to_caption_annotation:
      missing_annotation_count += 1

  logging.info('%d images are missing captions.', missing_annotation_count)
  return img_to_caption_annotation


def _load_panoptic_annotations(panoptic_annotations_file):
  """Loads panoptic annotation from file."""
  with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid:
    panoptic_annotations = json.load(fid)

  img_to_panoptic_annotation = dict()
  logging.info('Building panoptic index.')
  for annotation in panoptic_annotations['annotations']:
    image_id = annotation['image_id']
    img_to_panoptic_annotation[image_id] = annotation

  is_category_thing = dict()
  for category_info in panoptic_annotations['categories']:
    is_category_thing[category_info['id']] = category_info['isthing'] == 1

  missing_annotation_count = 0
  images = panoptic_annotations['images']
  for image in images:
    image_id = image['id']
    if image_id not in img_to_panoptic_annotation:
      missing_annotation_count += 1
  logging.info(
      '%d images are missing panoptic annotations.', missing_annotation_count)

  return img_to_panoptic_annotation, is_category_thing


def _load_images_info(images_info_file):
  with tf.io.gfile.GFile(images_info_file, 'r') as fid:
    info_dict = json.load(fid)
  return info_dict['images']


def generate_annotations(images,
                         image_dirs,
                         panoptic_masks_dir=None,
                         img_to_obj_annotation=None,
                         img_to_caption_annotation=None,
                         img_to_panoptic_annotation=None,
                         is_category_thing=None,
                         id_to_name_map=None,
                         include_panoptic_masks=False,
                         include_masks=False):
  """Generator for COCO annotations."""
  for image in images:
    object_annotation = (img_to_obj_annotation.get(image['id'], None)
                         if img_to_obj_annotation else None)
    caption_annotation = (img_to_caption_annotation.get(image['id'], None)
                          if img_to_caption_annotation else None)
    panoptic_annotation = (img_to_panoptic_annotation.get(image['id'], None)
                           if img_to_panoptic_annotation else None)
    yield (image, image_dirs, panoptic_masks_dir, object_annotation,
           id_to_name_map, caption_annotation, panoptic_annotation,
           is_category_thing, include_panoptic_masks, include_masks)


def _create_tf_record_from_coco_annotations(images_info_file,
                                            image_dirs,
                                            output_path,
                                            num_shards,
                                            object_annotations_file=None,
                                            caption_annotations_file=None,
                                            panoptic_masks_dir=None,
                                            panoptic_annotations_file=None,
                                            include_panoptic_masks=False,
                                            include_masks=False):
  """Loads COCO annotation json files and converts to tf.Record format.

  Args:
    images_info_file: JSON file containing image info. The number of
      tf.Examples in the output tf.Record files is exactly equal to the number
      of image info entries in this file. This can be any of the
      train/val/test annotation json files, e.g.
      'image_info_test-dev2017.json', 'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
    image_dirs: List of directories containing the image files.
    output_path: Path to output tf.Record file.
    num_shards: Number of output files to create.
    object_annotations_file: JSON file containing bounding box annotations.
    caption_annotations_file: JSON file containing caption annotations.
    panoptic_masks_dir: Directory containing panoptic masks.
    panoptic_annotations_file: JSON file containing panoptic annotations.
    include_panoptic_masks: Whether to include 'category_mask' and
      'instance_mask', which are required by the panoptic quality evaluator.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
  """
  logging.info('writing to output path: %s', output_path)

  images = _load_images_info(images_info_file)

  img_to_obj_annotation = None
  img_to_caption_annotation = None
  id_to_name_map = None
  img_to_panoptic_annotation = None
  is_category_thing = None
  if object_annotations_file:
    img_to_obj_annotation, id_to_name_map = (
        _load_object_annotations(object_annotations_file))
  if caption_annotations_file:
    img_to_caption_annotation = (
        _load_caption_annotations(caption_annotations_file))
  if panoptic_annotations_file:
    img_to_panoptic_annotation, is_category_thing = (
        _load_panoptic_annotations(panoptic_annotations_file))

  coco_annotations_iter = generate_annotations(
      images=images,
      image_dirs=image_dirs,
      panoptic_masks_dir=panoptic_masks_dir,
      img_to_obj_annotation=img_to_obj_annotation,
      img_to_caption_annotation=img_to_caption_annotation,
      img_to_panoptic_annotation=img_to_panoptic_annotation,
      is_category_thing=is_category_thing,
      id_to_name_map=id_to_name_map,
      include_panoptic_masks=include_panoptic_masks,
      include_masks=include_masks)

  num_skipped = tfrecord_lib.write_tf_record_dataset(
      output_path, coco_annotations_iter, create_tf_example, num_shards)

  logging.info('Finished writing, skipped %d annotations.', num_skipped)


def main(_):
  assert FLAGS.image_dir, '`image_dir` missing.'
  assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
          FLAGS.caption_annotations_file), ('All annotation files are '
                                            'missing.')
  if FLAGS.image_info_file:
    images_info_file = FLAGS.image_info_file
  elif FLAGS.object_annotations_file:
    images_info_file = FLAGS.object_annotations_file
  else:
    images_info_file = FLAGS.caption_annotations_file

  directory = os.path.dirname(FLAGS.output_file_prefix)
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)

  _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
                                          FLAGS.output_file_prefix,
                                          FLAGS.num_shards,
                                          FLAGS.object_annotations_file,
                                          FLAGS.caption_annotations_file,
                                          FLAGS.panoptic_masks_dir,
                                          FLAGS.panoptic_annotations_file,
                                          FLAGS.include_panoptic_masks,
                                          FLAGS.include_masks)


if __name__ == '__main__':
  app.run(main)

official/vision/beta/data/process_coco_few_shot.sh  deleted  100644 → 0

#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.

tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
base_image_dir="/tmp/coco_images"
output_dir="/tmp/coco_few_shot"

while getopts ":i:o:" o; do
  case "${o}" in
    o) output_dir=${OPTARG} ;;
    i) base_image_dir=${OPTARG} ;;
    *) echo "Usage: ${0} [-i <base_image_dir>] [-o <output_dir>]" 1>&2; exit 1 ;;
  esac
done

cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
    -P "${tmp_dir}" \
    -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
    "http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

python process_coco_few_shot_json_files.py \
    --logtostderr --workdir="${tmp_dir}"

for seed in {0..9}; do
  for shots in 1 3 5 10 30; do
    python create_coco_tf_record.py \
        --logtostderr \
        --image_dir="${base_image_dir}/train2014" \
        --image_dir="${base_image_dir}/val2014" \
        --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --caption_annotations_file="" \
        --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
        --num_shards=4
  done
done

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/5k.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k" \
    --num_shards=10

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
    --object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/trainvalno5k_base" \
    --num_shards=200

python create_coco_tf_record.py \
    --logtostderr \
    --image_dir="${base_image_dir}/train2014" \
    --image_dir="${base_image_dir}/val2014" \
    --image_info_file="${tmp_dir}/datasplit/5k_base.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k_base" \
    --num_shards=10

rm -rf "${tmp_dir}"

official/vision/beta/data/process_coco_few_shot_json_files.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import collections
import itertools
import json
import logging
import os

from absl import app
from absl import flags

import tensorflow as tf

logger = tf.get_logger()
logger.setLevel(logging.INFO)

flags.DEFINE_string('workdir', None, 'Working directory.')

FLAGS = flags.FLAGS

CATEGORIES = ['airplane', 'apple', 'backpack', 'banana', 'baseball bat',
              'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird',
              'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake',
              'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch',
              'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
              'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier',
              'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife',
              'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven',
              'parking meter', 'person', 'pizza', 'potted plant',
              'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
              'sink', 'skateboard', 'skis', 'snowboard', 'spoon',
              'sports ball', 'stop sign', 'suitcase', 'surfboard',
              'teddy bear', 'tennis racket', 'tie', 'toaster', 'toilet',
              'toothbrush', 'traffic light', 'train', 'truck', 'tv',
              'umbrella', 'vase', 'wine glass', 'zebra']
SEEDS = list(range(10))
SHOTS = [1, 3, 5, 10, 30]

FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  for _category in CATEGORIES:
    FILE_SUFFIXES[(_seed, _shots)].append(
        '{}full_box_{}shot_{}_trainval.json'.format(
            # http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
            #
            #     datasplit/
            #       trainvalno5k.json
            #       5k.json
            #     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #     seed{1-9}/
            #       full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #
            # This means that the JSON files for seed0 are located in the root
            # directory rather than in a `seed?/` subdirectory, hence the
            # conditional expression below.
            '' if _seed == 0 else 'seed{}/'.format(_seed),
            _shots,
            _category))

# Base class IDs, as defined in
# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65
BASE_CLASS_IDS = [8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33,
                  34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50,
                  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 73, 74,
                  75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]


def main(unused_argv):
  workdir = FLAGS.workdir

  # Filter novel class annotations from the training and validation sets.
  for name in ('trainvalno5k', '5k'):
    file_path = os.path.join(workdir, 'datasplit', '{}.json'.format(name))
    with tf.io.gfile.GFile(file_path, 'r') as f:
      json_dict = json.load(f)

    json_dict['annotations'] = [a for a in json_dict['annotations']
                                if a['category_id'] in BASE_CLASS_IDS]
    output_path = os.path.join(workdir, 'datasplit',
                               '{}_base.json'.format(name))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(json_dict, f)

  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [os.path.join(workdir, suffix)
                  for suffix in FILE_SUFFIXES[(seed, shots)]]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))

    # Make sure that all JSON files for a given seed and shots setting have
    # the same metadata. We count on this to fuse them later on.
    metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
                       'categories': d['categories']} for d in json_dicts]
    if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licenses, or categories fields')

    # Retrieve images across all JSON files.
    images = sum((d['images'] for d in json_dicts), [])
    # Remove duplicate image entries.
    images = list({image['id']: image for image in images}.values())

    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': sum((d['annotations'] for d in json_dicts), [])
    }

    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s',
                shots, seed, output_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('workdir')
  app.run(main)

official/vision/beta/data/process_coco_panoptic.sh  deleted  100644 → 0

#!/bin/bash

sudo apt update
sudo apt install unzip aria2 -y

DATA_DIR=$1

aria2c -j 8 -Z \
  http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
  http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
  http://images.cocodataset.org/zips/train2017.zip \
  http://images.cocodataset.org/zips/val2017.zip \
  --dir=$DATA_DIR;

unzip $DATA_DIR/"*".zip -d $DATA_DIR;
mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips;

unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR
unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR

python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/val2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/val" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
  --num_shards=8 \
  --include_masks \
  --include_panoptic_masks

python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/train2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/train" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
  --num_shards=32 \
  --include_masks \
  --include_panoptic_masks

official/vision/beta/data/tfrecord_lib.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating TFRecord datasets."""
import hashlib
import io
import itertools

from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf

import multiprocessing as mp


def convert_to_feature(value, value_type=None):
  """Converts the given python object to a tf.train.Feature.

  Args:
    value: int, float, bytes or a list of them.
    value_type: optional, if specified, forces the feature to be of the given
      type. Otherwise, type is inferred automatically. Can be one of
      ['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list'].

  Returns:
    feature: A tf.train.Feature object.
  """
  if value_type is None:
    element = value[0] if isinstance(value, list) else value

    if isinstance(element, bytes):
      value_type = 'bytes'
    elif isinstance(element, (int, np.integer)):
      value_type = 'int64'
    elif isinstance(element, (float, np.floating)):
      value_type = 'float'
    else:
      raise ValueError('Cannot convert type {} to feature'.format(
          type(element)))

    if isinstance(value, list):
      value_type = value_type + '_list'

  if value_type == 'int64':
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
  elif value_type == 'int64_list':
    value = np.asarray(value).astype(np.int64).reshape(-1)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
  elif value_type == 'float':
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
  elif value_type == 'float_list':
    value = np.asarray(value).astype(np.float32).reshape(-1)
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
  elif value_type == 'bytes':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
  elif value_type == 'bytes_list':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
  else:
    raise ValueError('Unknown value_type parameter - {}'.format(value_type))
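
As a quick illustration (the values below are arbitrary), the inferred
feature kind comes from the first element, so an empty list needs an explicit
`value_type`:

f_int = convert_to_feature(7)                              # int64
f_floats = convert_to_feature([0.5, 1.5])                  # float_list
f_empty = convert_to_feature([], value_type='int64_list')  # must be explicit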


def image_info_to_feature_dict(height, width, filename, image_id,
                               encoded_str, encoded_format):
  """Convert image information to a dict of features."""
  key = hashlib.sha256(encoded_str).hexdigest()

  return {
      'image/height': convert_to_feature(height),
      'image/width': convert_to_feature(width),
      'image/filename': convert_to_feature(filename.encode('utf8')),
      'image/source_id': convert_to_feature(str(image_id).encode('utf8')),
      'image/key/sha256': convert_to_feature(key.encode('utf8')),
      'image/encoded': convert_to_feature(encoded_str),
      'image/format': convert_to_feature(encoded_format.encode('utf8')),
  }


def read_image(image_path):
  pil_image = Image.open(image_path)
  return np.asarray(pil_image)


def encode_mask_as_png(mask):
  pil_image = Image.fromarray(mask)
  output_io = io.BytesIO()
  pil_image.save(output_io, format='PNG')
  return output_io.getvalue()
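
A small round-trip sanity check (illustrative, not from the original file):
`read_image` accepts anything `PIL.Image.open` does, including a file-like
object wrapping the PNG bytes produced above.

mask = np.zeros((4, 4), dtype=np.uint8)  # a toy binary mask
mask[1:3, 1:3] = 1
png_bytes = encode_mask_as_png(mask)
assert read_image(io.BytesIO(png_bytes)).shape == (4, 4)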


def write_tf_record_dataset(output_path, annotation_iterator,
                            process_func, num_shards,
                            use_multiprocessing=True,
                            unpack_arguments=True):
  """Iterates over annotations, processes them and writes into TFRecords.

  Args:
    output_path: The prefix path to create TF record files.
    annotation_iterator: An iterator of tuples containing details about the
      dataset.
    process_func: A function which takes the elements from the tuples of
      annotation_iterator as arguments and returns a tuple of
      (tf.train.Example, int). The integer indicates the number of annotations
      that were skipped.
    num_shards: int, the number of shards to write for the dataset.
    use_multiprocessing: Whether or not to use multiple processes to write
      TF Records.
    unpack_arguments: Whether to unpack the tuples from annotation_iterator as
      individual arguments to the process func or to pass the returned value
      as it is.

  Returns:
    num_skipped: The total number of skipped annotations.
  """
  writers = [
      tf.io.TFRecordWriter(
          output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
      for i in range(num_shards)
  ]

  total_num_annotations_skipped = 0

  if use_multiprocessing:
    pool = mp.Pool()
    if unpack_arguments:
      tf_example_iterator = pool.starmap(process_func, annotation_iterator)
    else:
      tf_example_iterator = pool.imap(process_func, annotation_iterator)
  else:
    if unpack_arguments:
      tf_example_iterator = itertools.starmap(process_func,
                                              annotation_iterator)
    else:
      tf_example_iterator = map(process_func, annotation_iterator)

  for idx, (tf_example, num_annotations_skipped) in enumerate(
      tf_example_iterator):
    if idx % 100 == 0:
      logging.info('On image %d', idx)

    total_num_annotations_skipped += num_annotations_skipped
    writers[idx % num_shards].write(tf_example.SerializeToString())

  if use_multiprocessing:
    pool.close()
    pool.join()

  for writer in writers:
    writer.close()

  logging.info('Finished writing, skipped %d annotations.',
               total_num_annotations_skipped)
  return total_num_annotations_skipped


def check_and_make_dir(directory):
  """Creates the directory if it doesn't exist."""
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)

official/vision/beta/data/tfrecord_lib_test.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_lib."""
import os

from absl import flags
from absl.testing import parameterized
import tensorflow as tf

from official.vision.beta.data import tfrecord_lib

FLAGS = flags.FLAGS


def process_sample(x):
  d = {'x': x}
  return tf.train.Example(features=tf.train.Features(feature=d)), 0


def parse_function(example_proto):
  feature_description = {
      'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
  }
  return tf.io.parse_single_example(example_proto, feature_description)


class TfrecordLibTest(parameterized.TestCase):

  def test_write_tf_record_dataset(self):
    data = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)]

    path = os.path.join(FLAGS.test_tmpdir, 'train')

    tfrecord_lib.write_tf_record_dataset(
        path, data, process_sample, 3, use_multiprocessing=False)
    tfrecord_files = tf.io.gfile.glob(path + '*')

    self.assertLen(tfrecord_files, 3)

    dataset = tf.data.TFRecordDataset(tfrecord_files)
    dataset = dataset.map(parse_function)

    read_values = set(d['x'] for d in dataset.as_numpy_iterator())
    self.assertSetEqual(read_values, set(range(17)))

  def test_convert_to_feature_float(self):
    proto = tfrecord_lib.convert_to_feature(0.0)
    self.assertEqual(proto.float_list.value[0], 0.0)

  def test_convert_to_feature_int(self):
    proto = tfrecord_lib.convert_to_feature(0)
    self.assertEqual(proto.int64_list.value[0], 0)

  def test_convert_to_feature_bytes(self):
    proto = tfrecord_lib.convert_to_feature(b'123')
    self.assertEqual(proto.bytes_list.value[0], b'123')

  def test_convert_to_feature_float_list(self):
    proto = tfrecord_lib.convert_to_feature([0.0, 1.0])
    self.assertSequenceAlmostEqual(proto.float_list.value, [0.0, 1.0])

  def test_convert_to_feature_int_list(self):
    proto = tfrecord_lib.convert_to_feature([0, 1])
    self.assertSequenceAlmostEqual(proto.int64_list.value, [0, 1])

  def test_convert_to_feature_bytes_list(self):
    proto = tfrecord_lib.convert_to_feature([b'123', b'456'])
    self.assertSequenceAlmostEqual(proto.bytes_list.value, [b'123', b'456'])


if __name__ == '__main__':
  tf.test.main()

official/vision/beta/dataloaders/__init__.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

official/vision/beta/dataloaders/classification_input.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
from typing import Any, Dict, List, Optional

# Import libraries
import tensorflow as tf

from official.vision.beta.configs import common
from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import augment
from official.vision.beta.ops import preprocess_ops

MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)

DEFAULT_IMAGE_FIELD_KEY = 'image/encoded'
DEFAULT_LABEL_FIELD_KEY = 'image/class/label'


class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               is_multilabel: bool = False,
               keys_to_features: Optional[Dict[str, Any]] = None):
    if not keys_to_features:
      keys_to_features = {
          image_field_key:
              tf.io.FixedLenFeature((), tf.string, default_value=''),
      }
      if is_multilabel:
        keys_to_features.update(
            {label_field_key: tf.io.VarLenFeature(dtype=tf.int64)})
      else:
        keys_to_features.update({
            label_field_key:
                tf.io.FixedLenFeature((), tf.int64, default_value=-1)
        })
    self._keys_to_features = keys_to_features

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)
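
A minimal usage sketch (the TFRecord path is hypothetical); with the default
single-label keys, `decode` returns a dict of two scalar tensors:

raw_dataset = tf.data.TFRecordDataset('/tmp/imagenet/train-00000-of-01024')
decoder_inst = Decoder()
for serialized_example in raw_dataset.take(1):
  decoded = decoder_inst.decode(serialized_example)
  image_bytes = decoded['image/encoded']   # scalar tf.string
  label = decoded['image/class/label']     # scalar tf.int64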


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size: List[int],
               num_classes: float,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               decode_jpeg_only: bool = True,
               aug_rand_hflip: bool = True,
               aug_type: Optional[common.Augmentation] = None,
               color_jitter: float = 0.,
               random_erasing: Optional[common.RandomErasing] = None,
               is_multilabel: bool = False,
               dtype: str = 'float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride
        2^max_level.
      num_classes: `float`, number of classes.
      image_field_key: `str`, the key name to encoded image in tf.Example.
      label_field_key: `str`, the key name to label in tf.Example.
      decode_jpeg_only: `bool`, if True, only JPEG format is decoded, this is
        faster than decoding other types. Default is True.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      color_jitter: Magnitude of color jitter. If > 0, the value is used to
        generate random scale factor for brightness, contrast and saturation.
        See `preprocess_ops.color_jitter` for more details.
      random_erasing: if not None, augment input image by random erasing. See
        `augment.RandomErasing` for more details.
      is_multilabel: A `bool`, whether or not each example has multiple
        labels.
      dtype: `str`, cast output image in dtype. It can be 'float32',
        'float16', or 'bfloat16'.
    """
    self._output_size = output_size
    self._aug_rand_hflip = aug_rand_hflip
    self._num_classes = num_classes
    self._image_field_key = image_field_key
    if dtype == 'float32':
      self._dtype = tf.float32
    elif dtype == 'float16':
      self._dtype = tf.float16
    elif dtype == 'bfloat16':
      self._dtype = tf.bfloat16
    else:
      raise ValueError('dtype {!r} is not supported!'.format(dtype))
    if aug_type:
      if aug_type.type == 'autoaug':
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        self._augmenter = augment.RandAugment(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError('Augmentation policy {} not supported.'.format(
            aug_type.type))
    else:
      self._augmenter = None
    self._label_field_key = label_field_key
    self._color_jitter = color_jitter
    if random_erasing:
      self._random_erasing = augment.RandomErasing(
          probability=random_erasing.probability,
          min_area=random_erasing.min_area,
          max_area=random_erasing.max_area,
          min_aspect=random_erasing.min_aspect,
          max_aspect=random_erasing.max_aspect,
          min_count=random_erasing.min_count,
          max_count=random_erasing.max_count,
          trials=random_erasing.trials)
    else:
      self._random_erasing = None
    self._is_multilabel = is_multilabel
    self._decode_jpeg_only = decode_jpeg_only

  def _parse_train_data(self, decoded_tensors):
    """Parses data for training."""
    image = self._parse_train_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_eval_data(self, decoded_tensors):
    """Parses data for evaluation."""
    image = self._parse_eval_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_train_image(self, decoded_tensors):
    """Parses image data for training."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image_v2(
          image_bytes, image_shape)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
          lambda: preprocess_ops.center_crop_image_v2(image_bytes,
                                                      image_shape),
          lambda: cropped_image)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image(image)
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
          lambda: preprocess_ops.center_crop_image(image),
          lambda: cropped_image)

    if self._aug_rand_hflip:
      image = tf.image.random_flip_left_right(image)

    # Color jitter.
    if self._color_jitter > 0:
      image = preprocess_ops.color_jitter(image, self._color_jitter,
                                          self._color_jitter,
                                          self._color_jitter)

    # Resizes image.
    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)

    # Random erasing after the image has been normalized.
    if self._random_erasing is not None:
      image = self._random_erasing.distort(image)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  def _parse_eval_image(self, decoded_tensors):
    """Parses image data for evaluation."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Center crops.
      image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Center crops.
      image = preprocess_ops.center_crop_image(image)

    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  @classmethod
  def inference_fn(cls,
                   image: tf.Tensor,
                   input_image_size: List[int],
                   num_channels: int = 3) -> tf.Tensor:
    """Builds image model inputs for serving."""
    image = tf.cast(image, dtype=tf.float32)
    image = preprocess_ops.center_crop_image(image)
    image = tf.image.resize(
        image, input_image_size, method=tf.image.ResizeMethod.BILINEAR)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image.set_shape(input_image_size + [num_channels])
    return image
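
A minimal sketch of wiring the decoder and parser together, reusing the
hypothetical `raw_dataset` from the decoder example above; the output size
and class count are illustrative:

parser_inst = Parser(output_size=[224, 224], num_classes=1001)
train_dataset = (raw_dataset
                 .map(Decoder().decode)
                 .map(parser_inst.parse_fn(is_training=True)))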

official/vision/beta/dataloaders/decoder.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic decoder interface."""
import abc


class Decoder(object):
  """Decodes the raw data into tensors."""

  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def decode(self, serialized_example):
    """Decodes the serialized example into tensors.

    Args:
      serialized_example: a serialized string tensor that encodes the data.

    Returns:
      decoded_tensors: a dict of Tensors.
    """
    pass

official/vision/beta/dataloaders/input_reader.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset reader for vision model garden."""
from typing import Any, Callable, Optional, Tuple

import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import input_reader


def calculate_batch_sizes(total_batch_size: int,
                          pseudo_label_ratio: float) -> Tuple[int, int]:
  """Calculates labeled and pseudo-labeled dataset batch sizes.

  Returns (labeled_batch_size, pseudo_labeled_batch_size) given a
  total batch size and pseudo-label data ratio.

  Args:
    total_batch_size: The total batch size for all data.
    pseudo_label_ratio: A non-negative float ratio of pseudo-labeled
      to labeled data in a batch.

  Returns:
    (labeled_batch_size, pseudo_labeled_batch_size) as ints.

  Raises:
    ValueError: If total_batch_size is negative.
    ValueError: If pseudo_label_ratio is negative.
  """
  if total_batch_size < 0:
    raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size))
  if pseudo_label_ratio < 0.0:
    raise ValueError(
        'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio))

  ratio_factor = pseudo_label_ratio / (1.0 + pseudo_label_ratio)
  pseudo_labeled_batch_size = int(round(total_batch_size * ratio_factor))
  labeled_batch_size = total_batch_size - pseudo_labeled_batch_size
  return labeled_batch_size, pseudo_labeled_batch_size
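
For example, a total batch size of 256 with a pseudo-label ratio of 1.0 gives
ratio_factor = 0.5 and an even (128, 128) split, while a ratio of 0.25 gives
round(256 * 0.2) = 51 pseudo-labeled and 205 labeled examples per batch:

assert calculate_batch_sizes(256, 1.0) == (128, 128)
assert calculate_batch_sizes(256, 0.25) == (205, 51)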


class CombinationDatasetInputReader(input_reader.InputReader):
  """Combination dataset input reader."""

  def __init__(self,
               params: cfg.DataConfig,
               dataset_fn=tf.data.TFRecordDataset,
               pseudo_label_dataset_fn=tf.data.TFRecordDataset,
               decoder_fn: Optional[Callable[..., Any]] = None,
               sample_fn: Optional[Callable[..., Any]] = None,
               parser_fn: Optional[Callable[..., Any]] = None,
               transform_and_batch_fn: Optional[
                   Callable[[tf.data.Dataset,
                             Optional[tf.distribute.InputContext]],
                            tf.data.Dataset]] = None,
               postprocess_fn: Optional[Callable[..., Any]] = None):
    """Initializes a CombinationDatasetInputReader instance.

    This class mixes a labeled and pseudo-labeled dataset. The params
    must contain "pseudo_label_data.input_path" to specify the
    pseudo-label dataset files and "pseudo_label_data.data_ratio"
    to specify a per-batch mixing ratio of pseudo-label examples to
    labeled dataset examples.

    Args:
      params: A config_definitions.DataConfig object.
      dataset_fn: A `tf.data.Dataset` that consumes the input files. For
        example, it can be `tf.data.TFRecordDataset`.
      pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the input
        files. For example, it can be `tf.data.TFRecordDataset`.
      decoder_fn: An optional `callable` that takes the serialized data
        string and decodes them into the raw tensor dictionary.
      sample_fn: An optional `callable` that takes a `tf.data.Dataset` object
        as input and outputs the transformed dataset. It performs sampling on
        the decoded raw tensors dict before the parser_fn.
      parser_fn: An optional `callable` that takes the decoded raw tensors
        dict and parses them into a dictionary of tensors that can be
        consumed by the model. It will be executed after decoder_fn.
      transform_and_batch_fn: An optional `callable` that takes a
        `tf.data.Dataset` object and an optional `tf.distribute.InputContext`
        as input, and returns a `tf.data.Dataset` object. It will be executed
        after `parser_fn` to transform and batch the dataset; if None, after
        `parser_fn` is executed, the dataset will be batched into per-replica
        batch size.
      postprocess_fn: An optional `callable` that processes batched tensors.
        It will be executed after batching.

    Raises:
      ValueError: If drop_remainder is False.
    """
    super().__init__(
        params=params,
        dataset_fn=dataset_fn,
        decoder_fn=decoder_fn,
        sample_fn=sample_fn,
        parser_fn=parser_fn,
        transform_and_batch_fn=transform_and_batch_fn,
        postprocess_fn=postprocess_fn)
    self._pseudo_label_file_pattern = params.pseudo_label_data.input_path
    self._pseudo_label_dataset_fn = pseudo_label_dataset_fn
    self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio
    self._pseudo_label_matched_files = input_reader.match_files(
        self._pseudo_label_file_pattern)
    if not self._drop_remainder:
      raise ValueError(
          'Must use drop_remainder=True with CombinationDatasetInputReader')

  def read(self,
           input_context: Optional[tf.distribute.InputContext] = None
          ) -> tf.data.Dataset:
    """Generates a tf.data.Dataset object."""
    labeled_batch_size, pl_batch_size = calculate_batch_sizes(
        self._global_batch_size, self._pseudo_label_data_ratio)
    if not labeled_batch_size and pl_batch_size:
      raise ValueError(
          'Invalid batch_size: {} and pseudo_label_data_ratio: {}, '
          'resulting in a 0 batch size for one of the datasets.'.format(
              self._global_batch_size, self._pseudo_label_data_ratio))

    def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size,
                                       input_context, tfds_builder):
      dataset = self._read_data_source(matched_files, dataset_fn,
                                       input_context, tfds_builder)
      return self._decode_and_parse_dataset(dataset, batch_size,
                                            input_context)

    labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._matched_files,
        dataset_fn=self._dataset_fn,
        batch_size=labeled_batch_size,
        input_context=input_context,
        tfds_builder=self._tfds_builder)
    pseudo_labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._pseudo_label_matched_files,
        dataset_fn=self._pseudo_label_dataset_fn,
        batch_size=pl_batch_size,
        input_context=input_context,
        tfds_builder=False)

    def concat_fn(d1, d2):
      return tf.nest.map_structure(
          lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2)

    dataset_concat = tf.data.Dataset.zip(
        (labeled_dataset, pseudo_labeled_dataset))
    dataset_concat = dataset_concat.map(
        concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    def maybe_map_fn(dataset, fn):
      return dataset if fn is None else dataset.map(
          fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn)
    dataset_concat = self._maybe_apply_data_service(dataset_concat,
                                                    input_context)

    if self._deterministic is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self._deterministic
      dataset_concat = dataset_concat.with_options(options)
    return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE)

official/vision/beta/dataloaders/input_reader_factory.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory for getting TF-Vision input readers."""
from official.common import dataset_fn as dataset_fn_util
from official.core import config_definitions as cfg
from official.core import input_reader as core_input_reader
from official.vision.beta.dataloaders import input_reader as vision_input_reader


def input_reader_generator(params: cfg.DataConfig,
                           **kwargs) -> core_input_reader.InputReader:
  """Instantiates an input reader class according to the params.

  Args:
    params: A config_definitions.DataConfig object.
    **kwargs: Additional arguments passed to input reader initialization.

  Returns:
    An InputReader object.
  """
  if params.is_training and params.get('pseudo_label_data', False):
    return vision_input_reader.CombinationDatasetInputReader(
        params,
        pseudo_label_dataset_fn=dataset_fn_util.pick_dataset_fn(
            params.pseudo_label_data.file_type),
        **kwargs)
  else:
    return core_input_reader.InputReader(params, **kwargs)
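
A minimal sketch of the factory in use; the config values are illustrative
and `my_decoder` is a hypothetical decoder instance:

params = cfg.DataConfig(
    input_path='/tmp/coco/train*.tfrecord',
    is_training=True,
    global_batch_size=64)
# Without a `pseudo_label_data` field, this falls back to the core InputReader.
reader = input_reader_generator(params, decoder_fn=my_decoder.decode)
dataset = reader.read()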

official/vision/beta/dataloaders/maskrcnn_input.py  deleted  100644 → 0

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Mask R-CNN."""
# Import libraries
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import anchor
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               include_mask=False,
               mask_crop_size=112,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float`, IoU threshold at or above which an anchor
        is labeled positive for RPN training.
      rpn_unmatched_threshold: `float`, IoU threshold below which an anchor is
        labeled negative for RPN training.
      rpn_batch_size_per_im: `int`, number of RPN anchors sampled per image.
      rpn_fg_fraction: `float`, fraction of foreground anchors in the sampled
        RPN batch.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      include_mask: a bool to indicate whether to parse mask groundtruth.
      mask_crop_size: the size which groundtruth mask is cropped to.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size

    # Target assigning.
    self._rpn_match_threshold = rpn_match_threshold
    self._rpn_unmatched_threshold = rpn_unmatched_threshold
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Mask.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size

    # Image output dtype.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location]. The height_l
          and width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor
          with shape [height_l, width_l, anchors_per_location * 4]. The
          height_l and width_l represent the dimension of bounding box
          regression output at l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is
          represented in [y1, x1, y2, x2] format. The coordinates are w.r.t
          the scaled image that is fed to the network. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and resized
          to a fixed size determined by mask_crop_size.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = preprocess_ops.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                       2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                 image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Transfer boxes to the original image space and do normalization.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(self._rpn_match_threshold,
                                             self._rpn_unmatched_threshold,
                                             self._rpn_batch_size_per_im,
                                             self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        anchor_boxes, boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # Casts input image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': anchor_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
        'gt_boxes': preprocess_ops.clip_or_pad_to_fixed_size(
            boxes, self._max_num_instances, -1),
        'gt_classes': preprocess_ops.clip_or_pad_to_fixed_size(
            classes, self._max_num_instances, -1),
    }
    if self._include_mask:
      labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
          masks, self._max_num_instances, -1)

    return image, labels
def
_parse_eval_data
(
self
,
data
):
"""Parses data for evaluation.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
A dictionary of {'images': image, 'labels': labels} where
image: image tensor that is preproessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for training. The following
describes {key: value} pairs in the dictionary.
source_ids: Source image id. Default value -1 if the source id is
empty in the groundtruth annotation.
image_info: a 2D `Tensor` that encodes the information of the image
and the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width],
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each
level.
"""
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape)

    # Computes anchor boxes.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))

    labels = {
        'image_info': image_info,
        'anchor_boxes': anchor_boxes,
    }

    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    labels['groundtruths'] = groundtruths

    return image, labels
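The `image_info` tensor packed into the labels above follows the convention used throughout these parsers: row 0 is the original (height, width), row 1 the scaled size, row 2 the (y, x) scale, and row 3 the (y, x) offset. A minimal sketch of how detections in scaled-image pixels map back to original-image coordinates; the tensor values here are made up for illustration:

import tensorflow as tf

# Hypothetical image_info for an 800x600 image resized into the model input.
image_info = tf.constant([[800.0, 600.0],   # original (height, width)
                          [1024.0, 768.0],  # scaled (height, width)
                          [1.28, 1.28],     # (y_scale, x_scale)
                          [0.0, 0.0]])      # (y_offset, x_offset)

# A detection box in scaled-image pixels: [ymin, xmin, ymax, xmax].
box = tf.constant([128.0, 128.0, 512.0, 384.0])

# Undo the offset and scale to recover original-image coordinates, mirroring
# the cropped_boxes arithmetic in _parse_train_data above.
scale = tf.tile(image_info[2, :], [2])
offset = tf.tile(image_info[3, :], [2])
box_original = (box + offset) / scale
print(box_original.numpy())  # [100. 100. 400. 300.]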
official/vision/beta/dataloaders/parser.py
deleted
100644 → 0
View file @
f47405b5
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic parser interface."""
import abc


class Parser(object):
  """Parses data and produces tensors to be consumed by models."""

  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def _parse_train_data(self, decoded_tensors):
    """Generates images and labels that are usable for model training.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  @abc.abstractmethod
  def _parse_eval_data(self, decoded_tensors):
    """Generates images and labels that are usable for model evaluation.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  def parse_fn(self, is_training):
    """Returns a parse fn that reads and parses raw tensors from the decoder.

    Args:
      is_training: a `bool` to indicate whether it is in training mode.

    Returns:
      parse: a `callable` that takes the serialized example and generates the
        (images, labels) tuple where labels is a dict of Tensors that contains
        labels.
    """

    def parse(decoded_tensors):
      """Parses the serialized example data."""
      if is_training:
        return self._parse_train_data(decoded_tensors)
      else:
        return self._parse_eval_data(decoded_tensors)

    return parse

  @classmethod
  def inference_fn(cls, inputs):
    """Parses inputs for predictions.

    Args:
      inputs: A Tensor, or dictionary of Tensors.

    Returns:
      processed_inputs: An input tensor to the model.
    """
    pass
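To make the interface above concrete, here is a minimal sketch of a subclass plugged into a tf.data pipeline. `ToyParser` and its feature keys are invented for illustration and are not part of the repository; it assumes the `Parser` base class defined above is importable.

import tensorflow as tf

class ToyParser(Parser):
  """Trivial parser that normalizes the image and passes the label through."""

  def _parse_train_data(self, decoded_tensors):
    image = tf.cast(decoded_tensors['image'], tf.float32) / 255.0
    return image, {'labels': decoded_tensors['label']}

  def _parse_eval_data(self, decoded_tensors):
    image = tf.cast(decoded_tensors['image'], tf.float32) / 255.0
    return image, {'labels': decoded_tensors['label']}

# A dataset of already-decoded tensors, mapped through parse_fn.
dataset = tf.data.Dataset.from_tensors(
    {'image': tf.zeros([8, 8, 3], tf.uint8), 'label': tf.constant(1)})
dataset = dataset.map(ToyParser().parse_fn(is_training=True))

The design keeps decoding and parsing separate: the decoder turns serialized records into raw tensors, and the parser owns all model-specific preprocessing and target assignment.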
official/vision/beta/dataloaders/retinanet_input.py
deleted
100644 → 0
View file @
f47405b5
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for RetinaNet.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
# Import libraries
from absl import logging
import tensorflow as tf

from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import anchor
from official.vision.beta.ops import augment
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_type=None,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               use_autoaugment=False,
               autoaugment_policy_name='v0',
               skip_crowd_during_training=True,
               max_num_instances=100,
               dtype='bfloat16',
               mode=None):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divided by the largest feature stride 2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added on each
level. For instances, num_scales=2 adds one additional intermediate
anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect raito
anchors added on each level. The number indicates the ratio of width to
height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: `float` number representing the scale of size of the base
anchor to the feature stride 2^level.
match_threshold: `float` number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
aug_type: An optional Augmentation object to choose from AutoAugment and
RandAugment.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
during training.
autoaugment_policy_name: `string` that specifies the name of the
AutoAugment policy that will be used during training.
skip_crowd_during_training: `bool`, if True, skip annotations labeled with
`is_crowd` equals to 1.
max_num_instances: `int` number of maximum number of instances in an
image. The groundtruth data will be padded to `max_num_instances`.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with groundtruths in the outputs.
"""
    self._mode = mode
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Data augmentation with AutoAugment or RandAugment.
    self._augmenter = None
    if aug_type is not None:
      if aug_type.type == 'autoaug':
        logging.info('Using AutoAugment.')
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        logging.info('Using RandAugment.')
        self._augmenter = augment.RandAugment.build_for_detection(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError(f'Augmentation policy {aug_type.type} not supported.')

    # Deprecated. Data augmentation with AutoAugment.
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name

    # Data type.
    self._dtype = dtype
  def _parse_train_data(self, data):
    """Parses data for training."""
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    is_crowds = data['groundtruth_is_crowd']

    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      for k, v in attributes.items():
        attributes[k] = tf.gather(v, indices)
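      # The `tf.cond` above guards against records whose
      # `groundtruth_is_crowd` field is empty: in that case every annotation
      # is kept; otherwise only the non-crowd indices survive the gathers.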

    # Gets original image.
    image = data['image']

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image, boxes = self._augmenter.distort_with_boxes(image, boxes)
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels
  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    groundtruths = {}
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Sets up groundtruth data for evaluation.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes']),
        'image_info': image_info,
        'boxes': box_ops.denormalize_boxes(data['groundtruth_boxes'],
                                           image_shape),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    if 'groundtruth_attributes' in data:
      groundtruths['attributes'] = data['groundtruth_attributes']
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
        'groundtruths': groundtruths,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels
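A rough sketch of what `match_threshold` and `unmatched_threshold` mean for the `AnchorLabeler` calls above; the IoU values are made up, and this is not code from the file. Each anchor's best IoU against the ground-truth boxes decides whether it is a positive, a negative, or ignored by the loss.

import tensorflow as tf

# Best IoU of each anchor against any ground-truth box (values made up).
ious = tf.constant([0.9, 0.6, 0.4, 0.1])
match_threshold = 0.5      # at or above: anchor gets class and box targets
unmatched_threshold = 0.4  # below: anchor is labeled background

positive = ious >= match_threshold
negative = ious < unmatched_threshold
ignored = ~positive & ~negative  # in between: excluded from the loss

print(positive.numpy())  # [ True  True False False]
print(negative.numpy())  # [False False False  True]
print(ignored.numpy())   # [False False  True False]

With the parser's defaults, match_threshold = unmatched_threshold = 0.5, the ignored band is empty; setting match_threshold above unmatched_threshold opens it.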
official/vision/beta/dataloaders/segmentation_input.py
deleted
100644 → 0
View file @
f47405b5
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for segmentation datasets."""
import tensorflow as tf

from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import preprocess_ops


class Decoder(decoder.Decoder):
  """A tf.Example decoder for segmentation task."""

  def __init__(self):
    self._keys_to_features = {
        'image/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        'image/height':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/width':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/segmentation/class/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
    }

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)
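For reference, a small sketch of a serialized record that the `Decoder` above accepts; the tiny PNG payloads are generated on the fly and are purely illustrative, assuming the `Decoder` class above is importable.

import tensorflow as tf

# Encode a 2x2 RGB image and a matching single-channel label map as PNGs.
image_png = tf.io.encode_png(tf.zeros([2, 2, 3], tf.uint8)).numpy()
label_png = tf.io.encode_png(tf.zeros([2, 2, 1], tf.uint8)).numpy()

def _bytes(v):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[v]))

def _int64(v):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))

example = tf.train.Example(features=tf.train.Features(feature={
    'image/encoded': _bytes(image_png),
    'image/height': _int64(2),
    'image/width': _int64(2),
    'image/segmentation/class/encoded': _bytes(label_png),
}))

# Yields a dict with the four feature keys declared in _keys_to_features.
decoded = Decoder().decode(example.SerializeToString())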
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors.
  """

  def __init__(self,
               output_size,
               crop_size=None,
               resize_eval_groundtruth=True,
               groundtruth_padded_size=None,
               ignore_label=255,
               aug_rand_hflip=False,
               preserve_aspect_ratio=True,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               dtype='float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divided by the largest feature stride 2^max_level.
crop_size: `Tensor` or `list` for [height, width] of the crop. If
specified a training crop of size crop_size is returned. This is useful
for cropping original images during training while evaluating on
original image sizes.
resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are
resized to output_size.
groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
resize_eval_groundtruth is set to False, the groundtruth masks are
padded to this size.
ignore_label: `int` the pixel with ignore label will not used for training
and evaluation.
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved,
otherwise, the image is resized to output_size.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
"""
    self._output_size = output_size
    self._crop_size = crop_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
                       'specified when resize_eval_groundtruth is False.')
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label
    self._preserve_aspect_ratio = preserve_aspect_ratio

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # dtype.
    self._dtype = dtype
  def _prepare_image_and_label(self, data):
    """Prepares normalized image and label."""
    image = tf.io.decode_image(data['image/encoded'], channels=3)
    label = tf.io.decode_image(data['image/segmentation/class/encoded'],
                               channels=1)
    height = data['image/height']
    width = data['image/width']
    image = tf.reshape(image, (height, width, 3))
    label = tf.reshape(label, (1, height, width))
    label = tf.cast(label, tf.float32)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    if not self._preserve_aspect_ratio:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      image = tf.image.resize(image, self._output_size, method='bilinear')
      label = tf.image.resize(label, self._output_size, method='nearest')
      label = tf.reshape(label[:, :, -1], [1] + self._output_size)

    return image, label
  def _parse_train_data(self, data):
    """Parses data for training."""
    image, label = self._prepare_image_and_label(data)

    if self._crop_size:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      # If output_size is specified, resize image and label to the desired
      # output_size.
      if self._output_size:
        image = tf.image.resize(image, self._output_size, method='bilinear')
        label = tf.image.resize(label, self._output_size, method='nearest')

      image_mask = tf.concat([image, label], axis=2)
      image_mask_crop = tf.image.random_crop(image_mask, self._crop_size + [4])
      image = image_mask_crop[:, :, :-1]
      label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, _, label = preprocess_ops.random_horizontal_flip(
          image, masks=label)

    train_image_size = self._crop_size if self._crop_size else self._output_size

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        train_image_size,
        train_image_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Resizes and crops mask.
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Pads the label and makes sure the padded region is assigned the ignore
    # label. The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)
    label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                 train_image_size, offset)
    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)

    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Casts image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)
    return image, labels
  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    image, label = self._prepare_image_and_label(data)

    # The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image, self._output_size, self._output_size)

    if self._resize_eval_groundtruth:
      # Resizes eval masks to match input image sizes. In that case, mean IoU
      # is computed on output_size, not the original size of the images.
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                   self._output_size, offset)
    else:
      label = tf.image.pad_to_bounding_box(label, 0, 0,
                                           self._groundtruth_padded_size[0],
                                           self._groundtruth_padded_size[1])
    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)

    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Casts image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)
    return image, labels
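One subtlety worth spelling out: `resize_and_crop_masks` and `pad_to_bounding_box` pad with zeros, so the label is shifted by +1 first; after the -1 shift the padded pixels come out as -1 and are remapped to `ignore_label`, keeping real class 0 intact. A toy sketch of the same dance, with plain `tf.pad` standing in for the padding step and made-up values:

import tensorflow as tf

ignore_label = 255
label = tf.constant([[0, 1], [2, 3]])     # a 2x2 label map with classes 0..3

# Offset by +1 so that real class 0 survives the zero padding.
label = label + 1
label = tf.pad(label, [[0, 1], [0, 1]])   # pad to 3x3 with zeros
label = label - 1                         # padded pixels are now -1
label = tf.where(tf.equal(label, -1),
                 ignore_label * tf.ones_like(label), label)
print(label.numpy())
# [[  0   1 255]
#  [  2   3 255]
#  [255 255 255]]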