# panoptic_deeplab.py

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Panoptic Deeplab configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

import numpy as np

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones


# Directory prefix that the COCO experiments in this file join with the
# 'train*' / 'val*' file patterns to form their default input paths.
_COCO_INPUT_PATH_BASE = 'coco/tfrecords'
# Example counts; the experiments divide them by the batch sizes to derive the
# number of steps per epoch and the number of validation steps.
_COCO_TRAIN_EXAMPLES = 118287
_COCO_VAL_EXAMPLES = 5000


@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Panoptic deeplab parser config.

  Holds the ground-truth handling and augmentation hyperparameters used by the
  train and validation `DataConfig`s of this file.
  """
  ignore_label: int = 0
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_groundtruth: bool = True
  groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_hflip: bool = True
  # Built per instance through default_factory: a config object used directly
  # as a dataclass default is rejected as a mutable (unhashable) default on
  # Python 3.11+, and would otherwise be one object shared by every Parser.
  aug_type: common.Augmentation = dataclasses.field(
      default_factory=common.Augmentation)
  sigma: float = 8.0
  small_instance_area_threshold: int = 4096
  small_instance_weight: float = 3.0
  # Annotated so that dataclasses registers it as a real field; without the
  # annotation it is only a class attribute that __init__ does not accept.
  dtype: str = 'float32'


@dataclasses.dataclass
class TfExampleDecoder(common.TfExampleDecoder):
  """A simple TF Example decoder config.

  Extends the common decoder config with two key names for panoptic masks.
  """
  # Key names of the panoptic category / instance masks (presumably feature
  # keys inside the serialized tf.Example -- confirm against the decoder).
  panoptic_category_mask_key: str = 'image/panoptic/category_mask'
  panoptic_instance_mask_key: str = 'image/panoptic/instance_mask'


@dataclasses.dataclass
class DataDecoder(common.DataDecoder):
  """Data decoder config.

  Overrides `simple_decoder` so that it defaults to the panoptic-aware
  `TfExampleDecoder` declared in this file.
  """
  # default_factory gives every DataDecoder its own decoder config; a config
  # instance used directly as the default is rejected as a mutable default on
  # Python 3.11+.
  simple_decoder: TfExampleDecoder = dataclasses.field(
      default_factory=TfExampleDecoder)


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training and evaluation data."""
  # Sub-configs are created per instance via default_factory: config instances
  # used directly as dataclass defaults are rejected as mutable defaults on
  # Python 3.11+.
  decoder: DataDecoder = dataclasses.field(default_factory=DataDecoder)
  parser: Parser = dataclasses.field(default_factory=Parser)
  input_path: str = ''
  drop_remainder: bool = True
  file_type: str = 'tfrecord'
  is_training: bool = True
  global_batch_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabHead(hyperparams.Config):
  """Panoptic Deeplab head config.

  Base config shared by `SemanticHead` and `InstanceHead`, which both subclass
  it and add a `prediction_kernel_size` field.
  """
  level: int = 3
  num_convs: int = 2
  num_filters: int = 256
  kernel_size: int = 5
  use_depthwise_convolution: bool = False
  upsample_factor: int = 1
  # The two lists below default to the same length; presumably entry i of
  # low_level_num_filters applies to entry i of low_level -- TODO confirm
  # against the head implementation.
  low_level: List[int] = dataclasses.field(default_factory=lambda: [3, 2])
  low_level_num_filters: List[int] = dataclasses.field(
      default_factory=lambda: [64, 32])
  fusion_num_output_filters: int = 256


@dataclasses.dataclass
class SemanticHead(PanopticDeeplabHead):
  """Semantic head config.

  Inherits every field of `PanopticDeeplabHead` and adds
  `prediction_kernel_size`.
  """
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class InstanceHead(PanopticDeeplabHead):
  """Instance head config.

  Inherits every field of `PanopticDeeplabHead` and adds
  `prediction_kernel_size`.
  """
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabPostProcessor(hyperparams.Config):
  """Config for the Panoptic Deeplab post-processing step."""
  # List-valued fields start out empty; the experiments in this file fill them.
  output_size: List[int] = dataclasses.field(default_factory=list)
  center_score_threshold: float = 0.1
  thing_class_ids: List[int] = dataclasses.field(default_factory=list)
  label_divisor: int = 256 ** 3  # == 16777216
  stuff_area_limit: int = 4096
  ignore_label: int = 0
  nms_kernel: int = 7
  keep_k_centers: int = 200
  rescale_predictions: bool = True


@dataclasses.dataclass
class PanopticDeeplab(hyperparams.Config):
  """Panoptic Deeplab model config.

  Groups the backbone, decoder, the semantic and instance heads and the
  post-processor configs together with the model-level hyperparameters.
  """
  num_classes: int = 2
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 6
  # All sub-configs below are created per instance via default_factory: config
  # instances used directly as dataclass defaults are rejected as mutable
  # defaults on Python 3.11+ and would be shared between model configs.
  norm_activation: common.NormActivation = dataclasses.field(
      default_factory=common.NormActivation)
  backbone: backbones.Backbone = dataclasses.field(
      default_factory=lambda: backbones.Backbone(
          type='resnet', resnet=backbones.ResNet()))
  decoder: decoders.Decoder = dataclasses.field(
      default_factory=lambda: decoders.Decoder(type='aspp'))
  semantic_head: SemanticHead = dataclasses.field(default_factory=SemanticHead)
  instance_head: InstanceHead = dataclasses.field(default_factory=InstanceHead)
  shared_decoder: bool = False
  generate_panoptic_masks: bool = True
  post_processor: PanopticDeeplabPostProcessor = dataclasses.field(
      default_factory=PanopticDeeplabPostProcessor)


@dataclasses.dataclass
class Losses(hyperparams.Config):
  """Loss config of the Panoptic Deeplab task."""
  label_smoothing: float = 0.0
  ignore_label: int = 0
  class_weights: List[float] = dataclasses.field(default_factory=list)
  l2_weight_decay: float = 1e-4
  top_k_percent_pixels: float = 0.15
  # Relative weights of the three loss terms named by these fields.
  segmentation_loss_weight: float = 1.0
  center_heatmap_loss_weight: float = 200
  center_offset_loss_weight: float = 0.01


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  """Evaluation config."""
  ignored_label: int = 0
  max_instances_per_category: int = 256
  offset: int = 256 * 256 * 256
  # One flag per category index. The experiments in this file populate it with
  # booleans, hence List[bool] rather than List[float].
  is_thing: List[bool] = dataclasses.field(default_factory=list)
  rescale_predictions: bool = True
  report_per_class_pq: bool = False
  report_per_class_iou: bool = False
  report_train_mean_iou: bool = True  # Turning this off can speed up training.


@dataclasses.dataclass
class PanopticDeeplabTask(cfg.TaskConfig):
  """Panoptic deeplab task config.

  Bundles the model, the train/validation input configs, the losses, the
  checkpoint initialization options and the evaluation config.
  """
  # Sub-configs are created per instance via default_factory: config instances
  # used directly as dataclass defaults are rejected as mutable defaults on
  # Python 3.11+ and would be shared between task configs.
  model: PanopticDeeplab = dataclasses.field(default_factory=PanopticDeeplab)
  train_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(is_training=True))
  validation_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(
          is_training=False,
          drop_remainder=False))
  losses: Losses = dataclasses.field(default_factory=Losses)
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[
      str, List[str]] = 'all'  # all, backbone, and/or decoder
  evaluation: Evaluation = dataclasses.field(default_factory=Evaluation)


def _panoptic_deeplab_coco_experiment(
    *,
    backbone: backbones.Backbone,
    init_checkpoint: str,
    output_stride: int,
    mixed_precision_dtype: str,
    aug_scale_max: float,
    initial_learning_rate: float) -> cfg.ExperimentConfig:
  """Builds the COCO Panoptic Deeplab experiment shared by all backbones.

  Everything that does not depend on the backbone (decoder, heads,
  post-processing, losses, input pipelines, evaluation and trainer schedule)
  is defined once here; the registered factories only supply what differs.

  Args:
    backbone: Backbone config to plug into the model.
    init_checkpoint: Checkpoint path used to initialize the backbone.
    output_stride: Output stride the backbone was configured with. The ASPP
      decoder and both heads are placed at level `log2(output_stride)`.
    mixed_precision_dtype: Value for `RuntimeConfig.mixed_precision_dtype`.
    aug_scale_max: Upper bound of the random scale augmentation for training.
    initial_learning_rate: Initial value of the polynomial learning-rate decay.

  Returns:
    The fully populated `cfg.ExperimentConfig`.
  """
  train_steps = 200000
  train_batch_size = 64
  eval_batch_size = 1
  steps_per_epoch = _COCO_TRAIN_EXAMPLES // train_batch_size
  validation_steps = _COCO_VAL_EXAMPLES // eval_batch_size

  num_panoptic_categories = 201
  num_thing_categories = 91
  ignore_label = 0

  # Index 0 is never a thing; indices 1..num_thing_categories are things and
  # every later index is not.
  is_thing = [
      0 < idx <= num_thing_categories
      for idx in range(num_panoptic_categories)
  ]

  input_size = [640, 640, 3]
  aspp_dilation_rates = [6, 12, 18]
  # `np.math` was only an alias of the stdlib `math` module and is gone from
  # recent NumPy releases, so use NumPy's own log2.
  level = int(np.log2(output_stride))

  return cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype=mixed_precision_dtype, enable_xla=True),
      task=PanopticDeeplabTask(
          init_checkpoint=init_checkpoint,
          init_checkpoint_modules=['backbone'],
          model=PanopticDeeplab(
              num_classes=num_panoptic_categories,
              input_size=input_size,
              backbone=backbone,
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      num_filters=256,
                      pool_kernel_size=input_size[:2],
                      dilation_rates=aspp_dilation_rates,
                      use_depthwise_convolution=True,
                      dropout_rate=0.1)),
              semantic_head=SemanticHead(
                  level=level,
                  num_convs=1,
                  num_filters=256,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[64, 32],
                  fusion_num_output_filters=256,
                  prediction_kernel_size=1),
              instance_head=InstanceHead(
                  level=level,
                  num_convs=1,
                  num_filters=32,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[32, 16],
                  fusion_num_output_filters=128,
                  prediction_kernel_size=1),
              shared_decoder=False,
              generate_panoptic_masks=True,
              post_processor=PanopticDeeplabPostProcessor(
                  output_size=input_size[:2],
                  center_score_threshold=0.1,
                  # NOTE(review): this range ends at num_thing_categories - 1,
                  # whereas `is_thing` above also marks num_thing_categories
                  # itself as a thing -- confirm which bound is intended.
                  thing_class_ids=list(range(1, num_thing_categories)),
                  label_divisor=256,
                  stuff_area_limit=4096,
                  ignore_label=ignore_label,
                  nms_kernel=41,
                  keep_k_centers=200,
                  rescale_predictions=True)),
          losses=Losses(
              label_smoothing=0.0,
              ignore_label=ignore_label,
              l2_weight_decay=0.0,
              top_k_percent_pixels=0.2,
              segmentation_loss_weight=1.0,
              center_heatmap_loss_weight=200,
              center_offset_loss_weight=0.01),
          train_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_scale_min=0.5,
                  aug_scale_max=aug_scale_max,
                  aug_rand_hflip=True,
                  aug_type=common.Augmentation(
                      type='autoaug',
                      autoaug=common.AutoAugment(
                          augmentation_name='panoptic_deeplab_policy')),
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0)),
          validation_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              parser=Parser(
                  resize_eval_groundtruth=False,
                  groundtruth_padded_size=[640, 640],
                  aug_scale_min=1.0,
                  aug_scale_max=1.0,
                  aug_rand_hflip=False,
                  aug_type=None,
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0),
              drop_remainder=False),
          evaluation=Evaluation(
              ignored_label=ignore_label,
              max_instances_per_category=256,
              offset=256*256*256,
              is_thing=is_thing,
              rescale_predictions=True,
              report_per_class_pq=False,
              report_per_class_iou=False,
              report_train_mean_iou=False)),
      trainer=cfg.TrainerConfig(
          train_steps=train_steps,
          validation_steps=validation_steps,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'adam',
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': initial_learning_rate,
                      'decay_steps': train_steps,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])


@exp_factory.register_config_factory('panoptic_deeplab_resnet_coco')
def panoptic_deeplab_resnet_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab (dilated ResNet-50).

  Returns:
    The experiment config registered as 'panoptic_deeplab_resnet_coco'.
  """
  output_stride = 16
  multigrid = [1, 2, 4]
  stem_type = 'v1'

  return _panoptic_deeplab_coco_experiment(
      backbone=backbones.Backbone(
          type='dilated_resnet', dilated_resnet=backbones.DilatedResNet(
              model_id=50,
              stem_type=stem_type,
              output_stride=output_stride,
              multigrid=multigrid,
              se_ratio=0.25,
              last_stage_repeats=1,
              stochastic_depth_drop_rate=0.2)),
      init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/resnet50_v1/ckpt-436800',  # pylint: disable=line-too-long
      output_stride=output_stride,
      mixed_precision_dtype='bfloat16',
      aug_scale_max=1.5,
      initial_learning_rate=0.0005)


@exp_factory.register_config_factory('panoptic_deeplab_mobilenetv3_large_coco')
def panoptic_deeplab_mobilenetv3_large_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab (MobileNetV3 Large).

  Returns:
    The experiment config registered as
    'panoptic_deeplab_mobilenetv3_large_coco'.
  """
  output_stride = 16

  return _panoptic_deeplab_coco_experiment(
      backbone=backbones.Backbone(
          type='mobilenet', mobilenet=backbones.MobileNet(
              model_id='MobileNetV3Large',
              filter_size_scale=1.0,
              stochastic_depth_drop_rate=0.0,
              output_stride=output_stride)),
      init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/mobilenetv3_large/ckpt-156000',  # pylint: disable=line-too-long
      output_stride=output_stride,
      mixed_precision_dtype='float32',
      aug_scale_max=2.0,
      initial_learning_rate=0.001)


@exp_factory.register_config_factory('panoptic_deeplab_mobilenetv3_small_coco')
def panoptic_deeplab_mobilenetv3_small_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab (MobileNetV3 Small).

  Returns:
    The experiment config registered as
    'panoptic_deeplab_mobilenetv3_small_coco'.
  """
  output_stride = 16

  return _panoptic_deeplab_coco_experiment(
      backbone=backbones.Backbone(
          type='mobilenet', mobilenet=backbones.MobileNet(
              model_id='MobileNetV3Small',
              filter_size_scale=1.0,
              stochastic_depth_drop_rate=0.0,
              output_stride=output_stride)),
      init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/mobilenetv3_small/ckpt-312000',  # pylint: disable=line-too-long
      output_stride=output_stride,
      mixed_precision_dtype='float32',
      aug_scale_max=2.0,
      initial_learning_rate=0.001)