# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Panoptic Deeplab configuration definition."""

import dataclasses
import math
import os
from typing import List, Optional, Union

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import backbones
from official.vision.configs import common
from official.vision.configs import decoders

_COCO_INPUT_PATH_BASE = 'coco/tfrecords'
_COCO_TRAIN_EXAMPLES = 118287
_COCO_VAL_EXAMPLES = 5000


@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Panoptic Deeplab parser config."""
  ignore_label: int = 0
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_groundtruth: bool = True
  groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_hflip: bool = True
  aug_type: common.Augmentation = common.Augmentation()
  sigma: float = 8.0
  small_instance_area_threshold: int = 4096
  small_instance_weight: float = 3.0
  dtype: str = 'float32'


@dataclasses.dataclass
class TfExampleDecoder(common.TfExampleDecoder):
  """A simple TF Example decoder config."""
  panoptic_category_mask_key: str = 'image/panoptic/category_mask'
  panoptic_instance_mask_key: str = 'image/panoptic/instance_mask'


@dataclasses.dataclass
class DataDecoder(common.DataDecoder):
  """Data decoder config."""
  simple_decoder: TfExampleDecoder = TfExampleDecoder()


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  decoder: DataDecoder = DataDecoder()
  parser: Parser = Parser()
  input_path: str = ''
  drop_remainder: bool = True
  file_type: str = 'tfrecord'
  is_training: bool = True
  global_batch_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabHead(hyperparams.Config):
  """Panoptic Deeplab head config."""
  level: int = 3
  num_convs: int = 2
  num_filters: int = 256
  kernel_size: int = 5
  use_depthwise_convolution: bool = False
  upsample_factor: int = 1
  low_level: List[int] = dataclasses.field(default_factory=lambda: [3, 2])
  low_level_num_filters: List[int] = dataclasses.field(
      default_factory=lambda: [64, 32])
  fusion_num_output_filters: int = 256


@dataclasses.dataclass
class SemanticHead(PanopticDeeplabHead):
  """Semantic head config."""
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class InstanceHead(PanopticDeeplabHead):
  """Instance head config."""
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabPostProcessor(hyperparams.Config):
  """Panoptic Deeplab post-processing config."""
  output_size: List[int] = dataclasses.field(default_factory=list)
  center_score_threshold: float = 0.1
  thing_class_ids: List[int] = dataclasses.field(default_factory=list)
  label_divisor: int = 256 * 256 * 256
  stuff_area_limit: int = 4096
  ignore_label: int = 0
  nms_kernel: int = 7
  keep_k_centers: int = 200
  rescale_predictions: bool = True


@dataclasses.dataclass
class PanopticDeeplab(hyperparams.Config):
  """Panoptic Deeplab model config."""
  num_classes: int = 2
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 6
  norm_activation: common.NormActivation = common.NormActivation()
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='aspp')
  semantic_head: SemanticHead = SemanticHead()
  instance_head: InstanceHead = InstanceHead()
  shared_decoder: bool = False
  generate_panoptic_masks: bool = True
  post_processor: PanopticDeeplabPostProcessor = PanopticDeeplabPostProcessor()


@dataclasses.dataclass
class Losses(hyperparams.Config):
  """Loss config."""
  label_smoothing: float = 0.0
  ignore_label: int = 0
  class_weights: List[float] = dataclasses.field(default_factory=list)
  l2_weight_decay: float = 1e-4
  top_k_percent_pixels: float = 0.15
  segmentation_loss_weight: float = 1.0
  center_heatmap_loss_weight: float = 200
  center_offset_loss_weight: float = 0.01


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  """Evaluation config."""
  ignored_label: int = 0
  max_instances_per_category: int = 256
  offset: int = 256 * 256 * 256
  is_thing: List[bool] = dataclasses.field(default_factory=list)
  rescale_predictions: bool = True
  report_per_class_pq: bool = False
  report_per_class_iou: bool = False
  report_train_mean_iou: bool = True  # Turning this off can speed up training.


@dataclasses.dataclass
class PanopticDeeplabTask(cfg.TaskConfig):
  """Panoptic Deeplab task config."""
  model: PanopticDeeplab = PanopticDeeplab()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(
      is_training=False, drop_remainder=False)
  losses: Losses = Losses()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[
      str, List[str]] = 'all'  # all, backbone, and/or decoder
  evaluation: Evaluation = Evaluation()


@exp_factory.register_config_factory('panoptic_deeplab_resnet_coco')
def panoptic_deeplab_resnet_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab on dilated ResNet-50."""
  train_steps = 200000
  train_batch_size = 64
  eval_batch_size = 1
  steps_per_epoch = _COCO_TRAIN_EXAMPLES // train_batch_size
  validation_steps = _COCO_VAL_EXAMPLES // eval_batch_size

  num_panoptic_categories = 201
  num_thing_categories = 91
  ignore_label = 0

  # Class id 0 is the ignore label; ids 1..num_thing_categories are "thing"
  # classes and the remaining ids are "stuff".
  is_thing = [False]
  for idx in range(1, num_panoptic_categories):
    is_thing.append(idx <= num_thing_categories)

  input_size = [640, 640, 3]
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  # The decoder consumes the backbone endpoint whose spatial stride equals
  # output_stride: log2(16) == 4.
  level = int(math.log2(output_stride))

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype='bfloat16', enable_xla=True),
      task=PanopticDeeplabTask(
          init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/resnet50_v1/ckpt-436800',  # pylint: disable=line-too-long
          init_checkpoint_modules=['backbone'],
          model=PanopticDeeplab(
              num_classes=num_panoptic_categories,
              input_size=input_size,
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=50,
                      stem_type=stem_type,
                      output_stride=output_stride,
                      multigrid=multigrid,
                      se_ratio=0.25,
                      last_stage_repeats=1,
                      stochastic_depth_drop_rate=0.2)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      num_filters=256,
                      pool_kernel_size=input_size[:2],
                      dilation_rates=aspp_dilation_rates,
                      use_depthwise_convolution=True,
                      dropout_rate=0.1)),
              semantic_head=SemanticHead(
                  level=level,
                  num_convs=1,
                  num_filters=256,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[64, 32],
                  fusion_num_output_filters=256,
                  prediction_kernel_size=1),
              instance_head=InstanceHead(
                  level=level,
                  num_convs=1,
                  num_filters=32,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[32, 16],
                  fusion_num_output_filters=128,
                  prediction_kernel_size=1),
              shared_decoder=False,
              generate_panoptic_masks=True,
              post_processor=PanopticDeeplabPostProcessor(
                  output_size=input_size[:2],
                  center_score_threshold=0.1,
                  thing_class_ids=list(range(1, num_thing_categories)),
                  label_divisor=256,
                  stuff_area_limit=4096,
                  ignore_label=ignore_label,
                  nms_kernel=41,
                  keep_k_centers=200,
                  rescale_predictions=True)),
          losses=Losses(
              label_smoothing=0.0,
              ignore_label=ignore_label,
              l2_weight_decay=0.0,
              top_k_percent_pixels=0.2,
              segmentation_loss_weight=1.0,
              center_heatmap_loss_weight=200,
              center_offset_loss_weight=0.01),
          train_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_scale_min=0.5,
                  aug_scale_max=1.5,
                  aug_rand_hflip=True,
                  aug_type=common.Augmentation(
                      type='autoaug',
                      autoaug=common.AutoAugment(
                          augmentation_name='panoptic_deeplab_policy')),
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0)),
          validation_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              parser=Parser(
                  resize_eval_groundtruth=False,
                  groundtruth_padded_size=[640, 640],
                  aug_scale_min=1.0,
                  aug_scale_max=1.0,
                  aug_rand_hflip=False,
                  aug_type=None,
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0),
              drop_remainder=False),
          evaluation=Evaluation(
              ignored_label=ignore_label,
              max_instances_per_category=256,
              offset=256 * 256 * 256,
              is_thing=is_thing,
              rescale_predictions=True,
              report_per_class_pq=False,
              report_per_class_iou=False,
              report_train_mean_iou=False)),
      trainer=cfg.TrainerConfig(
          train_steps=train_steps,
          validation_steps=validation_steps,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'adam',
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.0005,
                      'decay_steps': train_steps,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('panoptic_deeplab_mobilenetv3_large_coco')
def panoptic_deeplab_mobilenetv3_large_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab on MobileNetV3-Large."""
  train_steps = 200000
  train_batch_size = 64
  eval_batch_size = 1
  steps_per_epoch = _COCO_TRAIN_EXAMPLES // train_batch_size
  validation_steps = _COCO_VAL_EXAMPLES // eval_batch_size

  num_panoptic_categories = 201
  num_thing_categories = 91
  ignore_label = 0

  is_thing = [False]
  for idx in range(1, num_panoptic_categories):
    is_thing.append(idx <= num_thing_categories)

  input_size = [640, 640, 3]
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  level = int(math.log2(output_stride))
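
  # The ExperimentConfig below mirrors the ResNet recipe above, swapping in a
  # MobileNetV3-Large backbone and training in float32 rather than bfloat16.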
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype='float32', enable_xla=True),
      task=PanopticDeeplabTask(
          init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/mobilenetv3_large/ckpt-156000',  # pylint: disable=line-too-long
          init_checkpoint_modules=['backbone'],
          model=PanopticDeeplab(
              num_classes=num_panoptic_categories,
              input_size=input_size,
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV3Large',
                      filter_size_scale=1.0,
                      stochastic_depth_drop_rate=0.0,
                      output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      num_filters=256,
                      pool_kernel_size=input_size[:2],
                      dilation_rates=aspp_dilation_rates,
                      use_depthwise_convolution=True,
                      dropout_rate=0.1)),
              semantic_head=SemanticHead(
                  level=level,
                  num_convs=1,
                  num_filters=256,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[64, 32],
                  fusion_num_output_filters=256,
                  prediction_kernel_size=1),
              instance_head=InstanceHead(
                  level=level,
                  num_convs=1,
                  num_filters=32,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[32, 16],
                  fusion_num_output_filters=128,
                  prediction_kernel_size=1),
              shared_decoder=False,
              generate_panoptic_masks=True,
              post_processor=PanopticDeeplabPostProcessor(
                  output_size=input_size[:2],
                  center_score_threshold=0.1,
                  thing_class_ids=list(range(1, num_thing_categories)),
                  label_divisor=256,
                  stuff_area_limit=4096,
                  ignore_label=ignore_label,
                  nms_kernel=41,
                  keep_k_centers=200,
                  rescale_predictions=True)),
          losses=Losses(
              label_smoothing=0.0,
              ignore_label=ignore_label,
              l2_weight_decay=0.0,
              top_k_percent_pixels=0.2,
              segmentation_loss_weight=1.0,
              center_heatmap_loss_weight=200,
              center_offset_loss_weight=0.01),
          train_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_scale_min=0.5,
                  aug_scale_max=2.0,
                  aug_rand_hflip=True,
                  aug_type=common.Augmentation(
                      type='autoaug',
                      autoaug=common.AutoAugment(
                          augmentation_name='panoptic_deeplab_policy')),
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0)),
          validation_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              parser=Parser(
                  resize_eval_groundtruth=False,
                  groundtruth_padded_size=[640, 640],
                  aug_scale_min=1.0,
                  aug_scale_max=1.0,
                  aug_rand_hflip=False,
                  aug_type=None,
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0),
              drop_remainder=False),
          evaluation=Evaluation(
              ignored_label=ignore_label,
              max_instances_per_category=256,
              offset=256 * 256 * 256,
              is_thing=is_thing,
              rescale_predictions=True,
              report_per_class_pq=False,
              report_per_class_iou=False,
              report_train_mean_iou=False)),
      trainer=cfg.TrainerConfig(
          train_steps=train_steps,
          validation_steps=validation_steps,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'adam',
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.001,
                      'decay_steps': train_steps,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
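

# Illustrative usage (a sketch, not part of the original recipes): a config
# registered above can be fetched by name and overridden before training,
# using the standard exp_factory / hyperparams override API:
#
#   from official.core import exp_factory
#   experiment = exp_factory.get_exp_config(
#       'panoptic_deeplab_mobilenetv3_large_coco')
#   experiment.override(
#       {'task': {'train_data': {'global_batch_size': 32}}}, is_strict=True)
#   experiment.validate()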


@exp_factory.register_config_factory('panoptic_deeplab_mobilenetv3_small_coco')
def panoptic_deeplab_mobilenetv3_small_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab on MobileNetV3-Small."""
  train_steps = 200000
  train_batch_size = 64
  eval_batch_size = 1
  steps_per_epoch = _COCO_TRAIN_EXAMPLES // train_batch_size
  validation_steps = _COCO_VAL_EXAMPLES // eval_batch_size

  num_panoptic_categories = 201
  num_thing_categories = 91
  ignore_label = 0

  is_thing = [False]
  for idx in range(1, num_panoptic_categories):
    is_thing.append(idx <= num_thing_categories)

  input_size = [640, 640, 3]
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  level = int(math.log2(output_stride))

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype='float32', enable_xla=True),
      task=PanopticDeeplabTask(
          init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/mobilenetv3_small/ckpt-312000',  # pylint: disable=line-too-long
          init_checkpoint_modules=['backbone'],
          model=PanopticDeeplab(
              num_classes=num_panoptic_categories,
              input_size=input_size,
              backbone=backbones.Backbone(
                  type='mobilenet',
                  mobilenet=backbones.MobileNet(
                      model_id='MobileNetV3Small',
                      filter_size_scale=1.0,
                      stochastic_depth_drop_rate=0.0,
                      output_stride=output_stride)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      num_filters=256,
                      pool_kernel_size=input_size[:2],
                      dilation_rates=aspp_dilation_rates,
                      use_depthwise_convolution=True,
                      dropout_rate=0.1)),
              semantic_head=SemanticHead(
                  level=level,
                  num_convs=1,
                  num_filters=256,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[64, 32],
                  fusion_num_output_filters=256,
                  prediction_kernel_size=1),
              instance_head=InstanceHead(
                  level=level,
                  num_convs=1,
                  num_filters=32,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[32, 16],
                  fusion_num_output_filters=128,
                  prediction_kernel_size=1),
              shared_decoder=False,
              generate_panoptic_masks=True,
              post_processor=PanopticDeeplabPostProcessor(
                  output_size=input_size[:2],
                  center_score_threshold=0.1,
                  thing_class_ids=list(range(1, num_thing_categories)),
                  label_divisor=256,
                  stuff_area_limit=4096,
                  ignore_label=ignore_label,
                  nms_kernel=41,
                  keep_k_centers=200,
                  rescale_predictions=True)),
          losses=Losses(
              label_smoothing=0.0,
              ignore_label=ignore_label,
              l2_weight_decay=0.0,
              top_k_percent_pixels=0.2,
              segmentation_loss_weight=1.0,
              center_heatmap_loss_weight=200,
              center_offset_loss_weight=0.01),
          train_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_scale_min=0.5,
                  aug_scale_max=2.0,
                  aug_rand_hflip=True,
                  aug_type=common.Augmentation(
                      type='autoaug',
                      autoaug=common.AutoAugment(
                          augmentation_name='panoptic_deeplab_policy')),
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0)),
          validation_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              parser=Parser(
                  resize_eval_groundtruth=False,
                  groundtruth_padded_size=[640, 640],
                  aug_scale_min=1.0,
                  aug_scale_max=1.0,
                  aug_rand_hflip=False,
                  aug_type=None,
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0),
              drop_remainder=False),
          evaluation=Evaluation(
              ignored_label=ignore_label,
              max_instances_per_category=256,
              offset=256 * 256 * 256,
              is_thing=is_thing,
              rescale_predictions=True,
              report_per_class_pq=False,
              report_per_class_iou=False,
              report_train_mean_iou=False)),
      trainer=cfg.TrainerConfig(
          train_steps=train_steps,
          validation_steps=validation_steps,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'adam',
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.001,
                      'decay_steps': train_steps,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
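

# A minimal smoke-test sketch (added for illustration; the experiment names
# are the ones registered in this file, and only the exp_factory / hyperparams
# APIs are assumed). Running this module directly builds each registered
# config and checks its declared restrictions.
if __name__ == '__main__':
  for _name in ('panoptic_deeplab_resnet_coco',
                'panoptic_deeplab_mobilenetv3_large_coco',
                'panoptic_deeplab_mobilenetv3_small_coco'):
    _experiment = exp_factory.get_exp_config(_name)
    _experiment.validate()  # Evaluates the `restrictions` declared above.
    print(_name, '->', _experiment.task.model.backbone.type)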