Unverified Commit 0225b135 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 7479dbb8 4c571a3c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for image_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import image_classification as exp_cfg
class ImageClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('resnet_imagenet',),
('resnet_rs_imagenet',),
('revnet_imagenet',),
('mobilenet_imagenet',),
)
def test_image_classification_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.ImageClassificationTask)
self.assertIsInstance(config.task.model,
exp_cfg.ImageClassificationModel)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""R-CNN(-RS) configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones
# pylint: disable=missing-class-docstring
@dataclasses.dataclass
class Parser(hyperparams.Config):
num_channels: int = 3
match_threshold: float = 0.5
unmatched_threshold: float = 0.5
aug_rand_hflip: bool = False
aug_scale_min: float = 1.0
aug_scale_max: float = 1.0
skip_crowd_during_training: bool = True
max_num_instances: int = 100
rpn_match_threshold: float = 0.7
rpn_unmatched_threshold: float = 0.3
rpn_batch_size_per_im: int = 256
rpn_fg_fraction: float = 0.5
mask_crop_size: int = 112
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""Input config for training."""
input_path: str = ''
global_batch_size: int = 0
is_training: bool = False
dtype: str = 'bfloat16'
decoder: common.DataDecoder = common.DataDecoder()
parser: Parser = Parser()
shuffle_buffer_size: int = 10000
file_type: str = 'tfrecord'
drop_remainder: bool = True
# Number of examples in the dataset; used to create the annotation file.
num_examples: int = -1
@dataclasses.dataclass
class Anchor(hyperparams.Config):
num_scales: int = 1
aspect_ratios: List[float] = dataclasses.field(
default_factory=lambda: [0.5, 1.0, 2.0])
anchor_size: float = 8.0
@dataclasses.dataclass
class RPNHead(hyperparams.Config):
num_convs: int = 1
num_filters: int = 256
use_separable_conv: bool = False
@dataclasses.dataclass
class DetectionHead(hyperparams.Config):
num_convs: int = 4
num_filters: int = 256
use_separable_conv: bool = False
num_fcs: int = 1
fc_dims: int = 1024
class_agnostic_bbox_pred: bool = False # Has to be True for Cascade RCNN.
# If additional IoUs are passed in 'cascade_iou_thresholds'
# then ensemble the class probabilities from all heads.
cascade_class_ensemble: bool = False
@dataclasses.dataclass
class ROIGenerator(hyperparams.Config):
pre_nms_top_k: int = 2000
pre_nms_score_threshold: float = 0.0
pre_nms_min_size_threshold: float = 0.0
nms_iou_threshold: float = 0.7
num_proposals: int = 1000
test_pre_nms_top_k: int = 1000
test_pre_nms_score_threshold: float = 0.0
test_pre_nms_min_size_threshold: float = 0.0
test_nms_iou_threshold: float = 0.7
test_num_proposals: int = 1000
use_batched_nms: bool = False
@dataclasses.dataclass
class ROISampler(hyperparams.Config):
mix_gt_boxes: bool = True
num_sampled_rois: int = 512
foreground_fraction: float = 0.25
foreground_iou_threshold: float = 0.5
background_iou_high_threshold: float = 0.5
background_iou_low_threshold: float = 0.0
# IoU thresholds for additional FRCNN heads in Cascade mode.
# `foreground_iou_threshold` is the first threshold.
cascade_iou_thresholds: Optional[List[float]] = None
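# Example (illustrative sketch, not part of the original file): a Cascade
# R-CNN(-RS) setup pairs a class-agnostic head with extra sampler IoU
# thresholds, as `cascadercnn_spinenet_coco` below does:
#
#   detection_head = DetectionHead(
#       class_agnostic_bbox_pred=True, cascade_class_ensemble=True)
#   roi_sampler = ROISampler(cascade_iou_thresholds=[0.6, 0.7])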
@dataclasses.dataclass
class ROIAligner(hyperparams.Config):
crop_size: int = 7
sample_offset: float = 0.5
@dataclasses.dataclass
class DetectionGenerator(hyperparams.Config):
apply_nms: bool = True
pre_nms_top_k: int = 5000
pre_nms_score_threshold: float = 0.05
nms_iou_threshold: float = 0.5
max_num_detections: int = 100
nms_version: str = 'v2' # `v2`, `v1`, `batched`
use_cpu_nms: bool = False
soft_nms_sigma: Optional[float] = None # Only works when nms_version='v1'.
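# Example (illustrative sketch): Soft-NMS only takes effect with the v1
# generator; the sigma value here is a placeholder.
#
#   detection_generator = DetectionGenerator(
#       nms_version='v1', soft_nms_sigma=0.5)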
@dataclasses.dataclass
class MaskHead(hyperparams.Config):
upsample_factor: int = 2
num_convs: int = 4
num_filters: int = 256
use_separable_conv: bool = False
class_agnostic: bool = False
@dataclasses.dataclass
class MaskSampler(hyperparams.Config):
num_sampled_masks: int = 128
@dataclasses.dataclass
class MaskROIAligner(hyperparams.Config):
crop_size: int = 14
sample_offset: float = 0.5
@dataclasses.dataclass
class MaskRCNN(hyperparams.Config):
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
min_level: int = 2
max_level: int = 6
anchor: Anchor = Anchor()
include_mask: bool = True
backbone: backbones.Backbone = backbones.Backbone(
type='resnet', resnet=backbones.ResNet())
decoder: decoders.Decoder = decoders.Decoder(
type='fpn', fpn=decoders.FPN())
rpn_head: RPNHead = RPNHead()
detection_head: DetectionHead = DetectionHead()
roi_generator: ROIGenerator = ROIGenerator()
roi_sampler: ROISampler = ROISampler()
roi_aligner: ROIAligner = ROIAligner()
detection_generator: DetectionGenerator = DetectionGenerator()
mask_head: Optional[MaskHead] = MaskHead()
mask_sampler: Optional[MaskSampler] = MaskSampler()
mask_roi_aligner: Optional[MaskROIAligner] = MaskROIAligner()
norm_activation: common.NormActivation = common.NormActivation(
norm_momentum=0.997,
norm_epsilon=0.0001,
use_sync_bn=True)
@dataclasses.dataclass
class Losses(hyperparams.Config):
loss_weight: float = 1.0
rpn_huber_loss_delta: float = 1. / 9.
frcnn_huber_loss_delta: float = 1.
l2_weight_decay: float = 0.0
rpn_score_weight: float = 1.0
rpn_box_weight: float = 1.0
frcnn_class_weight: float = 1.0
frcnn_box_weight: float = 1.0
mask_weight: float = 1.0
@dataclasses.dataclass
class MaskRCNNTask(cfg.TaskConfig):
model: MaskRCNN = MaskRCNN()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False,
drop_remainder=False)
losses: Losses = Losses()
init_checkpoint: Optional[str] = None
init_checkpoint_modules: Union[
str, List[str]] = 'all' # all, backbone, and/or decoder
annotation_file: Optional[str] = None
per_category_metrics: bool = False
# If set, we only use masks for the specified class IDs.
allowed_mask_class_ids: Optional[List[int]] = None
# If set, the COCO metrics will be computed.
use_coco_metrics: bool = True
# If set, the Waymo Open Dataset evaluator will be used.
use_wod_metrics: bool = False
COCO_INPUT_PATH_BASE = 'coco'
@exp_factory.register_config_factory('fasterrcnn_resnetfpn_coco')
def fasterrcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Faster R-CNN."""
steps_per_epoch = 500
coco_val_samples = 5000
train_batch_size = 64
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=MaskRCNNTask(
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
init_checkpoint_modules='backbone',
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
num_classes=91,
input_size=[1024, 1024, 3],
include_mask=False,
mask_head=None,
mask_sampler=None,
mask_roi_aligner=None),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=22500,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [15000, 20000],
'values': [0.12, 0.012, 0.0012],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 500,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
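# Usage sketch (not part of this file): registered experiments are retrieved
# by name and fields can be overridden before validation; the batch size
# override below is purely illustrative.
#
#   from official.core import exp_factory
#   config = exp_factory.get_exp_config('fasterrcnn_resnetfpn_coco')
#   config.task.train_data.global_batch_size = 32
#   config.validate()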
@exp_factory.register_config_factory('maskrcnn_resnetfpn_coco')
def maskrcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Mask R-CNN."""
steps_per_epoch = 500
coco_val_samples = 5000
train_batch_size = 64
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(
mixed_precision_dtype='bfloat16', enable_xla=True),
task=MaskRCNNTask(
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
init_checkpoint_modules='backbone',
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
num_classes=91, input_size=[1024, 1024, 3], include_mask=True),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=22500,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [15000, 20000],
'values': [0.12, 0.012, 0.0012],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 500,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('maskrcnn_spinenet_coco')
def maskrcnn_spinenet_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Mask R-CNN with SpineNet backbone."""
steps_per_epoch = 463
coco_val_samples = 5000
train_batch_size = 256
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=MaskRCNNTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
backbone=backbones.Backbone(
type='spinenet',
spinenet=backbones.SpineNet(
model_id='49',
min_level=3,
max_level=7,
)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(use_sync_bn=True),
num_classes=91,
input_size=[640, 640, 3],
min_level=3,
max_level=7,
include_mask=True),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.5, aug_scale_max=2.0)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=steps_per_epoch * 350,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
steps_per_epoch * 320, steps_per_epoch * 340
],
'values': [0.32, 0.032, 0.0032],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet.min_level',
'task.model.max_level == task.model.backbone.spinenet.max_level',
])
return config
@exp_factory.register_config_factory('cascadercnn_spinenet_coco')
def cascadercnn_spinenet_coco() -> cfg.ExperimentConfig:
"""COCO object detection with Cascade RCNN-RS with SpineNet backbone."""
steps_per_epoch = 463
coco_val_samples = 5000
train_batch_size = 256
eval_batch_size = 8
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=MaskRCNNTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=MaskRCNN(
backbone=backbones.Backbone(
type='spinenet',
spinenet=backbones.SpineNet(
model_id='49',
min_level=3,
max_level=7,
)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
roi_sampler=ROISampler(cascade_iou_thresholds=[0.6, 0.7]),
detection_head=DetectionHead(
class_agnostic_bbox_pred=True, cascade_class_ensemble=True),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(
use_sync_bn=True, activation='swish'),
num_classes=91,
input_size=[640, 640, 3],
min_level=3,
max_level=7,
include_mask=True),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.5)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False)),
trainer=cfg.TrainerConfig(
train_steps=steps_per_epoch * 500,
validation_steps=coco_val_samples // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
steps_per_epoch * 475, steps_per_epoch * 490
],
'values': [0.32, 0.032, 0.0032],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet.min_level',
'task.model.max_level == task.model.backbone.spinenet.max_level',
])
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for maskrcnn."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import maskrcnn as exp_cfg
class MaskRCNNConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('fasterrcnn_resnetfpn_coco',),
('maskrcnn_resnetfpn_coco',),
('maskrcnn_spinenet_coco',),
('cascadercnn_spinenet_coco',),
)
def test_maskrcnn_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.MaskRCNNTask)
self.assertIsInstance(config.task.model, exp_cfg.MaskRCNN)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
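# Note: the misspelled 'inconsistncy' below matches the error message actually
# raised by config.validate(), so the regex must keep the misspelling.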
with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""RetinaNet configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones
# pylint: disable=missing-class-docstring
# Keep for backward compatibility.
@dataclasses.dataclass
class TfExampleDecoder(common.TfExampleDecoder):
"""A simple TF Example decoder config."""
# Keep for backward compatibility.
@dataclasses.dataclass
class TfExampleDecoderLabelMap(common.TfExampleDecoderLabelMap):
"""TF Example decoder with label map config."""
# Keep for backward compatibility.
@dataclasses.dataclass
class DataDecoder(common.DataDecoder):
"""Data decoder config."""
@dataclasses.dataclass
class Parser(hyperparams.Config):
num_channels: int = 3
match_threshold: float = 0.5
unmatched_threshold: float = 0.5
aug_rand_hflip: bool = False
aug_scale_min: float = 1.0
aug_scale_max: float = 1.0
skip_crowd_during_training: bool = True
max_num_instances: int = 100
# Can choose AutoAugment and RandAugment.
aug_type: Optional[common.Augmentation] = None
# Keep for backward compatibility. Not used.
aug_policy: Optional[str] = None
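# Example (illustrative sketch): enabling RandAugment through `aug_type`,
# assuming `common.Augmentation` selects the policy via its `type` field as in
# configs/common.py:
#
#   parser = Parser(
#       aug_rand_hflip=True,
#       aug_type=common.Augmentation(type='randaug'))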
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""Input config for training."""
input_path: str = ''
global_batch_size: int = 0
is_training: bool = False
dtype: str = 'bfloat16'
decoder: common.DataDecoder = common.DataDecoder()
parser: Parser = Parser()
shuffle_buffer_size: int = 10000
file_type: str = 'tfrecord'
@dataclasses.dataclass
class Anchor(hyperparams.Config):
num_scales: int = 3
aspect_ratios: List[float] = dataclasses.field(
default_factory=lambda: [0.5, 1.0, 2.0])
anchor_size: float = 4.0
@dataclasses.dataclass
class Losses(hyperparams.Config):
loss_weight: float = 1.0
focal_loss_alpha: float = 0.25
focal_loss_gamma: float = 1.5
huber_loss_delta: float = 0.1
box_loss_weight: int = 50
l2_weight_decay: float = 0.0
@dataclasses.dataclass
class AttributeHead(hyperparams.Config):
name: str = ''
type: str = 'regression'
size: int = 1
@dataclasses.dataclass
class RetinaNetHead(hyperparams.Config):
num_convs: int = 4
num_filters: int = 256
use_separable_conv: bool = False
attribute_heads: List[AttributeHead] = dataclasses.field(default_factory=list)
@dataclasses.dataclass
class DetectionGenerator(hyperparams.Config):
apply_nms: bool = True
pre_nms_top_k: int = 5000
pre_nms_score_threshold: float = 0.05
nms_iou_threshold: float = 0.5
max_num_detections: int = 100
nms_version: str = 'v2' # `v2`, `v1`, `batched`, or `tflite`.
use_cpu_nms: bool = False
soft_nms_sigma: Optional[float] = None # Only works when nms_version='v1'.
# When nms_version = `tflite`, values from tflite_post_processing need to be
# specified. They are compatible with the input arguments used by the TFLite
# custom NMS op and override the parameters above.
tflite_post_processing: common.TFLitePostProcessingConfig = common.TFLitePostProcessingConfig(
)
max_detections: int = 200
max_classes_per_detection: int = 5
# Regular NMS runs in a multi-class fashion and is slow. Setting this to False
# uses class-agnostic NMS, which is faster.
use_regular_nms: bool = False
nms_score_threshold: float = 0.1
@dataclasses.dataclass
class RetinaNet(hyperparams.Config):
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
min_level: int = 3
max_level: int = 7
anchor: Anchor = Anchor()
backbone: backbones.Backbone = backbones.Backbone(
type='resnet', resnet=backbones.ResNet())
decoder: decoders.Decoder = decoders.Decoder(
type='fpn', fpn=decoders.FPN())
head: RetinaNetHead = RetinaNetHead()
detection_generator: DetectionGenerator = DetectionGenerator()
norm_activation: common.NormActivation = common.NormActivation()
@dataclasses.dataclass
class ExportConfig(hyperparams.Config):
output_normalized_coordinates: bool = False
cast_num_detections_to_float: bool = False
cast_detection_classes_to_float: bool = False
@dataclasses.dataclass
class RetinaNetTask(cfg.TaskConfig):
model: RetinaNet = RetinaNet()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False)
losses: Losses = Losses()
init_checkpoint: Optional[str] = None
init_checkpoint_modules: Union[
str, List[str]] = 'all' # all, backbone, and/or decoder
annotation_file: Optional[str] = None
per_category_metrics: bool = False
export_config: ExportConfig = ExportConfig()
# If set, the COCO metrics will be computed.
use_coco_metrics: bool = True
# If set, the Waymo Open Dataset evaluator will be used.
use_wod_metrics: bool = False
@exp_factory.register_config_factory('retinanet')
def retinanet() -> cfg.ExperimentConfig:
"""RetinaNet general config."""
return cfg.ExperimentConfig(
task=RetinaNetTask(),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
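# Note: the general config above leaves num_classes=0 and input_size empty; a
# minimal sketch of the overrides needed before it can train (values are
# illustrative only):
#
#   config = exp_factory.get_exp_config('retinanet')
#   config.task.model.num_classes = 91
#   config.task.model.input_size = [640, 640, 3]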
COCO_INPUT_PATH_BASE = 'coco'
COCO_TRAIN_EXAMPLES = 118287
COCO_VAL_EXAMPLES = 5000
@exp_factory.register_config_factory('retinanet_resnetfpn_coco')
def retinanet_resnetfpn_coco() -> cfg.ExperimentConfig:
"""COCO object detection with RetinaNet."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=RetinaNetTask(
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080',
init_checkpoint_modules='backbone',
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=RetinaNet(
num_classes=91,
input_size=[640, 640, 3],
norm_activation=common.NormActivation(use_sync_bn=False),
min_level=3,
max_level=7),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.2)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
train_steps=72 * steps_per_epoch,
validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
57 * steps_per_epoch, 67 * steps_per_epoch
],
'values': [
0.32 * train_batch_size / 256.0,
0.032 * train_batch_size / 256.0,
0.0032 * train_batch_size / 256.0
],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 500,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('retinanet_spinenet_coco')
def retinanet_spinenet_coco() -> cfg.ExperimentConfig:
"""COCO object detection with RetinaNet using SpineNet backbone."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
input_size = 640
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'),
task=RetinaNetTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=RetinaNet(
backbone=backbones.Backbone(
type='spinenet',
spinenet=backbones.SpineNet(
model_id='49',
stochastic_depth_drop_rate=0.2,
min_level=3,
max_level=7)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(
use_sync_bn=True, activation='swish'),
num_classes=91,
input_size=[input_size, input_size, 3],
min_level=3,
max_level=7),
losses=Losses(l2_weight_decay=4e-5),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
train_steps=500 * steps_per_epoch,
validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
475 * steps_per_epoch, 490 * steps_per_epoch
],
'values': [
0.32 * train_batch_size / 256.0,
0.032 * train_batch_size / 256.0,
0.0032 * train_batch_size / 256.0
],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet.min_level',
'task.model.max_level == task.model.backbone.spinenet.max_level',
])
return config
@exp_factory.register_config_factory('retinanet_mobile_coco')
def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
"""COCO object detection with mobile RetinaNet."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
input_size = 384
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'),
task=RetinaNetTask(
annotation_file=os.path.join(COCO_INPUT_PATH_BASE,
'instances_val2017.json'),
model=RetinaNet(
backbone=backbones.Backbone(
type='spinenet_mobile',
spinenet_mobile=backbones.SpineNetMobile(
model_id='49',
stochastic_depth_drop_rate=0.2,
min_level=3,
max_level=7,
use_keras_upsampling_2d=False)),
decoder=decoders.Decoder(
type='identity', identity=decoders.Identity()),
head=RetinaNetHead(num_filters=48, use_separable_conv=True),
anchor=Anchor(anchor_size=3),
norm_activation=common.NormActivation(
use_sync_bn=True, activation='swish'),
num_classes=91,
input_size=[input_size, input_size, 3],
min_level=3,
max_level=7),
losses=Losses(l2_weight_decay=3e-5),
train_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)),
validation_data=DataConfig(
input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
train_steps=600 * steps_per_epoch,
validation_steps=COCO_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [
575 * steps_per_epoch, 590 * steps_per_epoch
],
'values': [
0.32 * train_batch_size / 256.0,
0.032 * train_batch_size / 256.0,
0.0032 * train_batch_size / 256.0
],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 2000,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
])
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for retinanet."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import retinanet as exp_cfg
class RetinaNetConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('retinanet_resnetfpn_coco',),
('retinanet_spinenet_coco',),
('retinanet_mobile_coco',),
)
def test_retinanet_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.RetinaNetTask)
self.assertIsInstance(config.task.model, exp_cfg.RetinaNet)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
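# Note: 'inconsistncy' (sic) mirrors the actual error message raised by
# config.validate(); do not "fix" the regex.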
with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Semantic segmentation configuration definition."""
import dataclasses
import math
import os
from typing import List, Optional, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs import backbones
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""Input config for training."""
output_size: List[int] = dataclasses.field(default_factory=list)
# If crop_size is specified, the image is first resized to output_size and
# then a crop of size crop_size is taken.
crop_size: List[int] = dataclasses.field(default_factory=list)
input_path: str = ''
global_batch_size: int = 0
is_training: bool = True
dtype: str = 'float32'
shuffle_buffer_size: int = 1000
cycle_length: int = 10
# If resize_eval_groundtruth is set to False, original image sizes are used
# for eval. In that case, groundtruth_padded_size must also be specified so
# that images of variable size can be batched.
resize_eval_groundtruth: bool = True
groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
aug_scale_min: float = 1.0
aug_scale_max: float = 1.0
aug_rand_hflip: bool = True
preserve_aspect_ratio: bool = True
aug_policy: Optional[str] = None
drop_remainder: bool = True
file_type: str = 'tfrecord'
decoder: Optional[common.DataDecoder] = common.DataDecoder()
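# Example (illustrative sketch): evaluating at original image resolution
# requires a padded size so variable-size groundtruth can be batched:
#
#   val_data = DataConfig(
#       is_training=False,
#       resize_eval_groundtruth=False,
#       groundtruth_padded_size=[512, 512])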
@dataclasses.dataclass
class SegmentationHead(hyperparams.Config):
"""Segmentation head config."""
level: int = 3
num_convs: int = 2
num_filters: int = 256
use_depthwise_convolution: bool = False
prediction_kernel_size: int = 1
upsample_factor: int = 1
feature_fusion: Optional[
str] = None # None, deeplabv3plus, panoptic_fpn_fusion or pyramid_fusion
# deeplabv3plus feature fusion params
low_level: Union[int, str] = 2
low_level_num_filters: int = 48
# panoptic_fpn_fusion params
decoder_min_level: Optional[Union[int, str]] = None
decoder_max_level: Optional[Union[int, str]] = None
@dataclasses.dataclass
class MaskScoringHead(hyperparams.Config):
"""Mask Scoring head config."""
num_convs: int = 4
num_filters: int = 128
fc_input_size: List[int] = dataclasses.field(default_factory=list)
num_fcs: int = 2
fc_dims: int = 1024
@dataclasses.dataclass
class SemanticSegmentationModel(hyperparams.Config):
"""Semantic segmentation model config."""
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
min_level: int = 3
max_level: int = 6
head: SegmentationHead = SegmentationHead()
backbone: backbones.Backbone = backbones.Backbone(
type='resnet', resnet=backbones.ResNet())
decoder: decoders.Decoder = decoders.Decoder(type='identity')
mask_scoring_head: Optional[MaskScoringHead] = None
norm_activation: common.NormActivation = common.NormActivation()
@dataclasses.dataclass
class Losses(hyperparams.Config):
loss_weight: float = 1.0
label_smoothing: float = 0.0
ignore_label: int = 255
class_weights: List[float] = dataclasses.field(default_factory=list)
l2_weight_decay: float = 0.0
use_groundtruth_dimension: bool = True
top_k_percent_pixels: float = 1.0
@dataclasses.dataclass
class Evaluation(hyperparams.Config):
report_per_class_iou: bool = True
report_train_mean_iou: bool = True # Turning this off can speed up training.
@dataclasses.dataclass
class SemanticSegmentationTask(cfg.TaskConfig):
"""The model config."""
model: SemanticSegmentationModel = SemanticSegmentationModel()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False)
losses: Losses = Losses()
evaluation: Evaluation = Evaluation()
train_input_partition_dims: List[int] = dataclasses.field(
default_factory=list)
eval_input_partition_dims: List[int] = dataclasses.field(
default_factory=list)
init_checkpoint: Optional[str] = None
init_checkpoint_modules: Union[
str, List[str]] = 'all' # all, backbone, and/or decoder
@exp_factory.register_config_factory('semantic_segmentation')
def semantic_segmentation() -> cfg.ExperimentConfig:
"""Semantic segmentation general."""
return cfg.ExperimentConfig(
task=SemanticSegmentationTask(),
trainer=cfg.TrainerConfig(),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
# PASCAL VOC 2012 Dataset
PASCAL_TRAIN_EXAMPLES = 10582
PASCAL_VAL_EXAMPLES = 1449
PASCAL_INPUT_PATH_BASE = 'gs://**/pascal_voc_seg'
@exp_factory.register_config_factory('seg_deeplabv3_pascal')
def seg_deeplabv3_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal voc with resnet deeplabv3."""
train_batch_size = 16
eval_batch_size = 8
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = [12, 24, 36]  # [6, 12, 18] is the standard choice for output_stride == 16.
multigrid = [1, 2, 4]
stem_type = 'v1'
level = int(math.log2(output_stride))  # output_stride 16 -> level 4.
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='dilated_resnet', dilated_resnet=backbones.DilatedResNet(
model_id=101, output_stride=output_stride,
multigrid=multigrid, stem_type=stem_type)),
decoder=decoders.Decoder(
type='aspp', aspp=decoders.ASPP(
level=level, dilation_rates=aspp_dilation_rates)),
head=SegmentationHead(level=level, num_convs=0),
norm_activation=common.NormActivation(
activation='swish',
norm_momentum=0.9997,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
# TODO(arashwan): test changing size to 513 to match deeplab.
output_size=[512, 512],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
output_size=[512, 512],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
# resnet101
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=45 * steps_per_epoch,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007,
'decay_steps': 45 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('seg_deeplabv3plus_pascal')
def seg_deeplabv3plus_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal voc with resnet deeplabv3+."""
train_batch_size = 16
eval_batch_size = 8
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = [6, 12, 18]
multigrid = [1, 2, 4]
stem_type = 'v1'
level = int(math.log2(output_stride))
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='dilated_resnet', dilated_resnet=backbones.DilatedResNet(
model_id=101, output_stride=output_stride,
stem_type=stem_type, multigrid=multigrid)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level, dilation_rates=aspp_dilation_rates)),
head=SegmentationHead(
level=level,
num_convs=2,
feature_fusion='deeplabv3plus',
low_level=2,
low_level_num_filters=48),
norm_activation=common.NormActivation(
activation='swish',
norm_momentum=0.9997,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
output_size=[512, 512],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
output_size=[512, 512],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
# resnet101
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=45 * steps_per_epoch,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007,
'decay_steps': 45 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('seg_resnetfpn_pascal')
def seg_resnetfpn_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal voc with resnet-fpn."""
train_batch_size = 256
eval_batch_size = 32
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[512, 512, 3],
min_level=3,
max_level=7,
backbone=backbones.Backbone(
type='resnet', resnet=backbones.ResNet(model_id=50)),
decoder=decoders.Decoder(type='fpn', fpn=decoders.FPN()),
head=SegmentationHead(level=3, num_convs=3),
norm_activation=common.NormActivation(
activation='swish',
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.2,
aug_scale_max=1.5),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=450 * steps_per_epoch,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007,
'decay_steps': 450 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('mnv2_deeplabv3_pascal')
def mnv2_deeplabv3_pascal() -> cfg.ExperimentConfig:
"""Image segmentation on pascal with mobilenetv2 deeplabv3."""
train_batch_size = 16
eval_batch_size = 16
steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = []
level = int(math.log2(output_stride))
pool_kernel_size = []
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
num_classes=21,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='mobilenet',
mobilenet=backbones.MobileNet(
model_id='MobileNetV2', output_stride=output_stride)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level,
dilation_rates=aspp_dilation_rates,
pool_kernel_size=pool_kernel_size)),
head=SegmentationHead(level=level, num_convs=0),
norm_activation=common.NormActivation(
activation='relu',
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=4e-5),
train_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'),
output_size=[512, 512],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'),
output_size=[512, 512],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=False,
groundtruth_padded_size=[512, 512],
drop_remainder=False),
# mobilenetv2
init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
init_checkpoint_modules=['backbone', 'decoder']),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=30000,
validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
best_checkpoint_eval_metric='mean_iou',
best_checkpoint_export_subdir='best_ckpt',
best_checkpoint_metric_comp='higher',
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.007 * train_batch_size / 16,
'decay_steps': 30000,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
# Cityscapes Dataset (Download and process the dataset yourself)
CITYSCAPES_TRAIN_EXAMPLES = 2975
CITYSCAPES_VAL_EXAMPLES = 500
CITYSCAPES_INPUT_PATH_BASE = 'cityscapes'
@exp_factory.register_config_factory('seg_deeplabv3plus_cityscapes')
def seg_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
"""Image segmentation on cityscapes with resnet deeplabv3+."""
train_batch_size = 16
eval_batch_size = 16
steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = [6, 12, 18]
multigrid = [1, 2, 4]
stem_type = 'v1'
level = int(math.log2(output_stride))
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
# Cityscapes uses only 19 semantic classes for train/evaluation.
# The void (background) class is ignored in train and evaluation.
num_classes=19,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='dilated_resnet', dilated_resnet=backbones.DilatedResNet(
model_id=101, output_stride=output_stride,
stem_type=stem_type, multigrid=multigrid)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level, dilation_rates=aspp_dilation_rates,
pool_kernel_size=[512, 1024])),
head=SegmentationHead(
level=level,
num_convs=2,
feature_fusion='deeplabv3plus',
low_level=2,
low_level_num_filters=48),
norm_activation=common.NormActivation(
activation='swish',
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=1e-4),
train_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
'train_fine**'),
crop_size=[512, 1024],
output_size=[1024, 2048],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
output_size=[1024, 2048],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=True,
drop_remainder=False),
# resnet101
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=500 * steps_per_epoch,
validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.01,
'decay_steps': 500 * steps_per_epoch,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('mnv2_deeplabv3_cityscapes')
def mnv2_deeplabv3_cityscapes() -> cfg.ExperimentConfig:
"""Image segmentation on cityscapes with mobilenetv2 deeplabv3."""
train_batch_size = 16
eval_batch_size = 16
steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size
output_stride = 16
aspp_dilation_rates = []
pool_kernel_size = [512, 1024]
level = int(math.log2(output_stride))
config = cfg.ExperimentConfig(
task=SemanticSegmentationTask(
model=SemanticSegmentationModel(
# Cityscapes uses only 19 semantic classes for train/evaluation.
# The void (background) class is ignored in train and evaluation.
num_classes=19,
input_size=[None, None, 3],
backbone=backbones.Backbone(
type='mobilenet',
mobilenet=backbones.MobileNet(
model_id='MobileNetV2', output_stride=output_stride)),
decoder=decoders.Decoder(
type='aspp',
aspp=decoders.ASPP(
level=level,
dilation_rates=aspp_dilation_rates,
pool_kernel_size=pool_kernel_size)),
head=SegmentationHead(level=level, num_convs=0),
norm_activation=common.NormActivation(
activation='relu',
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)),
losses=Losses(l2_weight_decay=4e-5),
train_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE,
'train_fine**'),
crop_size=[512, 1024],
output_size=[1024, 2048],
is_training=True,
global_batch_size=train_batch_size,
aug_scale_min=0.5,
aug_scale_max=2.0),
validation_data=DataConfig(
input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'),
output_size=[1024, 2048],
is_training=False,
global_batch_size=eval_batch_size,
resize_eval_groundtruth=True,
drop_remainder=False),
# COCO pre-trained MobileNetV2 checkpoint.
init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63',
init_checkpoint_modules='backbone'),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=100000,
validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
best_checkpoint_eval_metric='mean_iou',
best_checkpoint_export_subdir='best_ckpt',
best_checkpoint_metric_comp='higher',
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 0.01,
'decay_steps': 100000,
'end_learning_rate': 0.0,
'power': 0.9
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 5 * steps_per_epoch,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('mnv2_deeplabv3plus_cityscapes')
def mnv2_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig:
"""Image segmentation on cityscapes with mobilenetv2 deeplabv3plus."""
config = mnv2_deeplabv3_cityscapes()
config.task.model.head = SegmentationHead(
level=4,
num_convs=2,
feature_fusion='deeplabv3plus',
use_depthwise_convolution=True,
low_level='2/depthwise',
low_level_num_filters=48)
config.task.model.backbone.mobilenet.output_intermediate_endpoints = True
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for semantic_segmentation."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import semantic_segmentation as exp_cfg
class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(('seg_deeplabv3_pascal',),
('seg_deeplabv3plus_pascal',))
def test_semantic_segmentation_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.SemanticSegmentationTask)
self.assertIsInstance(config.task.model,
exp_cfg.SemanticSegmentationModel)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import backbones_3d
from official.vision.configs import common
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""The base configuration for building datasets."""
name: Optional[str] = None
file_type: Optional[str] = 'tfrecord'
compressed_input: bool = False
split: str = 'train'
variant_name: Optional[str] = None
feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
temporal_stride: int = 1
random_stride_range: int = 0
num_test_clips: int = 1
num_test_crops: int = 1
num_classes: int = -1
num_examples: int = -1
global_batch_size: int = 128
data_format: str = 'channels_last'
dtype: str = 'float32'
one_hot: bool = True
shuffle_buffer_size: int = 64
cache: bool = False
input_path: str = ''
is_training: bool = True
cycle_length: int = 10
drop_remainder: bool = True
min_image_size: int = 256
is_multilabel: bool = False
output_audio: bool = False
audio_feature: str = ''
audio_feature_shape: Tuple[int, ...] = (-1,)
aug_min_aspect_ratio: float = 0.5
aug_max_aspect_ratio: float = 2.0
aug_min_area_ratio: float = 0.49
aug_max_area_ratio: float = 1.0
aug_type: Optional[str] = None # 'autoaug', 'randaug', or None
image_field_key: str = 'image/encoded'
label_field_key: str = 'clip/label/index'
def kinetics400(is_training):
"""Generated Kinectics 400 dataset configs."""
return DataConfig(
name='kinetics400',
num_classes=400,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=215570 if is_training else 17706,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics600(is_training):
"""Generated Kinectics 600 dataset configs."""
return DataConfig(
name='kinetics600',
num_classes=600,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=366016 if is_training else 27780,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700(is_training):
"""Generated Kinectics 600 dataset configs."""
return DataConfig(
name='kinetics700',
num_classes=700,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=522883 if is_training else 33441,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700_2020(is_training):
"""Generated Kinectics 600 dataset configs."""
return DataConfig(
name='kinetics700',
num_classes=700,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=535982 if is_training else 33640,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
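# Usage sketch: these factories feed the registered experiment configs
# below, e.g.
#   train_dataset = kinetics400(is_training=True)
#   validation_dataset = kinetics400(is_training=False)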
@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
"""The model config."""
model_type: str = 'video_classification'
backbone: backbones_3d.Backbone3D = backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50())
norm_activation: common.NormActivation = common.NormActivation(
use_sync_bn=False)
dropout_rate: float = 0.2
aggregate_endpoints: bool = False
require_endpoints: Optional[Tuple[str, ...]] = None
@dataclasses.dataclass
class Losses(hyperparams.Config):
one_hot: bool = True
label_smoothing: float = 0.0
l2_weight_decay: float = 0.0
@dataclasses.dataclass
class Metrics(hyperparams.Config):
use_per_class_recall: bool = False
@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
"""The task config."""
model: VideoClassificationModel = VideoClassificationModel()
train_data: DataConfig = DataConfig(is_training=True, drop_remainder=True)
validation_data: DataConfig = DataConfig(
is_training=False, drop_remainder=False)
losses: Losses = Losses()
metrics: Metrics = Metrics()
init_checkpoint: Optional[str] = None
init_checkpoint_modules: str = 'all' # all or backbone
# Spatial Partitioning fields.
train_input_partition_dims: Optional[Tuple[int, ...]] = None
eval_input_partition_dims: Optional[Tuple[int, ...]] = None
def add_trainer(experiment: cfg.ExperimentConfig,
train_batch_size: int,
eval_batch_size: int,
learning_rate: float = 1.6,
train_epochs: int = 44,
warmup_epochs: int = 5):
"""Add and config a trainer to the experiment config."""
if experiment.task.train_data.num_examples <= 0:
raise ValueError('Wrong train dataset size {!r}'.format(
experiment.task.train_data))
if experiment.task.validation_data.num_examples <= 0:
raise ValueError('Wrong validation dataset size {!r}'.format(
experiment.task.validation_data))
experiment.task.train_data.global_batch_size = train_batch_size
experiment.task.validation_data.global_batch_size = eval_batch_size
steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
experiment.trainer = cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=train_epochs * steps_per_epoch,
validation_steps=experiment.task.validation_data.num_examples //
eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9,
'nesterov': True,
}
},
'learning_rate': {
'type': 'cosine',
'cosine': {
'initial_learning_rate': learning_rate,
'decay_steps': train_epochs * steps_per_epoch,
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': warmup_epochs * steps_per_epoch,
'warmup_learning_rate': 0
}
}
}))
return experiment
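# Usage sketch, mirroring the registered factories below: the task's
# datasets must carry positive `num_examples`, since steps per epoch and
# validation steps are derived from them, e.g.
#   add_trainer(config, train_batch_size=1024, eval_batch_size=64)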
@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
"""Video classification general."""
return cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=VideoClassificationTask(),
trainer=cfg.TrainerConfig(),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
"""Video classification on UCF-101 with resnet."""
train_dataset = DataConfig(
name='ucf101',
num_classes=101,
is_training=True,
split='train',
drop_remainder=True,
num_examples=9537,
temporal_stride=2,
feature_shape=(32, 224, 224, 3))
train_dataset.tfds_name = 'ucf101'
train_dataset.tfds_split = 'train'
validation_dataset = DataConfig(
name='ucf101',
num_classes=101,
      is_training=False,
split='test',
drop_remainder=False,
num_examples=3783,
temporal_stride=2,
feature_shape=(32, 224, 224, 3))
validation_dataset.tfds_name = 'ucf101'
validation_dataset.tfds_split = 'test'
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(
config,
train_batch_size=64,
eval_batch_size=16,
learning_rate=0.8,
train_epochs=100)
return config
@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 400 with resnet."""
train_dataset = kinetics400(is_training=True)
validation_dataset = kinetics400(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 600 with resnet."""
train_dataset = kinetics600(is_training=True)
validation_dataset = kinetics600(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 700 with resnet."""
train_dataset = kinetics700(is_training=True)
validation_dataset = kinetics700(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
"""Video classification on Kinectics 700 2020 with resnet."""
train_dataset = kinetics700_2020(is_training=True)
validation_dataset = kinetics700_2020(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import video_classification as exp_cfg
class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(('video_classification',),
('video_classification_kinetics600',))
def test_video_classification_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.VideoClassificationTask)
self.assertIsInstance(config.task.model, exp_cfg.VideoClassificationModel)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
with self.assertRaises(KeyError):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Convert raw COCO dataset to TFRecord format.
This script follows the label map decoder format and supports detection
boxes, instance masks and captions.
Example usage:
python create_coco_tf_record.py --logtostderr \
--image_dir="${TRAIN_IMAGE_DIR}" \
--image_info_file="${TRAIN_IMAGE_INFO_FILE}" \
--object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
--output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
--num_shards=100
"""
import collections
import json
import logging
import os
from absl import app # pylint:disable=unused-import
from absl import flags
import numpy as np
from pycocotools import mask
import tensorflow as tf
from official.vision.data import tfrecord_lib
flags.DEFINE_boolean(
    'include_masks', False, 'Whether to include instance segmentation masks '
    '(PNG encoded) in the result. Default: False.')
flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
flags.DEFINE_string(
'image_info_file', '', 'File containing image information. '
    'tf.Examples in the output files correspond to the image '
'info entries in this file. If this file is not provided '
'object_annotations_file is used if present. Otherwise, '
'caption_annotations_file is used to get image info.')
flags.DEFINE_string(
'object_annotations_file', '', 'File containing object '
'annotations - boxes and instance masks.')
flags.DEFINE_string('caption_annotations_file', '', 'File containing image '
'captions.')
flags.DEFINE_string('panoptic_annotations_file', '', 'File containing panoptic '
'annotations.')
flags.DEFINE_string('panoptic_masks_dir', '',
'Directory containing panoptic masks annotations.')
flags.DEFINE_boolean(
    'include_panoptic_masks', False, 'Whether to include category and '
    'instance masks in the result. These are required to run the PQ '
    'evaluator. Default: False.')
flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file')
flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.')
FLAGS = flags.FLAGS
logger = tf.get_logger()
logger.setLevel(logging.INFO)
_VOID_LABEL = 0
_VOID_INSTANCE_ID = 0
_THING_CLASS_ID = 1
_STUFF_CLASSES_OFFSET = 90
def coco_segmentation_to_mask_png(segmentation, height, width, is_crowd):
"""Encode a COCO mask segmentation as PNG string."""
run_len_encoding = mask.frPyObjects(segmentation, height, width)
binary_mask = mask.decode(run_len_encoding)
if not is_crowd:
binary_mask = np.amax(binary_mask, axis=2)
return tfrecord_lib.encode_mask_as_png(binary_mask)
def generate_coco_panoptics_masks(segments_info, mask_path,
include_panoptic_masks,
is_category_thing):
"""Creates masks for panoptic segmentation task.
Args:
segments_info: a list of dicts, where each dict has keys: [u'id',
u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for
each segment in the panoptic mask.
mask_path: path to the panoptic mask.
include_panoptic_masks: bool, when set to True, category and instance
masks are included in the outputs. Set this to True, when using
the Panoptic Quality evaluator.
    is_category_thing: a dict with category ids as keys and 0/1 as values,
      representing "stuff" and "thing" classes respectively.
Returns:
    A dict with keys: [u'semantic_segmentation_mask', u'category_mask',
    u'instance_mask']. The dict contains 'category_mask' and 'instance_mask'
    only if `include_panoptic_masks` is set to True.
"""
rgb_mask = tfrecord_lib.read_image(mask_path)
r, g, b = np.split(rgb_mask, 3, axis=-1)
# decode rgb encoded panoptic mask to get segments ids
# refer https://cocodataset.org/#format-data
segments_encoded_mask = (r + g * 256 + b * (256**2)).squeeze()
semantic_segmentation_mask = np.ones_like(
segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL
if include_panoptic_masks:
category_mask = np.ones_like(
segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL
instance_mask = np.ones_like(
segments_encoded_mask, dtype=np.uint8) * _VOID_INSTANCE_ID
for idx, segment in enumerate(segments_info):
segment_id = segment['id']
category_id = segment['category_id']
if is_category_thing[category_id]:
encoded_category_id = _THING_CLASS_ID
instance_id = idx + 1
else:
encoded_category_id = category_id - _STUFF_CLASSES_OFFSET
instance_id = _VOID_INSTANCE_ID
segment_mask = (segments_encoded_mask == segment_id)
semantic_segmentation_mask[segment_mask] = encoded_category_id
if include_panoptic_masks:
category_mask[segment_mask] = category_id
instance_mask[segment_mask] = instance_id
outputs = {
'semantic_segmentation_mask': tfrecord_lib.encode_mask_as_png(
semantic_segmentation_mask)
}
if include_panoptic_masks:
outputs.update({
'category_mask': tfrecord_lib.encode_mask_as_png(category_mask),
'instance_mask': tfrecord_lib.encode_mask_as_png(instance_mask)
})
return outputs
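# Worked example of the id decoding above: a pixel with RGB (17, 9, 0)
# encodes segment id 17 + 9 * 256 + 0 * 256**2 = 2321. If that segment's
# category is a "thing", the pixel receives _THING_CLASS_ID and a nonzero
# instance id; otherwise its category id is shifted down by
# _STUFF_CLASSES_OFFSET and its instance id stays _VOID_INSTANCE_ID.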
def coco_annotations_to_lists(bbox_annotations, id_to_name_map,
image_height, image_width, include_masks):
"""Converts COCO annotations to feature lists."""
data = dict((k, list()) for k in
['xmin', 'xmax', 'ymin', 'ymax', 'is_crowd',
'category_id', 'category_names', 'area'])
if include_masks:
data['encoded_mask_png'] = []
num_annotations_skipped = 0
for object_annotations in bbox_annotations:
(x, y, width, height) = tuple(object_annotations['bbox'])
if width <= 0 or height <= 0:
num_annotations_skipped += 1
continue
if x + width > image_width or y + height > image_height:
num_annotations_skipped += 1
continue
data['xmin'].append(float(x) / image_width)
data['xmax'].append(float(x + width) / image_width)
data['ymin'].append(float(y) / image_height)
data['ymax'].append(float(y + height) / image_height)
data['is_crowd'].append(object_annotations['iscrowd'])
category_id = int(object_annotations['category_id'])
data['category_id'].append(category_id)
data['category_names'].append(id_to_name_map[category_id].encode('utf8'))
data['area'].append(object_annotations['area'])
if include_masks:
data['encoded_mask_png'].append(
coco_segmentation_to_mask_png(object_annotations['segmentation'],
image_height, image_width,
object_annotations['iscrowd'])
)
return data, num_annotations_skipped
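# Worked example of the normalization above: a COCO box
# [x=20, y=40, width=60, height=80] on a 200 (wide) x 400 (tall) image
# yields xmin=0.1, xmax=0.4, ymin=0.1, ymax=0.3.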
def bbox_annotations_to_feature_dict(
bbox_annotations, image_height, image_width, id_to_name_map, include_masks):
"""Convert COCO annotations to an encoded feature dict."""
data, num_skipped = coco_annotations_to_lists(
bbox_annotations, id_to_name_map, image_height, image_width,
include_masks)
feature_dict = {
'image/object/bbox/xmin':
tfrecord_lib.convert_to_feature(data['xmin']),
'image/object/bbox/xmax':
tfrecord_lib.convert_to_feature(data['xmax']),
'image/object/bbox/ymin':
tfrecord_lib.convert_to_feature(data['ymin']),
'image/object/bbox/ymax':
tfrecord_lib.convert_to_feature(data['ymax']),
'image/object/class/text':
tfrecord_lib.convert_to_feature(data['category_names']),
'image/object/class/label':
tfrecord_lib.convert_to_feature(data['category_id']),
'image/object/is_crowd':
tfrecord_lib.convert_to_feature(data['is_crowd']),
'image/object/area':
tfrecord_lib.convert_to_feature(data['area']),
}
if include_masks:
feature_dict['image/object/mask'] = (
tfrecord_lib.convert_to_feature(data['encoded_mask_png']))
return feature_dict, num_skipped
def encode_caption_annotations(caption_annotations):
captions = []
for caption_annotation in caption_annotations:
captions.append(caption_annotation['caption'].encode('utf8'))
return captions
def create_tf_example(image,
image_dirs,
panoptic_masks_dir=None,
bbox_annotations=None,
id_to_name_map=None,
caption_annotations=None,
panoptic_annotation=None,
is_category_thing=None,
include_panoptic_masks=False,
include_masks=False):
"""Converts image and annotations to a tf.Example proto.
Args:
image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
u'width', u'date_captured', u'flickr_url', u'id']
image_dirs: list of directories containing the image files.
panoptic_masks_dir: `str` of the panoptic masks directory.
bbox_annotations:
list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
coordinates in the official COCO dataset are given as [x, y, width,
height] tuples using absolute coordinates where x, y represent the
top-left (0-indexed) corner. This function converts to the format
      expected by the Tensorflow Object Detection API (which is
[ymin, xmin, ymax, xmax] with coordinates normalized relative to image
size).
id_to_name_map: a dict mapping category IDs to string names.
    caption_annotations:
      list of dicts with keys: [u'id', u'image_id', u'caption'].
panoptic_annotation: dict with keys: [u'image_id', u'file_name',
u'segments_info']. Where the value for segments_info is a list of dicts,
with each dict containing information for a single segment in the mask.
    is_category_thing: a dict with category ids as keys and booleans as
      values, indicating whether each category is a "thing" class.
include_panoptic_masks: `bool`, whether to include panoptic masks.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
Returns:
example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
does not exist, or is not unique across image directories.
"""
image_height = image['height']
image_width = image['width']
filename = image['file_name']
image_id = image['id']
if len(image_dirs) > 1:
full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs]
full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
if not full_existing_paths:
raise ValueError(
'{} does not exist across image directories.'.format(filename))
if len(full_existing_paths) > 1:
raise ValueError(
'{} is not unique across image directories'.format(filename))
full_path, = full_existing_paths
# If there is only one image directory, it's not worth checking for existence,
# since trying to open the file will raise an informative error message if it
# does not exist.
else:
image_dir, = image_dirs
full_path = os.path.join(image_dir, filename)
with tf.io.gfile.GFile(full_path, 'rb') as fid:
encoded_jpg = fid.read()
feature_dict = tfrecord_lib.image_info_to_feature_dict(
image_height, image_width, filename, image_id, encoded_jpg, 'jpg')
num_annotations_skipped = 0
if bbox_annotations:
box_feature_dict, num_skipped = bbox_annotations_to_feature_dict(
bbox_annotations, image_height, image_width, id_to_name_map,
include_masks)
num_annotations_skipped += num_skipped
feature_dict.update(box_feature_dict)
if caption_annotations:
encoded_captions = encode_caption_annotations(caption_annotations)
feature_dict.update(
{'image/caption': tfrecord_lib.convert_to_feature(encoded_captions)})
if panoptic_annotation:
segments_info = panoptic_annotation['segments_info']
panoptic_mask_filename = os.path.join(
panoptic_masks_dir,
panoptic_annotation['file_name'])
encoded_panoptic_masks = generate_coco_panoptics_masks(
segments_info, panoptic_mask_filename, include_panoptic_masks,
is_category_thing)
feature_dict.update(
{'image/segmentation/class/encoded': tfrecord_lib.convert_to_feature(
encoded_panoptic_masks['semantic_segmentation_mask'])})
if include_panoptic_masks:
feature_dict.update({
'image/panoptic/category_mask': tfrecord_lib.convert_to_feature(
encoded_panoptic_masks['category_mask']),
'image/panoptic/instance_mask': tfrecord_lib.convert_to_feature(
encoded_panoptic_masks['instance_mask'])
})
example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
return example, num_annotations_skipped
def _load_object_annotations(object_annotations_file):
"""Loads object annotation JSON file."""
with tf.io.gfile.GFile(object_annotations_file, 'r') as fid:
obj_annotations = json.load(fid)
images = obj_annotations['images']
id_to_name_map = dict((element['id'], element['name']) for element in
obj_annotations['categories'])
img_to_obj_annotation = collections.defaultdict(list)
logging.info('Building bounding box index.')
for annotation in obj_annotations['annotations']:
image_id = annotation['image_id']
img_to_obj_annotation[image_id].append(annotation)
missing_annotation_count = 0
for image in images:
image_id = image['id']
if image_id not in img_to_obj_annotation:
missing_annotation_count += 1
logging.info('%d images are missing bboxes.', missing_annotation_count)
return img_to_obj_annotation, id_to_name_map
def _load_caption_annotations(caption_annotations_file):
"""Loads caption annotation JSON file."""
with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
caption_annotations = json.load(fid)
img_to_caption_annotation = collections.defaultdict(list)
logging.info('Building caption index.')
for annotation in caption_annotations['annotations']:
image_id = annotation['image_id']
img_to_caption_annotation[image_id].append(annotation)
missing_annotation_count = 0
images = caption_annotations['images']
for image in images:
image_id = image['id']
if image_id not in img_to_caption_annotation:
missing_annotation_count += 1
logging.info('%d images are missing captions.', missing_annotation_count)
return img_to_caption_annotation
def _load_panoptic_annotations(panoptic_annotations_file):
"""Loads panoptic annotation from file."""
with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid:
panoptic_annotations = json.load(fid)
img_to_panoptic_annotation = dict()
logging.info('Building panoptic index.')
for annotation in panoptic_annotations['annotations']:
image_id = annotation['image_id']
img_to_panoptic_annotation[image_id] = annotation
is_category_thing = dict()
for category_info in panoptic_annotations['categories']:
is_category_thing[category_info['id']] = category_info['isthing'] == 1
missing_annotation_count = 0
images = panoptic_annotations['images']
for image in images:
image_id = image['id']
if image_id not in img_to_panoptic_annotation:
missing_annotation_count += 1
logging.info(
'%d images are missing panoptic annotations.', missing_annotation_count)
return img_to_panoptic_annotation, is_category_thing
def _load_images_info(images_info_file):
with tf.io.gfile.GFile(images_info_file, 'r') as fid:
info_dict = json.load(fid)
return info_dict['images']
def generate_annotations(images, image_dirs,
panoptic_masks_dir=None,
img_to_obj_annotation=None,
img_to_caption_annotation=None,
img_to_panoptic_annotation=None,
is_category_thing=None,
id_to_name_map=None,
include_panoptic_masks=False,
include_masks=False):
"""Generator for COCO annotations."""
for image in images:
object_annotation = (img_to_obj_annotation.get(image['id'], None) if
img_to_obj_annotation else None)
    caption_annotation = (img_to_caption_annotation.get(image['id'], None) if
img_to_caption_annotation else None)
panoptic_annotation = (img_to_panoptic_annotation.get(image['id'], None) if
img_to_panoptic_annotation else None)
yield (image, image_dirs, panoptic_masks_dir, object_annotation,
           id_to_name_map, caption_annotation, panoptic_annotation,
is_category_thing, include_panoptic_masks, include_masks)
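# Note: the tuple order yielded above matches create_tf_example's
# positional signature, so write_tf_record_dataset (whose default is
# unpack_arguments=True) can starmap each tuple directly into
# create_tf_example.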
def _create_tf_record_from_coco_annotations(images_info_file,
image_dirs,
output_path,
num_shards,
object_annotations_file=None,
caption_annotations_file=None,
panoptic_masks_dir=None,
panoptic_annotations_file=None,
include_panoptic_masks=False,
include_masks=False):
"""Loads COCO annotation json files and converts to tf.Record format.
Args:
    images_info_file: JSON file containing image info. The number of
      tf.Examples in the output TFRecord files is exactly equal to the number
      of image info entries in this file. This can be any of the
      train/val/test annotation JSON files, e.g.
      'image_info_test-dev2017.json', 'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
image_dirs: List of directories containing the image files.
output_path: Path to output tf.Record file.
num_shards: Number of output files to create.
object_annotations_file: JSON file containing bounding box annotations.
caption_annotations_file: JSON file containing caption annotations.
panoptic_masks_dir: Directory containing panoptic masks.
panoptic_annotations_file: JSON file containing panoptic annotations.
    include_panoptic_masks: Whether to include 'category_mask'
      and 'instance_mask', which are required by the panoptic quality
      evaluator.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
"""
logging.info('writing to output path: %s', output_path)
images = _load_images_info(images_info_file)
img_to_obj_annotation = None
img_to_caption_annotation = None
id_to_name_map = None
img_to_panoptic_annotation = None
is_category_thing = None
if object_annotations_file:
img_to_obj_annotation, id_to_name_map = (
_load_object_annotations(object_annotations_file))
if caption_annotations_file:
img_to_caption_annotation = (
_load_caption_annotations(caption_annotations_file))
if panoptic_annotations_file:
img_to_panoptic_annotation, is_category_thing = (
_load_panoptic_annotations(panoptic_annotations_file))
coco_annotations_iter = generate_annotations(
images=images,
image_dirs=image_dirs,
panoptic_masks_dir=panoptic_masks_dir,
img_to_obj_annotation=img_to_obj_annotation,
img_to_caption_annotation=img_to_caption_annotation,
img_to_panoptic_annotation=img_to_panoptic_annotation,
is_category_thing=is_category_thing,
id_to_name_map=id_to_name_map,
include_panoptic_masks=include_panoptic_masks,
include_masks=include_masks)
num_skipped = tfrecord_lib.write_tf_record_dataset(
output_path, coco_annotations_iter, create_tf_example, num_shards)
logging.info('Finished writing, skipped %d annotations.', num_skipped)
def main(_):
assert FLAGS.image_dir, '`image_dir` missing.'
assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
FLAGS.caption_annotations_file), ('All annotation files are '
'missing.')
if FLAGS.image_info_file:
images_info_file = FLAGS.image_info_file
elif FLAGS.object_annotations_file:
images_info_file = FLAGS.object_annotations_file
else:
images_info_file = FLAGS.caption_annotations_file
directory = os.path.dirname(FLAGS.output_file_prefix)
if not tf.io.gfile.isdir(directory):
tf.io.gfile.makedirs(directory)
_create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
FLAGS.output_file_prefix,
FLAGS.num_shards,
FLAGS.object_annotations_file,
FLAGS.caption_annotations_file,
FLAGS.panoptic_masks_dir,
FLAGS.panoptic_annotations_file,
FLAGS.include_panoptic_masks,
FLAGS.include_masks)
if __name__ == '__main__':
app.run(main)
#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.
tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
base_image_dir="/tmp/coco_images"
output_dir="/tmp/coco_few_shot"
while getopts ":i:o:" o; do
case "${o}" in
o) output_dir=${OPTARG} ;;
i) base_image_dir=${OPTARG} ;;
*) echo "Usage: ${0} [-i <base_image_dir>] [-o <output_dir>]" 1>&2; exit 1 ;;
esac
done
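# Example invocation (the script filename here is a placeholder):
#   bash process_coco_few_shot.sh -i /tmp/coco_images -o /tmp/coco_few_shot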
cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
-P "${tmp_dir}" -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
"http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"
python process_coco_few_shot_json_files.py \
--logtostderr --workdir="${tmp_dir}"
for seed in {0..9}; do
for shots in 1 3 5 10 30; do
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
--object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
--num_shards=4
done
done
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/datasplit/5k.json" \
--object_annotations_file="${tmp_dir}/datasplit/5k.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/5k" \
--num_shards=10
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
--object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/trainvalno5k_base" \
--num_shards=200
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
--image_dir="${base_image_dir}/val2014" \
--image_info_file="${tmp_dir}/datasplit/5k_base.json" \
--object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \
--caption_annotations_file="" \
--output_file_prefix="${output_dir}/5k_base" \
--num_shards=10
rm -rf "${tmp_dir}"
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import collections
import itertools
import json
import logging
import os
from absl import app
from absl import flags
import tensorflow as tf
logger = tf.get_logger()
logger.setLevel(logging.INFO)
flags.DEFINE_string('workdir', None, 'Working directory.')
FLAGS = flags.FLAGS
CATEGORIES = ['airplane', 'apple', 'backpack', 'banana', 'baseball bat',
'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird',
'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake',
'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch',
'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier',
'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife',
'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven',
'parking meter', 'person', 'pizza', 'potted plant',
'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
'sink', 'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball',
'stop sign', 'suitcase', 'surfboard', 'teddy bear',
'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush',
'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
'wine glass', 'zebra']
SEEDS = list(range(10))
SHOTS = [1, 3, 5, 10, 30]
FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
for _category in CATEGORIES:
FILE_SUFFIXES[(_seed, _shots)].append(
'{}full_box_{}shot_{}_trainval.json'.format(
# http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
#
# datasplit/
# trainvalno5k.json
# 5k.json
# full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
# seed{1-9}/
# full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#
# This means that the JSON files for seed0 are located in the root
# directory rather than in a `seed?/` subdirectory, hence the
# conditional expression below.
'' if _seed == 0 else 'seed{}/'.format(_seed),
_shots,
_category))
# Base class IDs, as defined in
# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65
BASE_CLASS_IDS = [8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 73, 74, 75,
76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
def main(unused_argv):
workdir = FLAGS.workdir
# Filter novel class annotations from the training and validation sets.
for name in ('trainvalno5k', '5k'):
file_path = os.path.join(workdir, 'datasplit', '{}.json'.format(name))
with tf.io.gfile.GFile(file_path, 'r') as f:
json_dict = json.load(f)
json_dict['annotations'] = [a for a in json_dict['annotations']
if a['category_id'] in BASE_CLASS_IDS]
output_path = os.path.join(
workdir, 'datasplit', '{}_base.json'.format(name))
with tf.io.gfile.GFile(output_path, 'w') as f:
json.dump(json_dict, f)
for seed, shots in itertools.product(SEEDS, SHOTS):
# Retrieve all examples for a given seed and shots setting.
file_paths = [os.path.join(workdir, suffix)
for suffix in FILE_SUFFIXES[(seed, shots)]]
json_dicts = []
for file_path in file_paths:
with tf.io.gfile.GFile(file_path, 'r') as f:
json_dicts.append(json.load(f))
# Make sure that all JSON files for a given seed and shots setting have the
# same metadata. We count on this to fuse them later on.
metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
'categories': d['categories']} for d in json_dicts]
if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
raise RuntimeError(
'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licenses, or categories fields')
# Retrieve images across all JSON files.
images = sum((d['images'] for d in json_dicts), [])
# Remove duplicate image entries.
images = list({image['id']: image for image in images}.values())
output_dict = {
'info': json_dicts[0]['info'],
'licenses': json_dicts[0]['licenses'],
'categories': json_dicts[0]['categories'],
'images': images,
'annotations': sum((d['annotations'] for d in json_dicts), [])
}
output_path = os.path.join(workdir,
'{}shot_seed{}.json'.format(shots, seed))
with tf.io.gfile.GFile(output_path, 'w') as f:
json.dump(output_dict, f)
logger.info('Processed %d shots (seed %d) and saved to %s',
shots, seed, output_path)
if __name__ == '__main__':
flags.mark_flag_as_required('workdir')
app.run(main)
#!/bin/bash
sudo apt update
sudo apt install unzip aria2 -y
DATA_DIR=$1
aria2c -j 8 -Z \
http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
http://images.cocodataset.org/zips/train2017.zip \
http://images.cocodataset.org/zips/val2017.zip \
--dir=$DATA_DIR;
unzip $DATA_DIR/"*".zip -d $DATA_DIR;
mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips;
unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR
unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR
python3 official/vision/beta/data/create_coco_tf_record.py \
--logtostderr \
--image_dir="$DATA_DIR/val2017" \
--object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
--output_file_prefix="$DATA_DIR/tfrecords/val" \
--panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
--panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
--num_shards=8 \
--include_masks \
--include_panoptic_masks
python3 official/vision/beta/data/create_coco_tf_record.py \
--logtostderr \
--image_dir="$DATA_DIR/train2017" \
--object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
--output_file_prefix="$DATA_DIR/tfrecords/train" \
--panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
--panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
--num_shards=32 \
--include_masks \
--include_panoptic_masks
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating TFRecord datasets."""
import hashlib
import io
import itertools
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
import multiprocessing as mp
def convert_to_feature(value, value_type=None):
"""Converts the given python object to a tf.train.Feature.
Args:
value: int, float, bytes or a list of them.
value_type: optional, if specified, forces the feature to be of the given
type. Otherwise, type is inferred automatically. Can be one of
['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list']
Returns:
feature: A tf.train.Feature object.
"""
if value_type is None:
element = value[0] if isinstance(value, list) else value
if isinstance(element, bytes):
value_type = 'bytes'
elif isinstance(element, (int, np.integer)):
value_type = 'int64'
elif isinstance(element, (float, np.floating)):
value_type = 'float'
else:
raise ValueError('Cannot convert type {} to feature'.
format(type(element)))
if isinstance(value, list):
value_type = value_type + '_list'
if value_type == 'int64':
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
elif value_type == 'int64_list':
value = np.asarray(value).astype(np.int64).reshape(-1)
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
elif value_type == 'float':
return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
elif value_type == 'float_list':
value = np.asarray(value).astype(np.float32).reshape(-1)
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
elif value_type == 'bytes':
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
elif value_type == 'bytes_list':
return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
else:
raise ValueError('Unknown value_type parameter - {}'.format(value_type))
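# Usage sketch (type is inferred from the value unless value_type is given):
#   convert_to_feature(7)          # -> int64 Feature
#   convert_to_feature(0.5)        # -> float Feature
#   convert_to_feature(b'abc')     # -> bytes Feature
#   convert_to_feature([1, 2, 3])  # -> int64_list Feature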
def image_info_to_feature_dict(height, width, filename, image_id,
encoded_str, encoded_format):
"""Convert image information to a dict of features."""
key = hashlib.sha256(encoded_str).hexdigest()
return {
'image/height': convert_to_feature(height),
'image/width': convert_to_feature(width),
'image/filename': convert_to_feature(filename.encode('utf8')),
'image/source_id': convert_to_feature(str(image_id).encode('utf8')),
'image/key/sha256': convert_to_feature(key.encode('utf8')),
'image/encoded': convert_to_feature(encoded_str),
'image/format': convert_to_feature(encoded_format.encode('utf8')),
}
def read_image(image_path):
pil_image = Image.open(image_path)
return np.asarray(pil_image)
def encode_mask_as_png(mask):
pil_image = Image.fromarray(mask)
output_io = io.BytesIO()
pil_image.save(output_io, format='PNG')
return output_io.getvalue()
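# Round-trip sketch (hypothetical 4x4 mask; assumes the numpy/TF imports
# above):
#   png_bytes = encode_mask_as_png(np.zeros((4, 4), dtype=np.uint8))
#   decoded = tf.io.decode_png(png_bytes)  # shape (4, 4, 1), all zeros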
def write_tf_record_dataset(output_path, annotation_iterator,
process_func, num_shards,
use_multiprocessing=True, unpack_arguments=True):
"""Iterates over annotations, processes them and writes into TFRecords.
Args:
output_path: The prefix path to create TF record files.
annotation_iterator: An iterator of tuples containing details about the
dataset.
process_func: A function which takes the elements from the tuples of
annotation_iterator as arguments and returns a tuple of (tf.train.Example,
int). The integer indicates the number of annotations that were skipped.
num_shards: int, the number of shards to write for the dataset.
use_multiprocessing:
Whether or not to use multiple processes to write TF Records.
unpack_arguments:
Whether to unpack the tuples from annotation_iterator as individual
arguments to the process func or to pass the returned value as it is.
Returns:
num_skipped: The total number of skipped annotations.
"""
writers = [
tf.io.TFRecordWriter(
output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
for i in range(num_shards)
]
total_num_annotations_skipped = 0
if use_multiprocessing:
pool = mp.Pool()
if unpack_arguments:
tf_example_iterator = pool.starmap(process_func, annotation_iterator)
else:
tf_example_iterator = pool.imap(process_func, annotation_iterator)
else:
if unpack_arguments:
tf_example_iterator = itertools.starmap(process_func, annotation_iterator)
else:
tf_example_iterator = map(process_func, annotation_iterator)
for idx, (tf_example, num_annotations_skipped) in enumerate(
tf_example_iterator):
if idx % 100 == 0:
logging.info('On image %d', idx)
total_num_annotations_skipped += num_annotations_skipped
writers[idx % num_shards].write(tf_example.SerializeToString())
if use_multiprocessing:
pool.close()
pool.join()
for writer in writers:
writer.close()
logging.info('Finished writing, skipped %d annotations.',
total_num_annotations_skipped)
return total_num_annotations_skipped
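# Usage sketch, mirroring the unit test in tfrecord_lib_test.py:
#   annotations = [(convert_to_feature(i),) for i in range(17)]
#   write_tf_record_dataset('/tmp/train', annotations, process_fn,
#                           num_shards=3, use_multiprocessing=False)
# where the (hypothetical) process_fn receives each unpacked tuple and
# returns a (tf.train.Example, num_annotations_skipped) pair.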
def check_and_make_dir(directory):
"""Creates the directory if it doesn't exist."""
if not tf.io.gfile.isdir(directory):
tf.io.gfile.makedirs(directory)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_lib."""
import os
from absl import flags
from absl.testing import parameterized
import tensorflow as tf
from official.vision.data import tfrecord_lib
FLAGS = flags.FLAGS
def process_sample(x):
d = {'x': x}
return tf.train.Example(features=tf.train.Features(feature=d)), 0
def parse_function(example_proto):
feature_description = {
'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
}
return tf.io.parse_single_example(example_proto, feature_description)
class TfrecordLibTest(parameterized.TestCase):
def test_write_tf_record_dataset(self):
data = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)]
path = os.path.join(FLAGS.test_tmpdir, 'train')
tfrecord_lib.write_tf_record_dataset(
path, data, process_sample, 3, use_multiprocessing=False)
tfrecord_files = tf.io.gfile.glob(path + '*')
self.assertLen(tfrecord_files, 3)
dataset = tf.data.TFRecordDataset(tfrecord_files)
dataset = dataset.map(parse_function)
read_values = set(d['x'] for d in dataset.as_numpy_iterator())
self.assertSetEqual(read_values, set(range(17)))
def test_convert_to_feature_float(self):
proto = tfrecord_lib.convert_to_feature(0.0)
self.assertEqual(proto.float_list.value[0], 0.0)
def test_convert_to_feature_int(self):
proto = tfrecord_lib.convert_to_feature(0)
self.assertEqual(proto.int64_list.value[0], 0)
def test_convert_to_feature_bytes(self):
proto = tfrecord_lib.convert_to_feature(b'123')
self.assertEqual(proto.bytes_list.value[0], b'123')
def test_convert_to_feature_float_list(self):
proto = tfrecord_lib.convert_to_feature([0.0, 1.0])
self.assertSequenceAlmostEqual(proto.float_list.value, [0.0, 1.0])
def test_convert_to_feature_int_list(self):
proto = tfrecord_lib.convert_to_feature([0, 1])
self.assertSequenceAlmostEqual(proto.int64_list.value, [0, 1])
def test_convert_to_feature_bytes_list(self):
proto = tfrecord_lib.convert_to_feature([b'123', b'456'])
    self.assertSequenceEqual(proto.bytes_list.value, [b'123', b'456'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
from typing import Any, Dict, List, Optional
# Import libraries
import tensorflow as tf
from official.vision.configs import common
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import augment
from official.vision.ops import preprocess_ops
MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
DEFAULT_IMAGE_FIELD_KEY = 'image/encoded'
DEFAULT_LABEL_FIELD_KEY = 'image/class/label'
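# These are the standard ImageNet per-channel statistics rescaled from
# [0, 1] to [0, 255]; preprocess_ops.normalize_image below uses them to
# compute (image - MEAN_RGB) / STDDEV_RGB, so e.g. a red-channel value of
# 123.675 (= 0.485 * 255) normalizes to exactly 0.0.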
class Decoder(decoder.Decoder):
"""A tf.Example decoder for classification task."""
def __init__(self,
image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
is_multilabel: bool = False,
keys_to_features: Optional[Dict[str, Any]] = None):
if not keys_to_features:
keys_to_features = {
image_field_key:
tf.io.FixedLenFeature((), tf.string, default_value=''),
}
if is_multilabel:
keys_to_features.update(
{label_field_key: tf.io.VarLenFeature(dtype=tf.int64)})
else:
keys_to_features.update({
label_field_key:
tf.io.FixedLenFeature((), tf.int64, default_value=-1)
})
self._keys_to_features = keys_to_features
def decode(self, serialized_example):
return tf.io.parse_single_example(
serialized_example, self._keys_to_features)
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size: List[int],
               num_classes: int,
image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
decode_jpeg_only: bool = True,
aug_rand_hflip: bool = True,
aug_type: Optional[common.Augmentation] = None,
color_jitter: float = 0.,
random_erasing: Optional[common.RandomErasing] = None,
is_multilabel: bool = False,
dtype: str = 'float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      num_classes: `int`, number of classes.
image_field_key: `str`, the key name to encoded image in tf.Example.
label_field_key: `str`, the key name to label in tf.Example.
decode_jpeg_only: `bool`, if True, only JPEG format is decoded, this is
faster than decoding other types. Default is True.
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
aug_type: An optional Augmentation object to choose from AutoAugment and
RandAugment.
color_jitter: Magnitude of color jitter. If > 0, the value is used to
generate random scale factor for brightness, contrast and saturation.
See `preprocess_ops.color_jitter` for more details.
random_erasing: if not None, augment input image by random erasing. See
`augment.RandomErasing` for more details.
is_multilabel: A `bool`, whether or not each example has multiple labels.
dtype: `str`, cast output image in dtype. It can be 'float32', 'float16',
or 'bfloat16'.
"""
self._output_size = output_size
self._aug_rand_hflip = aug_rand_hflip
self._num_classes = num_classes
self._image_field_key = image_field_key
if dtype == 'float32':
self._dtype = tf.float32
elif dtype == 'float16':
self._dtype = tf.float16
elif dtype == 'bfloat16':
self._dtype = tf.bfloat16
else:
raise ValueError('dtype {!r} is not supported!'.format(dtype))
if aug_type:
if aug_type.type == 'autoaug':
self._augmenter = augment.AutoAugment(
augmentation_name=aug_type.autoaug.augmentation_name,
cutout_const=aug_type.autoaug.cutout_const,
translate_const=aug_type.autoaug.translate_const)
elif aug_type.type == 'randaug':
self._augmenter = augment.RandAugment(
num_layers=aug_type.randaug.num_layers,
magnitude=aug_type.randaug.magnitude,
cutout_const=aug_type.randaug.cutout_const,
translate_const=aug_type.randaug.translate_const,
prob_to_apply=aug_type.randaug.prob_to_apply,
exclude_ops=aug_type.randaug.exclude_ops)
else:
raise ValueError('Augmentation policy {} not supported.'.format(
aug_type.type))
else:
self._augmenter = None
self._label_field_key = label_field_key
self._color_jitter = color_jitter
if random_erasing:
self._random_erasing = augment.RandomErasing(
probability=random_erasing.probability,
min_area=random_erasing.min_area,
max_area=random_erasing.max_area,
min_aspect=random_erasing.min_aspect,
max_aspect=random_erasing.max_aspect,
min_count=random_erasing.min_count,
max_count=random_erasing.max_count,
trials=random_erasing.trials)
else:
self._random_erasing = None
self._is_multilabel = is_multilabel
self._decode_jpeg_only = decode_jpeg_only
def _parse_train_data(self, decoded_tensors):
"""Parses data for training."""
image = self._parse_train_image(decoded_tensors)
label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
if self._is_multilabel:
if isinstance(label, tf.sparse.SparseTensor):
label = tf.sparse.to_dense(label)
label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
return image, label
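# Multilabel sketch: with num_classes=5, a densified label tensor [1, 3]
# becomes tf.reduce_sum(tf.one_hot([1, 3], 5), axis=0) == [0., 1., 0., 1., 0.].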
def _parse_eval_data(self, decoded_tensors):
"""Parses data for evaluation."""
image = self._parse_eval_image(decoded_tensors)
label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
if self._is_multilabel:
if isinstance(label, tf.sparse.SparseTensor):
label = tf.sparse.to_dense(label)
label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
return image, label
def _parse_train_image(self, decoded_tensors):
"""Parses image data for training."""
image_bytes = decoded_tensors[self._image_field_key]
if self._decode_jpeg_only:
image_shape = tf.image.extract_jpeg_shape(image_bytes)
# Crops image.
cropped_image = preprocess_ops.random_crop_image_v2(
image_bytes, image_shape)
image = tf.cond(
tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
lambda: cropped_image)
else:
# Decodes image.
image = tf.io.decode_image(image_bytes, channels=3)
image.set_shape([None, None, 3])
# Crops image.
cropped_image = preprocess_ops.random_crop_image(image)
image = tf.cond(
tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
lambda: preprocess_ops.center_crop_image(image),
lambda: cropped_image)
if self._aug_rand_hflip:
image = tf.image.random_flip_left_right(image)
# Color jitter.
if self._color_jitter > 0:
image = preprocess_ops.color_jitter(image, self._color_jitter,
self._color_jitter,
self._color_jitter)
# Resizes image.
image = tf.image.resize(
image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
image.set_shape([self._output_size[0], self._output_size[1], 3])
# Apply autoaug or randaug.
if self._augmenter is not None:
image = self._augmenter.distort(image)
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image,
offset=MEAN_RGB,
scale=STDDEV_RGB)
# Random erasing after the image has been normalized
if self._random_erasing is not None:
image = self._random_erasing.distort(image)
# Convert image to self._dtype.
image = tf.image.convert_image_dtype(image, self._dtype)
return image
def _parse_eval_image(self, decoded_tensors):
"""Parses image data for evaluation."""
image_bytes = decoded_tensors[self._image_field_key]
if self._decode_jpeg_only:
image_shape = tf.image.extract_jpeg_shape(image_bytes)
# Center crops.
image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
else:
# Decodes image.
image = tf.io.decode_image(image_bytes, channels=3)
image.set_shape([None, None, 3])
# Center crops.
image = preprocess_ops.center_crop_image(image)
image = tf.image.resize(
image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
image.set_shape([self._output_size[0], self._output_size[1], 3])
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image,
offset=MEAN_RGB,
scale=STDDEV_RGB)
# Convert image to self._dtype.
image = tf.image.convert_image_dtype(image, self._dtype)
return image
@classmethod
def inference_fn(cls,
image: tf.Tensor,
input_image_size: List[int],
num_channels: int = 3) -> tf.Tensor:
"""Builds image model inputs for serving."""
image = tf.cast(image, dtype=tf.float32)
image = preprocess_ops.center_crop_image(image)
image = tf.image.resize(
image, input_image_size, method=tf.image.ResizeMethod.BILINEAR)
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(
image, offset=MEAN_RGB, scale=STDDEV_RGB)
image.set_shape(input_image_size + [num_channels])
return image
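
  # A minimal serving sketch (illustration only): routing a raw uint8 image
  # through `inference_fn` before calling a trained model. `model`, the
  # `Parser` class name, and the 224x224 input size are hypothetical.
  #
  #   @tf.function(input_signature=[
  #       tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8)])
  #   def serve(image: tf.Tensor) -> tf.Tensor:
  #     image = Parser.inference_fn(image, input_image_size=[224, 224])
  #     return model(image[tf.newaxis, ...], training=False)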
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic decoder interface."""
import abc
class Decoder(object):
"""Decodes the raw data into tensors."""
__metaclass__ = abc.ABCMeta
@abc.abstractmethod
def decode(self, serialized_example):
"""Decodes the serialized example into tensors.
Args:
serialized_example: a serialized string tensor that encodes the data.
Returns:
decoded_tensors: a dict of Tensors.
"""
pass
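
# A minimal sketch of a concrete decoder, for illustration only. It assumes
# tf.Example records with hypothetical 'image/encoded' and 'image/class/label'
# feature keys; the keys and the `import tensorflow as tf` dependency are
# assumptions, not part of this interface.
#
#   import tensorflow as tf
#
#   class TfExampleDecoder(Decoder):
#     """Decodes serialized tf.Example protos into a tensor dict."""
#
#     def decode(self, serialized_example):
#       features = {
#           'image/encoded': tf.io.FixedLenFeature((), tf.string),
#           'image/class/label': tf.io.FixedLenFeature((), tf.int64),
#       }
#       return tf.io.parse_single_example(serialized_example, features)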
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Dataset reader for vision model garden."""
from typing import Any, Callable, Optional, Tuple
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
def calculate_batch_sizes(total_batch_size: int,
pseudo_label_ratio: float) -> Tuple[int, int]:
"""Calculates labeled and pseudo-labeled dataset batch sizes.
Returns (labeled_batch_size, pseudo_labeled_batch_size) given a
total batch size and pseudo-label data ratio.
Args:
total_batch_size: The total batch size for all data.
pseudo_label_ratio: A non-negative float ratio of pseudo-labeled
to labeled data in a batch.
Returns:
(labeled_batch_size, pseudo_labeled_batch_size) as ints.
Raises:
ValueError: If total_batch_size is negative.
ValueError: If pseudo_label_ratio is negative.
"""
if total_batch_size < 0:
raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size))
if pseudo_label_ratio < 0.0:
raise ValueError(
'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio))
ratio_factor = pseudo_label_ratio / (1.0 + pseudo_label_ratio)
pseudo_labeled_batch_size = int(round(total_batch_size * ratio_factor))
labeled_batch_size = total_batch_size - pseudo_labeled_batch_size
return labeled_batch_size, pseudo_labeled_batch_size
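
# Worked example (values are illustrative): with total_batch_size=16 and
# pseudo_label_ratio=3.0, ratio_factor = 3.0 / (1.0 + 3.0) = 0.75, so
#   calculate_batch_sizes(16, 3.0) == (4, 12)
# i.e. each global batch holds 4 labeled and 12 pseudo-labeled examples.
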
class CombinationDatasetInputReader(input_reader.InputReader):
"""Combination dataset input reader."""

  def __init__(self,
params: cfg.DataConfig,
dataset_fn=tf.data.TFRecordDataset,
pseudo_label_dataset_fn=tf.data.TFRecordDataset,
decoder_fn: Optional[Callable[..., Any]] = None,
sample_fn: Optional[Callable[..., Any]] = None,
parser_fn: Optional[Callable[..., Any]] = None,
transform_and_batch_fn: Optional[Callable[
[tf.data.Dataset, Optional[tf.distribute.InputContext]],
tf.data.Dataset]] = None,
postprocess_fn: Optional[Callable[..., Any]] = None):
"""Initializes an CombinationDatasetInputReader instance.
This class mixes a labeled and pseudo-labeled dataset. The params
must contain "pseudo_label_data.input_path" to specify the
pseudo-label dataset files and "pseudo_label_data.data_ratio"
to specify a per-batch mixing ratio of pseudo-label examples to
labeled dataset examples.
Args:
params: A config_definitions.DataConfig object.
dataset_fn: A `tf.data.Dataset` that consumes the input files. For
example, it can be `tf.data.TFRecordDataset`.
pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the input
files. For example, it can be `tf.data.TFRecordDataset`.
decoder_fn: An optional `callable` that takes the serialized data string
and decodes them into the raw tensor dictionary.
sample_fn: An optional `callable` that takes a `tf.data.Dataset` object as
input and outputs the transformed dataset. It performs sampling on the
decoded raw tensors dict before the parser_fn.
parser_fn: An optional `callable` that takes the decoded raw tensors dict
and parse them into a dictionary of tensors that can be consumed by the
model. It will be executed after decoder_fn.
transform_and_batch_fn: An optional `callable` that takes a
`tf.data.Dataset` object and an optional `tf.distribute.InputContext` as
input, and returns a `tf.data.Dataset` object. It will be executed after
`parser_fn` to transform and batch the dataset; if None, after
`parser_fn` is executed, the dataset will be batched into per-replica
batch size.
postprocess_fn: A optional `callable` that processes batched tensors. It
will be executed after batching.
Raises:
ValueError: If drop_remainder is False.
"""
super().__init__(params=params,
dataset_fn=dataset_fn,
decoder_fn=decoder_fn,
sample_fn=sample_fn,
parser_fn=parser_fn,
transform_and_batch_fn=transform_and_batch_fn,
postprocess_fn=postprocess_fn)
self._pseudo_label_file_pattern = params.pseudo_label_data.input_path
self._pseudo_label_dataset_fn = pseudo_label_dataset_fn
self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio
self._pseudo_label_matched_files = input_reader.match_files(
self._pseudo_label_file_pattern)
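    # Note: read() zips the labeled and pseudo-labeled datasets and
    # concatenates their per-replica batches, so both pipelines must produce
    # full, fixed-size batches every step; hence drop_remainder must be True.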
if not self._drop_remainder:
raise ValueError(
'Must use drop_remainder=True with CombinationDatasetInputReader')

  def read(
      self,
      input_context: Optional[tf.distribute.InputContext] = None
  ) -> tf.data.Dataset:
    """Generates a tf.data.Dataset object."""
    labeled_batch_size, pl_batch_size = calculate_batch_sizes(
        self._global_batch_size, self._pseudo_label_data_ratio)
    if not labeled_batch_size or not pl_batch_size:
      raise ValueError(
          'Invalid batch_size: {} and pseudo_label_data_ratio: {}, '
          'resulting in a 0 batch size for one of the datasets.'.format(
              self._global_batch_size, self._pseudo_label_data_ratio))

    def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size,
                                       input_context, tfds_builder):
      dataset = self._read_data_source(matched_files, dataset_fn,
                                       input_context, tfds_builder)
      return self._decode_and_parse_dataset(dataset, batch_size, input_context)

    labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._matched_files,
        dataset_fn=self._dataset_fn,
        batch_size=labeled_batch_size,
        input_context=input_context,
        tfds_builder=self._tfds_builder)
    pseudo_labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._pseudo_label_matched_files,
        dataset_fn=self._pseudo_label_dataset_fn,
        batch_size=pl_batch_size,
        input_context=input_context,
        tfds_builder=False)

    def concat_fn(d1, d2):
      return tf.nest.map_structure(
          lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2)

    dataset_concat = tf.data.Dataset.zip(
        (labeled_dataset, pseudo_labeled_dataset))
    dataset_concat = dataset_concat.map(
        concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    def maybe_map_fn(dataset, fn):
      return dataset if fn is None else dataset.map(
          fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn)
    dataset_concat = self._maybe_apply_data_service(dataset_concat,
                                                    input_context)
    if self._deterministic is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self._deterministic
      dataset_concat = dataset_concat.with_options(options)
    return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE)
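
# A minimal usage sketch (illustration only): `my_decode_fn` and `my_parse_fn`
# are hypothetical, and `params` is a cfg.DataConfig with drop_remainder=True
# whose `pseudo_label_data` sub-config provides `input_path` and `data_ratio`
# as described above.
#
#   reader = CombinationDatasetInputReader(
#       params=params,
#       decoder_fn=my_decode_fn,
#       parser_fn=my_parse_fn)
#   dataset = reader.read(input_context=None)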