Commit c8e6faf7 authored by A. Unique TensorFlower's avatar A. Unique TensorFlower
Browse files

Internal change

PiperOrigin-RevId: 431756117
parent 13a5e4fb
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for semantic_segmentation."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import semantic_segmentation as exp_cfg
class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase):
  """Sanity checks for registered semantic segmentation experiment configs."""

  @parameterized.parameters(('seg_deeplabv3_pascal',),
                            ('seg_deeplabv3plus_pascal',))
  def test_semantic_segmentation_configs(self, config_name):
    """Config resolves, carries the expected types, and validates."""
    experiment = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(experiment, cfg.ExperimentConfig)
    self.assertIsInstance(experiment.task, exp_cfg.SemanticSegmentationTask)
    self.assertIsInstance(experiment.task.model,
                          exp_cfg.SemanticSegmentationModel)
    self.assertIsInstance(experiment.task.train_data, exp_cfg.DataConfig)
    experiment.validate()
    # The 'task.train_data.is_training != None' restriction must now trip.
    experiment.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      experiment.validate()
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import backbones_3d
from official.vision.configs import common
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """The base configuration for building datasets."""
  # Dataset identity and storage format.
  name: Optional[str] = None
  file_type: Optional[str] = 'tfrecord'
  compressed_input: bool = False
  split: str = 'train'
  variant_name: Optional[str] = None
  # Clip shape; eval configs below use more frames, e.g. (250, 224, 224, 3).
  feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
  temporal_stride: int = 1
  # Presumably randomizes temporal stride when > 0 — confirm with dataloader.
  random_stride_range: int = 0
  num_test_clips: int = 1
  num_test_crops: int = 1
  num_classes: int = -1  # -1 means unset; per-dataset factories fill this in.
  num_examples: int = -1  # -1 means unset; add_trainer requires > 0.
  global_batch_size: int = 128
  data_format: str = 'channels_last'
  dtype: str = 'float32'
  one_hot: bool = True
  shuffle_buffer_size: int = 64
  cache: bool = False
  input_path: str = ''
  is_training: bool = True
  cycle_length: int = 10
  drop_remainder: bool = True
  min_image_size: int = 256
  is_multilabel: bool = False
  # Optional audio stream settings.
  output_audio: bool = False
  audio_feature: str = ''
  audio_feature_shape: Tuple[int, ...] = (-1,)
  # Random aspect-ratio / area crop augmentation bounds.
  aug_min_aspect_ratio: float = 0.5
  aug_max_aspect_ratio: float = 2.0
  aug_min_area_ratio: float = 0.49
  aug_max_area_ratio: float = 1.0
  aug_type: Optional[str] = None  # 'autoaug', 'randaug', or None
  # tf.Example feature keys for the image and label.
  image_field_key: str = 'image/encoded'
  label_field_key: str = 'clip/label/index'
def kinetics400(is_training):
  """Builds the Kinetics-400 dataset config for train or eval."""
  split = 'train' if is_training else 'valid'
  num_examples = 215570 if is_training else 17706
  feature_shape = (64, 224, 224, 3) if is_training else (250, 224, 224, 3)
  return DataConfig(
      name='kinetics400',
      num_classes=400,
      is_training=is_training,
      split=split,
      drop_remainder=is_training,
      num_examples=num_examples,
      feature_shape=feature_shape)
def kinetics600(is_training):
  """Builds the Kinetics-600 dataset config for train or eval."""
  split = 'train' if is_training else 'valid'
  num_examples = 366016 if is_training else 27780
  feature_shape = (64, 224, 224, 3) if is_training else (250, 224, 224, 3)
  return DataConfig(
      name='kinetics600',
      num_classes=600,
      is_training=is_training,
      split=split,
      drop_remainder=is_training,
      num_examples=num_examples,
      feature_shape=feature_shape)
def kinetics700(is_training):
  """Generated Kinetics 700 dataset configs."""
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=522883 if is_training else 33441,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700_2020(is_training):
  """Generated Kinetics 700-2020 dataset configs."""
  # NOTE(review): name is 'kinetics700', identical to kinetics700() above.
  # Presumably intentional (same label space, different release) — confirm
  # against the dataloader before relying on `name` to distinguish them.
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=535982 if is_training else 33640,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
  """The model config for video classification."""
  model_type: str = 'video_classification'
  # Use default_factory so every config instance gets its own mutable
  # sub-config rather than sharing a single class-level default object
  # (mutating one instance's default would otherwise leak into all others).
  backbone: backbones_3d.Backbone3D = dataclasses.field(
      default_factory=lambda: backbones_3d.Backbone3D(
          type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()))
  norm_activation: common.NormActivation = dataclasses.field(
      default_factory=lambda: common.NormActivation(use_sync_bn=False))
  dropout_rate: float = 0.2
  aggregate_endpoints: bool = False
  require_endpoints: Optional[Tuple[str, ...]] = None
@dataclasses.dataclass
class Losses(hyperparams.Config):
  """Loss settings for video classification."""
  one_hot: bool = True
  label_smoothing: float = 0.0
  l2_weight_decay: float = 0.0
@dataclasses.dataclass
class Metrics(hyperparams.Config):
  """Metric settings for video classification."""
  use_per_class_recall: bool = False
@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
  """The task config for video classification."""
  # default_factory avoids sharing one mutable default config object across
  # all task instances (a class-level default instance would be shared).
  model: VideoClassificationModel = dataclasses.field(
      default_factory=VideoClassificationModel)
  train_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(is_training=True, drop_remainder=True))
  validation_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(
          is_training=False, drop_remainder=False))
  losses: Losses = dataclasses.field(default_factory=Losses)
  metrics: Metrics = dataclasses.field(default_factory=Metrics)
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: str = 'all'  # all or backbone
  # Spatial Partitioning fields.
  train_input_partition_dims: Optional[Tuple[int, ...]] = None
  eval_input_partition_dims: Optional[Tuple[int, ...]] = None
def add_trainer(experiment: cfg.ExperimentConfig,
                train_batch_size: int,
                eval_batch_size: int,
                learning_rate: float = 1.6,
                train_epochs: int = 44,
                warmup_epochs: int = 5):
  """Add and config a trainer to the experiment config.

  Mutates `experiment` in place: sets the global batch sizes on both data
  configs and replaces `experiment.trainer` with an SGD + cosine-decay
  trainer whose intervals are expressed in whole epochs.

  Args:
    experiment: experiment config to mutate; its task's train and validation
      data configs must have positive `num_examples`.
    train_batch_size: global training batch size.
    eval_batch_size: global evaluation batch size.
    learning_rate: initial learning rate of the cosine schedule.
    train_epochs: total number of training epochs.
    warmup_epochs: number of linear warmup epochs.

  Returns:
    The same `experiment` object, for call-chaining convenience.

  Raises:
    ValueError: if either dataset config has a non-positive `num_examples`.
  """
  if experiment.task.train_data.num_examples <= 0:
    raise ValueError('Wrong train dataset size {!r}'.format(
        experiment.task.train_data))
  if experiment.task.validation_data.num_examples <= 0:
    raise ValueError('Wrong validation dataset size {!r}'.format(
        experiment.task.validation_data))
  experiment.task.train_data.global_batch_size = train_batch_size
  experiment.task.validation_data.global_batch_size = eval_batch_size
  # Epoch length in steps; all intervals below are one epoch.
  steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
  experiment.trainer = cfg.TrainerConfig(
      steps_per_loop=steps_per_epoch,
      summary_interval=steps_per_epoch,
      checkpoint_interval=steps_per_epoch,
      train_steps=train_epochs * steps_per_epoch,
      validation_steps=experiment.task.validation_data.num_examples //
      eval_batch_size,
      validation_interval=steps_per_epoch,
      optimizer_config=optimization.OptimizationConfig({
          'optimizer': {
              'type': 'sgd',
              'sgd': {
                  'momentum': 0.9,
                  'nesterov': True,
              }
          },
          'learning_rate': {
              'type': 'cosine',
              'cosine': {
                  'initial_learning_rate': learning_rate,
                  'decay_steps': train_epochs * steps_per_epoch,
              }
          },
          'warmup': {
              'type': 'linear',
              'linear': {
                  'warmup_steps': warmup_epochs * steps_per_epoch,
                  'warmup_learning_rate': 0
              }
          }
      }))
  return experiment
@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
  """Video classification general."""
  restrictions = [
      'task.train_data.is_training != None',
      'task.validation_data.is_training != None',
      'task.train_data.num_classes == task.validation_data.num_classes',
  ]
  return cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=VideoClassificationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=restrictions)
@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
  """Video classification on UCF-101 with resnet."""
  train_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='train',
      drop_remainder=True,
      num_examples=9537,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  train_dataset.tfds_name = 'ucf101'
  train_dataset.tfds_split = 'train'
  # NOTE(review): is_training=True on the validation ('test' split) dataset
  # looks like a copy-paste from the train config — confirm whether
  # eval-style sampling (is_training=False) is intended here.
  validation_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='test',
      drop_remainder=False,
      num_examples=3783,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  validation_dataset.tfds_name = 'ucf101'
  validation_dataset.tfds_split = 'test'
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates `config` in place (also returns it).
  add_trainer(
      config,
      train_batch_size=64,
      eval_batch_size=16,
      learning_rate=0.8,
      train_epochs=100)
  return config
@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-400 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics400(is_training=True),
      validation_data=kinetics400(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-600 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics600(is_training=True),
      validation_data=kinetics600(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-700 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics700(is_training=True),
      validation_data=kinetics700(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
  """Video classification on Kinetics-700 2020 with a 3D ResNet backbone."""
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=kinetics700_2020(is_training=True),
      validation_data=kinetics700_2020(is_training=False))
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  # add_trainer mutates and returns the same config object.
  return add_trainer(config, train_batch_size=1024, eval_batch_size=64)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video_classification."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official import vision
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.configs import video_classification as exp_cfg
class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase):
  """Sanity checks for registered video classification experiment configs."""

  @parameterized.parameters(('video_classification',),
                            ('video_classification_kinetics600',))
  def test_video_classification_configs(self, config_name):
    """Config resolves, carries the expected types, and validates."""
    experiment = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(experiment, cfg.ExperimentConfig)
    self.assertIsInstance(experiment.task, exp_cfg.VideoClassificationTask)
    self.assertIsInstance(experiment.task.model,
                          exp_cfg.VideoClassificationModel)
    self.assertIsInstance(experiment.task.train_data, exp_cfg.DataConfig)
    experiment.validate()
    # The 'task.train_data.is_training != None' restriction must now trip.
    experiment.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      experiment.validate()
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Convert raw COCO dataset to TFRecord format.
This script follows the label map decoder format and supports detection
boxes, instance masks and captions.
Example usage:
python create_coco_tf_record.py --logtostderr \
--image_dir="${TRAIN_IMAGE_DIR}" \
--image_info_file="${TRAIN_IMAGE_INFO_FILE}" \
--object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
--output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
--num_shards=100
"""
import collections
import json
import logging
import os
from absl import app # pylint:disable=unused-import
from absl import flags
import numpy as np
from pycocotools import mask
import tensorflow as tf
import multiprocessing as mp
from official.vision.data import tfrecord_lib
# Command-line interface.
flags.DEFINE_boolean(
    'include_masks', False, 'Whether to include instance segmentations masks '
    '(PNG encoded) in the result. default: False.')
flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
flags.DEFINE_string(
    'image_info_file', '', 'File containing image information. '
    'Tf Examples in the output files correspond to the image '
    'info entries in this file. If this file is not provided '
    'object_annotations_file is used if present. Otherwise, '
    'caption_annotations_file is used to get image info.')
flags.DEFINE_string(
    'object_annotations_file', '', 'File containing object '
    'annotations - boxes and instance masks.')
flags.DEFINE_string('caption_annotations_file', '', 'File containing image '
                    'captions.')
flags.DEFINE_string('panoptic_annotations_file', '', 'File containing panoptic '
                    'annotations.')
flags.DEFINE_string('panoptic_masks_dir', '',
                    'Directory containing panoptic masks annotations.')
flags.DEFINE_boolean(
    'include_panoptic_masks', False, 'Whether to include category and '
    'instance masks in the result. These are required to run the PQ evaluator '
    'default: False.')
flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file')
flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.')

FLAGS = flags.FLAGS

# Emit INFO-level messages through TF's logger.
logger = tf.get_logger()
logger.setLevel(logging.INFO)

# Panoptic-mask encoding constants (used by generate_coco_panoptics_masks).
_VOID_LABEL = 0
_VOID_INSTANCE_ID = 0
# All "thing" classes collapse to this single semantic label.
_THING_CLASS_ID = 1
# Subtracted from "stuff" category ids to compact the semantic label space.
_STUFF_CLASSES_OFFSET = 90
def coco_segmentation_to_mask_png(segmentation, height, width, is_crowd):
  """Encode a COCO mask segmentation as PNG string."""
  rle = mask.frPyObjects(segmentation, height, width)
  decoded = mask.decode(rle)
  if not is_crowd:
    # Non-crowd segmentations decode with one channel per polygon; merge
    # them into a single binary mask.
    decoded = np.amax(decoded, axis=2)
  return tfrecord_lib.encode_mask_as_png(decoded)
def generate_coco_panoptics_masks(segments_info, mask_path,
                                  include_panoptic_masks,
                                  is_category_thing):
  """Creates masks for panoptic segmentation task.

  Args:
    segments_info: a list of dicts, where each dict has keys: [u'id',
      u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for
      each segment in the panoptic mask.
    mask_path: path to the panoptic mask.
    include_panoptic_masks: bool, when set to True, category and instance
      masks are included in the outputs. Set this to True, when using
      the Panoptic Quality evaluator.
    is_category_thing: a dict with category ids as keys and, 0/1 as values to
      represent "stuff" and "things" classes respectively.

  Returns:
    A dict with keys: [u'semantic_segmentation_mask', u'category_mask',
    u'instance_mask']. The dict contains 'category_mask' and 'instance_mask'
    only if `include_panoptic_masks` is set to True.
  """
  rgb_mask = tfrecord_lib.read_image(mask_path)
  r, g, b = np.split(rgb_mask, 3, axis=-1)

  # Decode the RGB-encoded panoptic mask into per-pixel segment ids.
  # Refer to https://cocodataset.org/#format-data
  segments_encoded_mask = (r + g * 256 + b * (256**2)).squeeze()

  # np.full_like states the "fill with constant" intent directly
  # (was np.ones_like(...) * constant).
  semantic_segmentation_mask = np.full_like(
      segments_encoded_mask, _VOID_LABEL, dtype=np.uint8)
  if include_panoptic_masks:
    category_mask = np.full_like(
        segments_encoded_mask, _VOID_LABEL, dtype=np.uint8)
    instance_mask = np.full_like(
        segments_encoded_mask, _VOID_INSTANCE_ID, dtype=np.uint8)

  for idx, segment in enumerate(segments_info):
    segment_id = segment['id']
    category_id = segment['category_id']

    if is_category_thing[category_id]:
      # All "things" share one semantic label; instances are numbered 1-based.
      encoded_category_id = _THING_CLASS_ID
      instance_id = idx + 1
    else:
      # "Stuff" classes are remapped to a compact label space and carry no
      # instance id.
      encoded_category_id = category_id - _STUFF_CLASSES_OFFSET
      instance_id = _VOID_INSTANCE_ID

    segment_mask = (segments_encoded_mask == segment_id)
    semantic_segmentation_mask[segment_mask] = encoded_category_id

    if include_panoptic_masks:
      category_mask[segment_mask] = category_id
      instance_mask[segment_mask] = instance_id

  outputs = {
      'semantic_segmentation_mask': tfrecord_lib.encode_mask_as_png(
          semantic_segmentation_mask)
  }

  if include_panoptic_masks:
    outputs.update({
        'category_mask': tfrecord_lib.encode_mask_as_png(category_mask),
        'instance_mask': tfrecord_lib.encode_mask_as_png(instance_mask)
    })
  return outputs
def coco_annotations_to_lists(bbox_annotations, id_to_name_map,
                              image_height, image_width, include_masks):
  """Converts COCO annotations to feature lists.

  Args:
    bbox_annotations: list of COCO object-annotation dicts with keys 'bbox'
      ([x, y, width, height] in absolute pixels), 'iscrowd', 'category_id',
      'area' and, when `include_masks` is True, 'segmentation'.
    id_to_name_map: dict mapping category id to category name.
    image_height: image height in pixels, used to normalize y coordinates.
    image_width: image width in pixels, used to normalize x coordinates.
    include_masks: whether to also encode instance masks as PNG strings.

  Returns:
    A tuple `(data, num_annotations_skipped)`: `data` maps feature names to
    parallel lists (normalized 'xmin'/'xmax'/'ymin'/'ymax', 'is_crowd',
    'category_id', 'category_names' as UTF-8 bytes, 'area', and optionally
    'encoded_mask_png'); `num_annotations_skipped` counts annotations dropped
    for degenerate or out-of-bounds boxes.
  """
  # Dict comprehension replaces the dict((k, list()) for ...) construction.
  data = {k: [] for k in
          ['xmin', 'xmax', 'ymin', 'ymax', 'is_crowd',
           'category_id', 'category_names', 'area']}
  if include_masks:
    data['encoded_mask_png'] = []

  num_annotations_skipped = 0

  for object_annotations in bbox_annotations:
    (x, y, width, height) = tuple(object_annotations['bbox'])

    # Skip degenerate boxes and boxes extending past the image border.
    if width <= 0 or height <= 0:
      num_annotations_skipped += 1
      continue
    if x + width > image_width or y + height > image_height:
      num_annotations_skipped += 1
      continue
    data['xmin'].append(float(x) / image_width)
    data['xmax'].append(float(x + width) / image_width)
    data['ymin'].append(float(y) / image_height)
    data['ymax'].append(float(y + height) / image_height)
    data['is_crowd'].append(object_annotations['iscrowd'])
    category_id = int(object_annotations['category_id'])
    data['category_id'].append(category_id)
    data['category_names'].append(id_to_name_map[category_id].encode('utf8'))
    data['area'].append(object_annotations['area'])

    if include_masks:
      data['encoded_mask_png'].append(
          coco_segmentation_to_mask_png(object_annotations['segmentation'],
                                        image_height, image_width,
                                        object_annotations['iscrowd'])
      )
  return data, num_annotations_skipped
def bbox_annotations_to_feature_dict(
    bbox_annotations, image_height, image_width, id_to_name_map, include_masks):
  """Convert COCO annotations to an encoded feature dict."""
  data, num_skipped = coco_annotations_to_lists(
      bbox_annotations, id_to_name_map, image_height, image_width,
      include_masks)
  # Map each output feature name to the data list it is built from.
  feature_sources = [
      ('image/object/bbox/xmin', 'xmin'),
      ('image/object/bbox/xmax', 'xmax'),
      ('image/object/bbox/ymin', 'ymin'),
      ('image/object/bbox/ymax', 'ymax'),
      ('image/object/class/text', 'category_names'),
      ('image/object/class/label', 'category_id'),
      ('image/object/is_crowd', 'is_crowd'),
      ('image/object/area', 'area'),
  ]
  feature_dict = {
      feature_name: tfrecord_lib.convert_to_feature(data[key])
      for feature_name, key in feature_sources
  }
  if include_masks:
    feature_dict['image/object/mask'] = (
        tfrecord_lib.convert_to_feature(data['encoded_mask_png']))
  return feature_dict, num_skipped
def encode_caption_annotations(caption_annotations):
  """Returns UTF-8 encoded caption strings from caption annotation dicts."""
  return [annotation['caption'].encode('utf8')
          for annotation in caption_annotations]
def create_tf_example(image,
                      image_dirs,
                      panoptic_masks_dir=None,
                      bbox_annotations=None,
                      id_to_name_map=None,
                      caption_annotations=None,
                      panoptic_annotation=None,
                      is_category_thing=None,
                      include_panoptic_masks=False,
                      include_masks=False):
  """Converts image and annotations to a tf.Example proto.

  Args:
    image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
      u'width', u'date_captured', u'flickr_url', u'id']
    image_dirs: list of directories containing the image files.
    panoptic_masks_dir: `str` of the panoptic masks directory.
    bbox_annotations:
      list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
      u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
      coordinates in the official COCO dataset are given as [x, y, width,
      height] tuples using absolute coordinates where x, y represent the
      top-left (0-indexed) corner. This function converts to the format
      expected by the Tensorflow Object Detection API (which is
      [ymin, xmin, ymax, xmax] with coordinates normalized relative to image
      size).
    id_to_name_map: a dict mapping category IDs to string names.
    caption_annotations:
      list of dict with keys: [u'id', u'image_id', u'str'].
    panoptic_annotation: dict with keys: [u'image_id', u'file_name',
      u'segments_info']. Where the value for segments_info is a list of dicts,
      with each dict containing information for a single segment in the mask.
    is_category_thing: a dict with category ids as keys and 0/1 values marking
      "stuff" and "things" classes respectively (passed through to
      `generate_coco_panoptics_masks`).
    include_panoptic_masks: `bool`, whether to include panoptic masks.
    include_masks: Whether to include instance segmentations masks
      (PNG encoded) in the result. default: False.

  Returns:
    example: The converted tf.Example
    num_annotations_skipped: Number of (invalid) annotations that were ignored.

  Raises:
    ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
      does not exist, or is not unique across image directories.
  """
  image_height = image['height']
  image_width = image['width']
  filename = image['file_name']
  image_id = image['id']

  # With several candidate directories, the image must exist in exactly one.
  if len(image_dirs) > 1:
    full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs]
    full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
    if not full_existing_paths:
      raise ValueError(
          '{} does not exist across image directories.'.format(filename))
    if len(full_existing_paths) > 1:
      raise ValueError(
          '{} is not unique across image directories'.format(filename))
    full_path, = full_existing_paths
  # If there is only one image directory, it's not worth checking for existence,
  # since trying to open the file will raise an informative error message if it
  # does not exist.
  else:
    image_dir, = image_dirs
    full_path = os.path.join(image_dir, filename)

  with tf.io.gfile.GFile(full_path, 'rb') as fid:
    encoded_jpg = fid.read()

  feature_dict = tfrecord_lib.image_info_to_feature_dict(
      image_height, image_width, filename, image_id, encoded_jpg, 'jpg')

  num_annotations_skipped = 0

  # Each annotation kind is optional; features accumulate into feature_dict.
  if bbox_annotations:
    box_feature_dict, num_skipped = bbox_annotations_to_feature_dict(
        bbox_annotations, image_height, image_width, id_to_name_map,
        include_masks)
    num_annotations_skipped += num_skipped
    feature_dict.update(box_feature_dict)

  if caption_annotations:
    encoded_captions = encode_caption_annotations(caption_annotations)
    feature_dict.update(
        {'image/caption': tfrecord_lib.convert_to_feature(encoded_captions)})

  if panoptic_annotation:
    segments_info = panoptic_annotation['segments_info']
    panoptic_mask_filename = os.path.join(
        panoptic_masks_dir,
        panoptic_annotation['file_name'])
    encoded_panoptic_masks = generate_coco_panoptics_masks(
        segments_info, panoptic_mask_filename, include_panoptic_masks,
        is_category_thing)
    feature_dict.update(
        {'image/segmentation/class/encoded': tfrecord_lib.convert_to_feature(
            encoded_panoptic_masks['semantic_segmentation_mask'])})

    if include_panoptic_masks:
      feature_dict.update({
          'image/panoptic/category_mask': tfrecord_lib.convert_to_feature(
              encoded_panoptic_masks['category_mask']),
          'image/panoptic/instance_mask': tfrecord_lib.convert_to_feature(
              encoded_panoptic_masks['instance_mask'])
      })

  example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
  return example, num_annotations_skipped
def _load_object_annotations(object_annotations_file):
  """Loads object annotation JSON file."""
  with tf.io.gfile.GFile(object_annotations_file, 'r') as fid:
    obj_annotations = json.load(fid)

  images = obj_annotations['images']
  id_to_name_map = {
      category['id']: category['name']
      for category in obj_annotations['categories']
  }

  # Index annotations by the image they belong to.
  img_to_obj_annotation = collections.defaultdict(list)
  logging.info('Building bounding box index.')
  for annotation in obj_annotations['annotations']:
    img_to_obj_annotation[annotation['image_id']].append(annotation)

  missing_annotation_count = sum(
      1 for image in images if image['id'] not in img_to_obj_annotation)
  logging.info('%d images are missing bboxes.', missing_annotation_count)

  return img_to_obj_annotation, id_to_name_map
def _load_caption_annotations(caption_annotations_file):
  """Loads caption annotation JSON file."""
  with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
    caption_annotations = json.load(fid)

  # Index captions by the image they belong to.
  img_to_caption_annotation = collections.defaultdict(list)
  logging.info('Building caption index.')
  for annotation in caption_annotations['annotations']:
    img_to_caption_annotation[annotation['image_id']].append(annotation)

  missing_annotation_count = sum(
      1 for image in caption_annotations['images']
      if image['id'] not in img_to_caption_annotation)
  logging.info('%d images are missing captions.', missing_annotation_count)

  return img_to_caption_annotation
def _load_panoptic_annotations(panoptic_annotations_file):
  """Loads panoptic annotation from file."""
  with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid:
    panoptic_annotations = json.load(fid)

  logging.info('Building panoptic index.')
  # One panoptic annotation per image id.
  img_to_panoptic_annotation = {
      annotation['image_id']: annotation
      for annotation in panoptic_annotations['annotations']
  }

  is_category_thing = {
      category_info['id']: category_info['isthing'] == 1
      for category_info in panoptic_annotations['categories']
  }

  missing_annotation_count = sum(
      1 for image in panoptic_annotations['images']
      if image['id'] not in img_to_panoptic_annotation)
  logging.info(
      '%d images are missing panoptic annotations.', missing_annotation_count)

  return img_to_panoptic_annotation, is_category_thing
def _load_images_info(images_info_file):
  """Returns the 'images' list from an annotation/image-info JSON file."""
  with tf.io.gfile.GFile(images_info_file, 'r') as fid:
    info_dict = json.load(fid)
  return info_dict['images']
def generate_annotations(images, image_dirs,
                         panoptic_masks_dir=None,
                         img_to_obj_annotation=None,
                         img_to_caption_annotation=None,
                         img_to_panoptic_annotation=None,
                         is_category_thing=None,
                         id_to_name_map=None,
                         include_panoptic_masks=False,
                         include_masks=False):
  """Generator of per-image argument tuples for `create_tf_example`."""

  def _lookup(index, image_id):
    # A missing index (None) and a missing image id both yield None.
    return index.get(image_id, None) if index else None

  for image in images:
    image_id = image['id']
    yield (image, image_dirs, panoptic_masks_dir,
           _lookup(img_to_obj_annotation, image_id),
           id_to_name_map,
           _lookup(img_to_caption_annotation, image_id),
           _lookup(img_to_panoptic_annotation, image_id),
           is_category_thing, include_panoptic_masks, include_masks)
def _create_tf_record_from_coco_annotations(images_info_file,
                                            image_dirs,
                                            output_path,
                                            num_shards,
                                            object_annotations_file=None,
                                            caption_annotations_file=None,
                                            panoptic_masks_dir=None,
                                            panoptic_annotations_file=None,
                                            include_panoptic_masks=False,
                                            include_masks=False):
  """Converts COCO annotation JSON files into sharded tf.Record files.

  Args:
    images_info_file: JSON file containing image info. Exactly one tf.Example
      is written per image-info entry. Any of the train/val/test annotation
      JSON files works, e.g. 'image_info_test-dev2017.json',
      'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
    image_dirs: List of directories containing the image files.
    output_path: Prefix path for the output tf.Record files.
    num_shards: Number of output shard files to create.
    object_annotations_file: JSON file containing bounding box annotations.
    caption_annotations_file: JSON file containing caption annotations.
    panoptic_masks_dir: Directory containing panoptic masks.
    panoptic_annotations_file: JSON file containing panoptic annotations.
    include_panoptic_masks: Whether to include 'category_mask' and
      'instance_mask', which are required by the panoptic quality evaluator.
    include_masks: Whether to include instance segmentation masks
      (PNG encoded) in the result. Default: False.
  """
  logging.info('writing to output path: %s', output_path)
  images = _load_images_info(images_info_file)

  # Each optional annotation source stays None unless its file was supplied.
  img_to_obj_annotation = None
  id_to_name_map = None
  if object_annotations_file:
    img_to_obj_annotation, id_to_name_map = _load_object_annotations(
        object_annotations_file)

  img_to_caption_annotation = None
  if caption_annotations_file:
    img_to_caption_annotation = _load_caption_annotations(
        caption_annotations_file)

  img_to_panoptic_annotation = None
  is_category_thing = None
  if panoptic_annotations_file:
    img_to_panoptic_annotation, is_category_thing = _load_panoptic_annotations(
        panoptic_annotations_file)

  coco_annotations_iter = generate_annotations(
      images=images,
      image_dirs=image_dirs,
      panoptic_masks_dir=panoptic_masks_dir,
      img_to_obj_annotation=img_to_obj_annotation,
      img_to_caption_annotation=img_to_caption_annotation,
      img_to_panoptic_annotation=img_to_panoptic_annotation,
      is_category_thing=is_category_thing,
      id_to_name_map=id_to_name_map,
      include_panoptic_masks=include_panoptic_masks,
      include_masks=include_masks)

  num_skipped = tfrecord_lib.write_tf_record_dataset(
      output_path, coco_annotations_iter, create_tf_example, num_shards)
  logging.info('Finished writing, skipped %d annotations.', num_skipped)
def main(_):
  """Entry point: validates flags and writes the COCO TFRecord dataset."""
  assert FLAGS.image_dir, '`image_dir` missing.'
  assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
          FLAGS.caption_annotations_file), ('All annotation files are '
                                            'missing.')
  # Prefer the dedicated image-info file; otherwise fall back to whichever
  # annotation file was supplied (they also contain the image info).
  images_info_file = (FLAGS.image_info_file or
                      FLAGS.object_annotations_file or
                      FLAGS.caption_annotations_file)

  directory = os.path.dirname(FLAGS.output_file_prefix)
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)

  _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
                                          FLAGS.output_file_prefix,
                                          FLAGS.num_shards,
                                          FLAGS.object_annotations_file,
                                          FLAGS.caption_annotations_file,
                                          FLAGS.panoptic_masks_dir,
                                          FLAGS.panoptic_annotations_file,
                                          FLAGS.include_panoptic_masks,
                                          FLAGS.include_masks)


if __name__ == '__main__':
  app.run(main)
#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.
#
# Usage: [-i <base_image_dir>] [-o <output_dir>]
#   -i  directory containing the COCO train2014/ and val2014/ image folders
#   -o  directory the TFRecord shards are written to

# Scratch directory for downloaded/processed JSON files; removed at the end.
tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
base_image_dir="/tmp/coco_images"
output_dir="/tmp/coco_few_shot"

# Parse optional -i/-o overrides for the defaults above.
while getopts ":i:o:" o; do
  case "${o}" in
    o) output_dir=${OPTARG} ;;
    i) base_image_dir=${OPTARG} ;;
    *) echo "Usage: ${0} [-i <base_image_dir>] [-o <output_dir>]" 1>&2; exit 1 ;;
  esac
done

# Mirror only the split JSON files we need from the cocosplit host.
cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
  -P "${tmp_dir}" -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
  "http://${cocosplit_url}/"
# Flatten wget's mirrored host/path directory structure into ${tmp_dir}.
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

# Fuse per-category JSON files into one file per (shots, seed) setting and
# produce the base-class-only splits.
python process_coco_few_shot_json_files.py \
  --logtostderr --workdir="${tmp_dir}"

# One TFRecord dataset per (shots, seed) few-shot setting.
for seed in {0..9}; do
  for shots in 1 3 5 10 30; do
    python create_coco_tf_record.py \
      --logtostderr \
      --image_dir="${base_image_dir}/train2014" \
      --image_dir="${base_image_dir}/val2014" \
      --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
      --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
      --caption_annotations_file="" \
      --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
      --num_shards=4
  done
done

# Full 5k validation split (all classes).
python create_coco_tf_record.py \
  --logtostderr \
  --image_dir="${base_image_dir}/train2014" \
  --image_dir="${base_image_dir}/val2014" \
  --image_info_file="${tmp_dir}/datasplit/5k.json" \
  --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
  --caption_annotations_file="" \
  --output_file_prefix="${output_dir}/5k" \
  --num_shards=10

# Base-class-only training split.
python create_coco_tf_record.py \
  --logtostderr \
  --image_dir="${base_image_dir}/train2014" \
  --image_dir="${base_image_dir}/val2014" \
  --image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
  --object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \
  --caption_annotations_file="" \
  --output_file_prefix="${output_dir}/trainvalno5k_base" \
  --num_shards=200

# Base-class-only validation split.
python create_coco_tf_record.py \
  --logtostderr \
  --image_dir="${base_image_dir}/train2014" \
  --image_dir="${base_image_dir}/val2014" \
  --image_info_file="${tmp_dir}/datasplit/5k_base.json" \
  --object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \
  --caption_annotations_file="" \
  --output_file_prefix="${output_dir}/5k_base" \
  --num_shards=10

# Clean up scratch space.
rm -rf "${tmp_dir}"
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import collections
import itertools
import json
import logging
import os
from absl import app
from absl import flags
import tensorflow as tf
logger = tf.get_logger()
logger.setLevel(logging.INFO)
flags.DEFINE_string('workdir', None, 'Working directory.')
FLAGS = flags.FLAGS
# The 80 COCO object categories, in alphabetical order.
CATEGORIES = [
    'airplane', 'apple', 'backpack', 'banana', 'baseball bat',
    'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird',
    'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake',
    'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch',
    'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
    'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier',
    'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife',
    'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven',
    'parking meter', 'person', 'pizza', 'potted plant',
    'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
    'sink', 'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball',
    'stop sign', 'suitcase', 'surfboard', 'teddy bear',
    'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush',
    'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
    'wine glass', 'zebra',
]
SEEDS = list(range(10))
SHOTS = [1, 3, 5, 10, 30]

# http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
#
#   datasplit/
#     trainvalno5k.json
#     5k.json
#   full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#   seed{1-9}/
#     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#
# The JSON files for seed0 live in the root directory rather than in a
# `seed?/` subdirectory, hence the conditional prefix below.
FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  _prefix = '' if _seed == 0 else 'seed{}/'.format(_seed)
  FILE_SUFFIXES[(_seed, _shots)] = [
      '{}full_box_{}shot_{}_trainval.json'.format(_prefix, _shots, _category)
      for _category in CATEGORIES
  ]

# Base class IDs, as defined in
# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65
BASE_CLASS_IDS = [8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
                  35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51,
                  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 73, 74, 75,
                  76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
def main(unused_argv):
  """Builds base-class-filtered splits and fused per-setting few-shot files.

  Reads the cocosplit JSON files from FLAGS.workdir and writes:
    * `datasplit/{trainvalno5k,5k}_base.json` with only base-class annotations.
    * `{shots}shot_seed{seed}.json` fusing the per-category few-shot files.
  """
  workdir = FLAGS.workdir
  # Filter novel class annotations from the training and validation sets.
  for name in ('trainvalno5k', '5k'):
    file_path = os.path.join(workdir, 'datasplit', '{}.json'.format(name))
    with tf.io.gfile.GFile(file_path, 'r') as f:
      json_dict = json.load(f)
    json_dict['annotations'] = [a for a in json_dict['annotations']
                                if a['category_id'] in BASE_CLASS_IDS]
    output_path = os.path.join(
        workdir, 'datasplit', '{}_base.json'.format(name))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(json_dict, f)

  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [os.path.join(workdir, suffix)
                  for suffix in FILE_SUFFIXES[(seed, shots)]]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))

    # Make sure that all JSON files for a given seed and shots setting have the
    # same metadata. We count on this to fuse them later on.
    metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
                       'categories': d['categories']} for d in json_dicts]
    if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
      # Fixed typo in the error message: 'licences' -> 'licenses' (matches the
      # JSON key actually compared above).
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licenses, or categories fields')

    # Retrieve images across all JSON files.
    images = sum((d['images'] for d in json_dicts), [])
    # Remove duplicate image entries (dict keeps the last entry per id).
    images = list({image['id']: image for image in images}.values())
    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': sum((d['annotations'] for d in json_dicts), [])
    }

    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s',
                shots, seed, output_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('workdir')
  app.run(main)
#!/bin/bash
#
# Downloads COCO 2017 (images plus instance and panoptic annotations) into the
# directory given as $1 and converts both splits into TFRecord shards.
#
# Usage: <script> <data_dir>

sudo apt update
sudo apt install unzip aria2 -y

DATA_DIR=$1

# Download the four archives in parallel (8 connections).
aria2c -j 8 -Z \
  http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
  http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
  http://images.cocodataset.org/zips/train2017.zip \
  http://images.cocodataset.org/zips/val2017.zip \
  --dir=$DATA_DIR;

# Extract everything, then park the downloaded archives under $DATA_DIR/zips.
unzip $DATA_DIR/"*".zip -d $DATA_DIR;
mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips;

# The panoptic masks ship as nested zips inside the annotations archive.
unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR
unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR

# Validation split: 8 shards, with instance and panoptic masks.
python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/val2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/val" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
  --num_shards=8 \
  --include_masks \
  --include_panoptic_masks

# Training split: 32 shards, with instance and panoptic masks.
python3 official/vision/beta/data/create_coco_tf_record.py \
  --logtostderr \
  --image_dir="$DATA_DIR/train2017" \
  --object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
  --output_file_prefix="$DATA_DIR/tfrecords/train" \
  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
  --panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
  --num_shards=32 \
  --include_masks \
  --include_panoptic_masks
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for creating TFRecord datasets."""
import hashlib
import io
import itertools
from absl import logging
import numpy as np
from PIL import Image
import tensorflow as tf
import multiprocessing as mp
def convert_to_feature(value, value_type=None):
  """Converts the given python object to a tf.train.Feature.

  Args:
    value: int, float, bytes or a list of them.
    value_type: optional; when given, forces the feature to the named type
      instead of inferring it. One of
      ['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list'].

  Returns:
    feature: A tf.train.Feature object.

  Raises:
    ValueError: If the value's type cannot be inferred, or `value_type` names
      an unknown type.
  """
  if value_type is None:
    # Infer the element type from the value (or its first element).
    element = value[0] if isinstance(value, list) else value
    if isinstance(element, bytes):
      inferred = 'bytes'
    elif isinstance(element, (int, np.integer)):
      inferred = 'int64'
    elif isinstance(element, (float, np.floating)):
      inferred = 'float'
    else:
      raise ValueError('Cannot convert type {} to feature'.
                       format(type(element)))
    value_type = inferred + '_list' if isinstance(value, list) else inferred

  if value_type == 'int64':
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
  if value_type == 'int64_list':
    flat = np.asarray(value).astype(np.int64).reshape(-1)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=flat))
  if value_type == 'float':
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
  if value_type == 'float_list':
    flat = np.asarray(value).astype(np.float32).reshape(-1)
    return tf.train.Feature(float_list=tf.train.FloatList(value=flat))
  if value_type == 'bytes':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
  if value_type == 'bytes_list':
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
  raise ValueError('Unknown value_type parameter - {}'.format(value_type))
def image_info_to_feature_dict(height, width, filename, image_id,
                               encoded_str, encoded_format):
  """Convert image information to a dict of features."""
  # SHA-256 of the encoded bytes acts as a stable per-image content key.
  sha_key = hashlib.sha256(encoded_str).hexdigest()
  features = {
      'image/height': convert_to_feature(height),
      'image/width': convert_to_feature(width),
      'image/filename': convert_to_feature(filename.encode('utf8')),
      'image/source_id': convert_to_feature(str(image_id).encode('utf8')),
      'image/key/sha256': convert_to_feature(sha_key.encode('utf8')),
      'image/encoded': convert_to_feature(encoded_str),
      'image/format': convert_to_feature(encoded_format.encode('utf8')),
  }
  return features
def read_image(image_path):
  """Reads the image at `image_path` into a numpy array."""
  return np.asarray(Image.open(image_path))
def encode_mask_as_png(mask):
  """PNG-encodes a numpy mask array and returns the encoded bytes."""
  buffer = io.BytesIO()
  Image.fromarray(mask).save(buffer, format='PNG')
  return buffer.getvalue()
def write_tf_record_dataset(output_path, annotation_iterator,
                            process_func, num_shards,
                            use_multiprocessing=True, unpack_arguments=True):
  """Iterates over annotations, processes them and writes into TFRecords.

  Fix over the previous version: the shard writers and the multiprocessing
  pool are now released in a `finally` block, so they no longer leak when
  `process_func` (or iteration) raises part-way through.

  Args:
    output_path: The prefix path to create TF record files.
    annotation_iterator: An iterator of tuples containing details about the
      dataset.
    process_func: A function which takes the elements from the tuples of
      annotation_iterator as arguments and returns a tuple of
      (tf.train.Example, int). The integer indicates the number of annotations
      that were skipped.
    num_shards: int, the number of shards to write for the dataset.
    use_multiprocessing: Whether or not to use multiple processes to write
      TF Records.
    unpack_arguments: Whether to unpack the tuples from annotation_iterator as
      individual arguments to the process func or to pass the returned value
      as it is.

  Returns:
    num_skipped: The total number of skipped annotations.
  """
  writers = [
      tf.io.TFRecordWriter(
          output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
      for i in range(num_shards)
  ]

  total_num_annotations_skipped = 0

  pool = None
  if use_multiprocessing:
    pool = mp.Pool()
    if unpack_arguments:
      tf_example_iterator = pool.starmap(process_func, annotation_iterator)
    else:
      tf_example_iterator = pool.imap(process_func, annotation_iterator)
  else:
    if unpack_arguments:
      tf_example_iterator = itertools.starmap(process_func, annotation_iterator)
    else:
      tf_example_iterator = map(process_func, annotation_iterator)

  try:
    for idx, (tf_example, num_annotations_skipped) in enumerate(
        tf_example_iterator):
      if idx % 100 == 0:
        logging.info('On image %d', idx)
      total_num_annotations_skipped += num_annotations_skipped
      # Examples are distributed round-robin over the shard writers.
      writers[idx % num_shards].write(tf_example.SerializeToString())
  finally:
    # Release worker processes and flush/close every shard file, even if
    # processing failed part-way through.
    if pool is not None:
      pool.close()
      pool.join()
    for writer in writers:
      writer.close()

  logging.info('Finished writing, skipped %d annotations.',
               total_num_annotations_skipped)
  return total_num_annotations_skipped
def check_and_make_dir(directory):
  """Creates `directory` (and parents) unless it already exists."""
  if tf.io.gfile.isdir(directory):
    return
  tf.io.gfile.makedirs(directory)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfrecord_lib."""
import os
from absl import flags
from absl.testing import parameterized
import tensorflow as tf
from official.vision.data import tfrecord_lib
FLAGS = flags.FLAGS
def process_sample(x):
  """Wraps one feature into a (tf.train.Example, num_skipped) pair."""
  features = tf.train.Features(feature={'x': x})
  return tf.train.Example(features=features), 0
def parse_function(example_proto):
  """Parses a serialized example holding a single int64 feature 'x'."""
  schema = {'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1)}
  return tf.io.parse_single_example(example_proto, schema)
class TfrecordLibTest(parameterized.TestCase):
  """Unit tests for the tfrecord_lib helpers."""

  def test_write_tf_record_dataset(self):
    annotations = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)]
    output_prefix = os.path.join(FLAGS.test_tmpdir, 'train')
    tfrecord_lib.write_tf_record_dataset(
        output_prefix, annotations, process_sample, 3,
        use_multiprocessing=False)

    # Every shard file must exist and together hold all 17 values.
    shard_files = tf.io.gfile.glob(output_prefix + '*')
    self.assertLen(shard_files, 3)

    dataset = tf.data.TFRecordDataset(shard_files).map(parse_function)
    read_values = set(record['x'] for record in dataset.as_numpy_iterator())
    self.assertSetEqual(read_values, set(range(17)))

  def test_convert_to_feature_float(self):
    feature = tfrecord_lib.convert_to_feature(0.0)
    self.assertEqual(feature.float_list.value[0], 0.0)

  def test_convert_to_feature_int(self):
    feature = tfrecord_lib.convert_to_feature(0)
    self.assertEqual(feature.int64_list.value[0], 0)

  def test_convert_to_feature_bytes(self):
    feature = tfrecord_lib.convert_to_feature(b'123')
    self.assertEqual(feature.bytes_list.value[0], b'123')

  def test_convert_to_feature_float_list(self):
    feature = tfrecord_lib.convert_to_feature([0.0, 1.0])
    self.assertSequenceAlmostEqual(feature.float_list.value, [0.0, 1.0])

  def test_convert_to_feature_int_list(self):
    feature = tfrecord_lib.convert_to_feature([0, 1])
    self.assertSequenceAlmostEqual(feature.int64_list.value, [0, 1])

  def test_convert_to_feature_bytes_list(self):
    feature = tfrecord_lib.convert_to_feature([b'123', b'456'])
    self.assertSequenceAlmostEqual(feature.bytes_list.value, [b'123', b'456'])


if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification decoder and parser."""
from typing import Any, Dict, List, Optional
# Import libraries
import tensorflow as tf
from official.vision.configs import common
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import augment
from official.vision.ops import preprocess_ops
# Per-channel normalization constants, expressed on a [0, 255] pixel scale.
# NOTE(review): the 0.x factors match the commonly used ImageNet channel
# statistics — confirm if reusing with other datasets.
MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
# Default tf.Example feature keys for the encoded image and its label.
DEFAULT_IMAGE_FIELD_KEY = 'image/encoded'
DEFAULT_LABEL_FIELD_KEY = 'image/class/label'
class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               is_multilabel: bool = False,
               keys_to_features: Optional[Dict[str, Any]] = None):
    """Initializes the decoder.

    Args:
      image_field_key: tf.Example feature key of the encoded image bytes.
      label_field_key: tf.Example feature key of the label.
      is_multilabel: Whether each example carries multiple labels; if True the
        label is parsed as a variable-length int64 feature, otherwise as a
        scalar int64 with default -1.
      keys_to_features: Optional custom feature spec. When omitted (or empty),
        a default spec containing only the image key is used. The label spec
        is always added on top.
    """
    if not keys_to_features:
      keys_to_features = {
          image_field_key:
              tf.io.FixedLenFeature((), tf.string, default_value=''),
      }
    else:
      # Copy so the label entry below never mutates the caller's dict
      # (previously `.update(...)` modified the argument in place).
      keys_to_features = dict(keys_to_features)
    if is_multilabel:
      keys_to_features[label_field_key] = tf.io.VarLenFeature(dtype=tf.int64)
    else:
      keys_to_features[label_field_key] = tf.io.FixedLenFeature(
          (), tf.int64, default_value=-1)
    self._keys_to_features = keys_to_features

  def decode(self, serialized_example):
    """Parses one serialized tf.Example into a dict of tensors."""
    return tf.io.parse_single_example(
        serialized_example, self._keys_to_features)
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size: List[int],
               num_classes: float,
               image_field_key: str = DEFAULT_IMAGE_FIELD_KEY,
               label_field_key: str = DEFAULT_LABEL_FIELD_KEY,
               decode_jpeg_only: bool = True,
               aug_rand_hflip: bool = True,
               aug_type: Optional[common.Augmentation] = None,
               color_jitter: float = 0.,
               random_erasing: Optional[common.RandomErasing] = None,
               is_multilabel: bool = False,
               dtype: str = 'float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      num_classes: `float`, number of classes.
      image_field_key: `str`, the key name to encoded image in tf.Example.
      label_field_key: `str`, the key name to label in tf.Example.
      decode_jpeg_only: `bool`, if True, only JPEG format is decoded, this is
        faster than decoding other types. Default is True.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      color_jitter: Magnitude of color jitter. If > 0, the value is used to
        generate random scale factor for brightness, contrast and saturation.
        See `preprocess_ops.color_jitter` for more details.
      random_erasing: if not None, augment input image by random erasing. See
        `augment.RandomErasing` for more details.
      is_multilabel: A `bool`, whether or not each example has multiple labels.
      dtype: `str`, cast output image in dtype. It can be 'float32', 'float16',
        or 'bfloat16'.
    """
    self._output_size = output_size
    self._aug_rand_hflip = aug_rand_hflip
    # NOTE(review): num_classes is annotated/documented as float but is used
    # as the one-hot depth below — presumably an int; confirm with callers.
    self._num_classes = num_classes
    self._image_field_key = image_field_key
    if dtype == 'float32':
      self._dtype = tf.float32
    elif dtype == 'float16':
      self._dtype = tf.float16
    elif dtype == 'bfloat16':
      self._dtype = tf.bfloat16
    else:
      raise ValueError('dtype {!r} is not supported!'.format(dtype))
    # Build the (optional) AutoAugment/RandAugment policy once at init time.
    if aug_type:
      if aug_type.type == 'autoaug':
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        self._augmenter = augment.RandAugment(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError('Augmentation policy {} not supported.'.format(
            aug_type.type))
    else:
      self._augmenter = None
    self._label_field_key = label_field_key
    self._color_jitter = color_jitter
    if random_erasing:
      self._random_erasing = augment.RandomErasing(
          probability=random_erasing.probability,
          min_area=random_erasing.min_area,
          max_area=random_erasing.max_area,
          min_aspect=random_erasing.min_aspect,
          max_aspect=random_erasing.max_aspect,
          min_count=random_erasing.min_count,
          max_count=random_erasing.max_count,
          trials=random_erasing.trials)
    else:
      self._random_erasing = None
    self._is_multilabel = is_multilabel
    self._decode_jpeg_only = decode_jpeg_only

  def _parse_train_data(self, decoded_tensors):
    """Parses data for training."""
    image = self._parse_train_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      # Densify the variable-length label and fold it into a multi-hot vector.
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_eval_data(self, decoded_tensors):
    """Parses data for evaluation."""
    image = self._parse_eval_image(decoded_tensors)
    label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32)
    if self._is_multilabel:
      # Same multi-hot conversion as in training.
      if isinstance(label, tf.sparse.SparseTensor):
        label = tf.sparse.to_dense(label)
      label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0)
    return image, label

  def _parse_train_image(self, decoded_tensors):
    """Parses image data for training."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image_v2(
          image_bytes, image_shape)
      # If the random crop degenerated to the full image, fall back to a
      # center crop instead.
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
          lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
          lambda: cropped_image)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Crops image.
      cropped_image = preprocess_ops.random_crop_image(image)
      # Same full-image fallback as the JPEG-only path above.
      image = tf.cond(
          tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
          lambda: preprocess_ops.center_crop_image(image),
          lambda: cropped_image)

    if self._aug_rand_hflip:
      image = tf.image.random_flip_left_right(image)

    # Color jitter.
    if self._color_jitter > 0:
      image = preprocess_ops.color_jitter(image, self._color_jitter,
                                          self._color_jitter,
                                          self._color_jitter)

    # Resizes image.
    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image,
                                           offset=MEAN_RGB,
                                           scale=STDDEV_RGB)

    # Random erasing after the image has been normalized
    if self._random_erasing is not None:
      image = self._random_erasing.distort(image)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  def _parse_eval_image(self, decoded_tensors):
    """Parses image data for evaluation."""
    image_bytes = decoded_tensors[self._image_field_key]

    if self._decode_jpeg_only:
      image_shape = tf.image.extract_jpeg_shape(image_bytes)

      # Center crops.
      image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
    else:
      # Decodes image.
      image = tf.io.decode_image(image_bytes, channels=3)
      image.set_shape([None, None, 3])

      # Center crops.
      image = preprocess_ops.center_crop_image(image)

    image = tf.image.resize(
        image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
    image.set_shape([self._output_size[0], self._output_size[1], 3])

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image,
                                           offset=MEAN_RGB,
                                           scale=STDDEV_RGB)

    # Convert image to self._dtype.
    image = tf.image.convert_image_dtype(image, self._dtype)

    return image

  @classmethod
  def inference_fn(cls,
                   image: tf.Tensor,
                   input_image_size: List[int],
                   num_channels: int = 3) -> tf.Tensor:
    """Builds image model inputs for serving."""
    image = tf.cast(image, dtype=tf.float32)
    image = preprocess_ops.center_crop_image(image)
    image = tf.image.resize(
        image, input_image_size, method=tf.image.ResizeMethod.BILINEAR)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(
        image, offset=MEAN_RGB, scale=STDDEV_RGB)
    image.set_shape(input_image_size + [num_channels])
    return image
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic decoder interface."""
import abc
class Decoder(metaclass=abc.ABCMeta):
  """Decodes the raw data into tensors.

  Note: the previous version set the Python 2 `__metaclass__` attribute,
  which is ignored in Python 3, so `decode` was never actually enforced as
  abstract; `metaclass=abc.ABCMeta` restores the intended contract.
  """

  @abc.abstractmethod
  def decode(self, serialized_example):
    """Decodes the serialized example into tensors.

    Args:
      serialized_example: a serialized string tensor that encodes the data.

    Returns:
      decoded_tensors: a dict of Tensors.
    """
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Dataset reader for vision model garden."""
from typing import Any, Callable, Optional, Tuple
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
def calculate_batch_sizes(total_batch_size: int,
                          pseudo_label_ratio: float) -> Tuple[int, int]:
  """Splits a total batch size between labeled and pseudo-labeled data.

  The pseudo-labeled share of the batch is
  `pseudo_label_ratio / (1 + pseudo_label_ratio)` of the total, rounded to
  the nearest integer; the labeled share is the remainder.

  Args:
    total_batch_size: The total batch size for all data.
    pseudo_label_ratio: A non-negative float ratio of pseudo-labeled
      to labeled data in a batch.

  Returns:
    (labeled_batch_size, pseudo_labeled_batch_size) as ints.

  Raises:
    ValueError: If total_batch_size is negative.
    ValueError: If pseudo_label_ratio is negative.
  """
  if total_batch_size < 0:
    raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size))
  if pseudo_label_ratio < 0.0:
    raise ValueError(
        'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio))
  pseudo_fraction = pseudo_label_ratio / (1.0 + pseudo_label_ratio)
  num_pseudo_labeled = int(round(total_batch_size * pseudo_fraction))
  return total_batch_size - num_pseudo_labeled, num_pseudo_labeled
class CombinationDatasetInputReader(input_reader.InputReader):
  """Combination dataset input reader.

  Mixes a labeled dataset with a pseudo-labeled dataset: each sub-dataset is
  batched separately (sizes derived from `pseudo_label_data.data_ratio`) and
  the two batches are concatenated along the batch dimension.
  """

  def __init__(self,
               params: cfg.DataConfig,
               dataset_fn=tf.data.TFRecordDataset,
               pseudo_label_dataset_fn=tf.data.TFRecordDataset,
               decoder_fn: Optional[Callable[..., Any]] = None,
               sample_fn: Optional[Callable[..., Any]] = None,
               parser_fn: Optional[Callable[..., Any]] = None,
               transform_and_batch_fn: Optional[Callable[
                   [tf.data.Dataset, Optional[tf.distribute.InputContext]],
                   tf.data.Dataset]] = None,
               postprocess_fn: Optional[Callable[..., Any]] = None):
    """Initializes a CombinationDatasetInputReader instance.

    This class mixes a labeled and pseudo-labeled dataset. The params
    must contain "pseudo_label_data.input_path" to specify the
    pseudo-label dataset files and "pseudo_label_data.data_ratio"
    to specify a per-batch mixing ratio of pseudo-label examples to
    labeled dataset examples.

    Args:
      params: A config_definitions.DataConfig object.
      dataset_fn: A `tf.data.Dataset` that consumes the input files. For
        example, it can be `tf.data.TFRecordDataset`.
      pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the
        pseudo-label input files. For example, it can be
        `tf.data.TFRecordDataset`.
      decoder_fn: An optional `callable` that takes the serialized data string
        and decodes them into the raw tensor dictionary.
      sample_fn: An optional `callable` that takes a `tf.data.Dataset` object as
        input and outputs the transformed dataset. It performs sampling on the
        decoded raw tensors dict before the parser_fn.
      parser_fn: An optional `callable` that takes the decoded raw tensors dict
        and parse them into a dictionary of tensors that can be consumed by the
        model. It will be executed after decoder_fn.
      transform_and_batch_fn: An optional `callable` that takes a
        `tf.data.Dataset` object and an optional `tf.distribute.InputContext` as
        input, and returns a `tf.data.Dataset` object. It will be executed after
        `parser_fn` to transform and batch the dataset; if None, after
        `parser_fn` is executed, the dataset will be batched into per-replica
        batch size.
      postprocess_fn: An optional `callable` that processes batched tensors. It
        will be executed after batching.

    Raises:
      ValueError: If drop_remainder is False.
    """
    super().__init__(params=params,
                     dataset_fn=dataset_fn,
                     decoder_fn=decoder_fn,
                     sample_fn=sample_fn,
                     parser_fn=parser_fn,
                     transform_and_batch_fn=transform_and_batch_fn,
                     postprocess_fn=postprocess_fn)
    # Pseudo-label dataset configuration, read from params.pseudo_label_data.
    self._pseudo_label_file_pattern = params.pseudo_label_data.input_path
    self._pseudo_label_dataset_fn = pseudo_label_dataset_fn
    self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio
    self._pseudo_label_matched_files = input_reader.match_files(
        self._pseudo_label_file_pattern)
    # The two sub-datasets are batched separately and concatenated per batch,
    # so partial batches cannot be mixed; drop_remainder is therefore required.
    if not self._drop_remainder:
      raise ValueError(
          'Must use drop_remainder=True with CombinationDatasetInputReader')

  def read(
      self,
      input_context: Optional[tf.distribute.InputContext] = None
  ) -> tf.data.Dataset:
    """Generates a tf.data.Dataset object."""
    # Split the global batch size between the labeled and pseudo-labeled
    # datasets according to the configured mixing ratio.
    labeled_batch_size, pl_batch_size = calculate_batch_sizes(
        self._global_batch_size, self._pseudo_label_data_ratio)
    # NOTE(review): this only rejects labeled_batch_size == 0 with a non-zero
    # pl_batch_size; a pl_batch_size of 0 (data_ratio == 0) passes through --
    # confirm whether batching the pseudo-label dataset with size 0 is intended.
    if not labeled_batch_size and pl_batch_size:
      raise ValueError(
          'Invalid batch_size: {} and pseudo_label_data_ratio: {}, '
          'resulting in a 0 batch size for one of the datasets.'.format(
              self._global_batch_size, self._pseudo_label_data_ratio))

    def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size,
                                       input_context, tfds_builder):
      # Reads the source files, then decodes/parses and batches to batch_size.
      dataset = self._read_data_source(matched_files, dataset_fn, input_context,
                                       tfds_builder)
      return self._decode_and_parse_dataset(dataset, batch_size, input_context)

    labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._matched_files,
        dataset_fn=self._dataset_fn,
        batch_size=labeled_batch_size,
        input_context=input_context,
        tfds_builder=self._tfds_builder)
    # The pseudo-label dataset always comes from matched files, never TFDS.
    pseudo_labeled_dataset = _read_decode_and_parse_dataset(
        matched_files=self._pseudo_label_matched_files,
        dataset_fn=self._pseudo_label_dataset_fn,
        batch_size=pl_batch_size,
        input_context=input_context,
        tfds_builder=False)

    def concat_fn(d1, d2):
      # Concatenates the two per-dataset batches leaf-by-leaf along the batch
      # dimension, producing one batch of the global batch size.
      return tf.nest.map_structure(
          lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2)

    dataset_concat = tf.data.Dataset.zip(
        (labeled_dataset, pseudo_labeled_dataset))
    dataset_concat = dataset_concat.map(
        concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    def maybe_map_fn(dataset, fn):
      # Applies `fn` via Dataset.map when provided; identity when fn is None.
      return dataset if fn is None else dataset.map(
          fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Postprocess runs on the already-concatenated (mixed) batches.
    dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn)
    dataset_concat = self._maybe_apply_data_service(dataset_concat,
                                                    input_context)
    if self._deterministic is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self._deterministic
      dataset_concat = dataset_concat.with_options(options)
    return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Factory for getting TF-Vision input readers."""
from official.common import dataset_fn as dataset_fn_util
from official.core import config_definitions as cfg
from official.core import input_reader as core_input_reader
from official.vision.dataloaders import input_reader as vision_input_reader
def input_reader_generator(params: cfg.DataConfig,
                           **kwargs) -> core_input_reader.InputReader:
  """Instantiates an input reader class according to the params.

  Training configs that carry a `pseudo_label_data` section get a
  `CombinationDatasetInputReader` that mixes labeled and pseudo-labeled
  examples; everything else gets the core `InputReader`.

  Args:
    params: A config_definitions.DataConfig object.
    **kwargs: Additional arguments passed to input reader initialization.

  Returns:
    An InputReader object.
  """
  uses_pseudo_labels = params.is_training and params.get(
      'pseudo_label_data', False)
  if not uses_pseudo_labels:
    return core_input_reader.InputReader(params, **kwargs)
  pseudo_dataset_fn = dataset_fn_util.pick_dataset_fn(
      params.pseudo_label_data.file_type)
  return vision_input_reader.CombinationDatasetInputReader(
      params,
      pseudo_label_dataset_fn=pseudo_dataset_fn,
      **kwargs)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Mask R-CNN."""
# Import libraries
import tensorflow as tf
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               include_mask=False,
               mask_crop_size=112,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added
        on each level. For instances, num_scales=2 adds one additional
        intermediate anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float`, forwarded to `anchor.RpnAnchorLabeler`;
        anchors scoring at or above it are labeled positive for RPN training.
      rpn_unmatched_threshold: `float`, forwarded to `anchor.RpnAnchorLabeler`;
        anchors scoring below it are labeled negative for RPN training.
      rpn_batch_size_per_im: `int`, forwarded to `anchor.RpnAnchorLabeler`;
        number of anchors sampled per image for RPN training.
      rpn_fg_fraction: `float`, forwarded to `anchor.RpnAnchorLabeler`;
        fraction of sampled anchors that should be foreground.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled with
        `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      include_mask: a bool to indicate whether parse mask groundtruth.
      mask_crop_size: the size which groundtruth mask is cropped to.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    # Target assigning.
    self._rpn_match_threshold = rpn_match_threshold
    self._rpn_unmatched_threshold = rpn_unmatched_threshold
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction
    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    # Mask.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size
    # Image output dtype.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
          in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
          image that is fed to the network. The tensor is padded with -1 to
          the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size. Only present
          when include_mask is True.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = preprocess_ops.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(
            image, boxes)
    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()
    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Fix: the mask-cropping block below used to run unconditionally and
      # referenced `masks`, which is undefined when include_mask is False
      # (NameError). It is now guarded by include_mask.
      # Transfer boxes to the original image space and do normalization.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)
    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        anchor_boxes, boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))
    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)
    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes':
            anchor_boxes,
        'image_info':
            image_info,
        'rpn_score_targets':
            rpn_score_targets,
        'rpn_box_targets':
            rpn_box_targets,
        'gt_boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                     self._max_num_instances,
                                                     -1),
        'gt_classes':
            preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                     self._max_num_instances,
                                                     -1),
    }
    if self._include_mask:
      labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
          masks, self._max_num_instances, -1)
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        image: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for training. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height, scaled_width],
            [y_scale, x_scale], [y_offset, x_offset]].
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, 4] representing anchor boxes at each
            level.
    """
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Resizes and crops image; no scale augmentation at eval time.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()
    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)
    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape)
    # Compute Anchor boxes.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    labels = {
        'image_info': image_info,
        'anchor_boxes': anchor_boxes,
    }
    # Groundtruths kept in the original image space for COCO-style evaluation.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    labels['groundtruths'] = groundtruths
    return image, labels
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic parser interface."""
import abc
class Parser(metaclass=abc.ABCMeta):
  """Parses data and produces tensors to be consumed by models.

  Abstract interface: subclasses implement `_parse_train_data` and
  `_parse_eval_data`; `parse_fn` dispatches between them by training mode.

  Note: the original code set the Python 2 style class attribute
  `__metaclass__ = abc.ABCMeta`, which Python 3 silently ignores, so the
  `@abc.abstractmethod` decorators were never enforced. Declaring the
  metaclass via the class keyword restores the intended abstract behavior.
  """

  @abc.abstractmethod
  def _parse_train_data(self, decoded_tensors):
    """Generates images and labels that are usable for model training.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  @abc.abstractmethod
  def _parse_eval_data(self, decoded_tensors):
    """Generates images and labels that are usable for model evaluation.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  def parse_fn(self, is_training):
    """Returns a parse fn that reads and parses raw tensors from the decoder.

    Args:
      is_training: a `bool` to indicate whether it is in training mode.

    Returns:
      parse: a `callable` that takes the serialized example and generate the
        images, labels tuple where labels is a dict of Tensors that contains
        labels.
    """
    def parse(decoded_tensors):
      """Parses the serialized example data."""
      if is_training:
        return self._parse_train_data(decoded_tensors)
      else:
        return self._parse_eval_data(decoded_tensors)
    return parse

  @classmethod
  def inference_fn(cls, inputs):
    """Parses inputs for predictions.

    Args:
      inputs: A Tensor, or dictionary of Tensors.

    Returns:
      processed_inputs: An input tensor to the model.
    """
    pass
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for RetinaNet.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
# Import libraries
from absl import logging
import tensorflow as tf
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import augment
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_type=None,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               use_autoaugment=False,
               autoaugment_policy_name='v0',
               skip_crowd_during_training=True,
               max_num_instances=100,
               dtype='bfloat16',
               mode=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instances, num_scales=2 adds one additional intermediate
        anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      match_threshold: `float` number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
        during training. Deprecated; prefer `aug_type`.
      autoaugment_policy_name: `string` that specifies the name of the
        AutoAugment policy that will be used during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled with
        `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
        prediction with groundtruths in the outputs. Stored but not read
        within this class.
    """
    self._mode = mode
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold
    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    # Data augmentation with AutoAugment or RandAugment.
    self._augmenter = None
    if aug_type is not None:
      if aug_type.type == 'autoaug':
        logging.info('Using AutoAugment.')
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        logging.info('Using RandAugment.')
        self._augmenter = augment.RandAugment.build_for_detection(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError(f'Augmentation policy {aug_type.type} not supported.')
    # Deprecated. Data Augmentation with AutoAugment. Stored but not read
    # within this class.
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name
    # Data type.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: the preprocessed image tensor, cast to `self._dtype`.
      labels: a dictionary with anchor-level classification/box targets and
        weights, the anchor boxes, and `image_info`.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtrtuhs = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      for k, v in attributes.items():
        attributes[k] = tf.gather(v, indices)
    # Gets original image.
    image = data['image']
    # Apply autoaug or randaug before normalization, on the raw image.
    if self._augmenter is not None:
      image, boxes = self._augmenter.distort_with_boxes(image, boxes)
    image_shape = tf.shape(input=image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)
    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                       2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()
    # Resizes and crops boxes. image_info rows 2 and 3 hold the applied
    # scale and offset respectively.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                 image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)
    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)
    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)
    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: the preprocessed image tensor, cast to `self._dtype`.
      labels: same targets as training plus a `groundtruths` dict (original
        image space boxes, source id, etc.) for evaluation.
    """
    groundtruths = {}
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)
    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Resizes and crops image; no scale augmentation at eval time.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(self._output_size,
                                                       2**self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()
    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                 image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)
    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)
    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)
    # Sets up groundtruth data for evaluation; boxes are re-derived from the
    # raw annotations so they stay in the original image space.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        # NOTE(review): tf.shape(...) here yields a 1-D tensor; the Mask R-CNN
        # parser in this codebase uses tf.shape(...)[0] (a scalar) for the same
        # key -- confirm whether a scalar was intended here as well.
        'num_detections': tf.shape(data['groundtruth_classes']),
        'image_info': image_info,
        'boxes': box_ops.denormalize_boxes(
            data['groundtruth_boxes'], image_shape),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    if 'groundtruth_attributes' in data:
      groundtruths['attributes'] = data['groundtruth_attributes']
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
        'groundtruths': groundtruths,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for segmentation datasets."""
import tensorflow as tf
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import preprocess_ops
class Decoder(decoder.Decoder):
  """Decodes serialized tf.Example protos for the segmentation task."""

  def __init__(self):
    # Scalar feature specs. Strings default to '' and int64s to 0 so that
    # parsing does not fail on an example missing one of the keys.
    def _bytes_feature():
      return tf.io.FixedLenFeature((), tf.string, default_value='')

    def _int64_feature():
      return tf.io.FixedLenFeature((), tf.int64, default_value=0)

    self._keys_to_features = {
        'image/encoded': _bytes_feature(),
        'image/height': _int64_feature(),
        'image/width': _int64_feature(),
        'image/segmentation/class/encoded': _bytes_feature(),
    }

  def decode(self, serialized_example):
    """Parses one serialized tf.Example into a dict of dense tensors."""
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)
class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors.
  """

  def __init__(self,
               output_size,
               crop_size=None,
               resize_eval_groundtruth=True,
               groundtruth_padded_size=None,
               ignore_label=255,
               aug_rand_hflip=False,
               preserve_aspect_ratio=True,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      crop_size: `Tensor` or `list` for [height, width] of the crop. If
        specified a training crop of size crop_size is returned. This is useful
        for cropping original images during training while evaluating on
        original image sizes.
      resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are
        resized to output_size.
      groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
        resize_eval_groundtruth is set to False, the groundtruth masks are
        padded to this size.
      ignore_label: `int` the pixel with ignore label will not used for training
        and evaluation.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved,
        otherwise, the image is resized to output_size.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.

    Raises:
      ValueError: if `resize_eval_groundtruth` is False and
        `groundtruth_padded_size` is not provided.
    """
    self._output_size = output_size
    self._crop_size = crop_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      # Note the trailing space below: adjacent string literals concatenate,
      # and the original message read "...needs to bespecified...".
      raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
                       'specified when resize_eval_groundtruth is False.')
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label
    self._preserve_aspect_ratio = preserve_aspect_ratio

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # dtype.
    self._dtype = dtype

  def _prepare_image_and_label(self, data):
    """Prepare normalized image and label."""
    image = tf.io.decode_image(data['image/encoded'], channels=3)
    label = tf.io.decode_image(data['image/segmentation/class/encoded'],
                               channels=1)
    height = data['image/height']
    width = data['image/width']
    # Give the decoded tensors static shapes: image is [H, W, 3], label is
    # kept as [1, height, width] (mask on the trailing spatial axes).
    image = tf.reshape(image, (height, width, 3))
    label = tf.reshape(label, (1, height, width))
    label = tf.cast(label, tf.float32)
    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    if not self._preserve_aspect_ratio:
      # Resize directly to output_size, distorting the aspect ratio. The label
      # uses nearest-neighbor resizing so class ids are never interpolated.
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      image = tf.image.resize(image, self._output_size, method='bilinear')
      label = tf.image.resize(label, self._output_size, method='nearest')
      label = tf.reshape(label[:, :, -1], [1] + self._output_size)

    return image, label

  def _parse_train_data(self, data):
    """Parses data for training."""
    image, label = self._prepare_image_and_label(data)

    if self._crop_size:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      # If output_size is specified, resize image, and label to desired
      # output_size.
      if self._output_size:
        image = tf.image.resize(image, self._output_size, method='bilinear')
        label = tf.image.resize(label, self._output_size, method='nearest')

      # Concatenate image and label along channels so a single random_crop
      # keeps them spatially aligned; the extra [4] covers 3 image channels
      # plus 1 label channel.
      image_mask = tf.concat([image, label], axis=2)
      image_mask_crop = tf.image.random_crop(image_mask,
                                             self._crop_size + [4])
      image = image_mask_crop[:, :, :-1]
      label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, _, label = preprocess_ops.random_horizontal_flip(
          image, masks=label)

    train_image_size = self._crop_size if self._crop_size else self._output_size
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        train_image_size,
        train_image_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Pad label and make sure the padded region assigned to the ignore label.
    # The label is first offset by +1 and then padded with 0, so after the -1
    # shift below, genuinely padded pixels are exactly -1.
    label += 1
    label = tf.expand_dims(label, axis=3)
    label = preprocess_ops.resize_and_crop_masks(
        label, image_scale, train_image_size, offset)
    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    image, label = self._prepare_image_and_label(data)
    # The label is first offset by +1 and then padded with 0 (see
    # _parse_train_data for the matching -1 shift / ignore-label mapping).
    label += 1
    label = tf.expand_dims(label, axis=3)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image, self._output_size, self._output_size)

    if self._resize_eval_groundtruth:
      # Resizes eval masks to match input image sizes. In that case, mean IoU
      # is computed on output_size not the original size of the images.
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                   self._output_size, offset)
    else:
      # Keep the original-resolution groundtruth and pad it to a fixed size;
      # padded pixels become -1 below and map to the ignore label.
      label = tf.image.pad_to_bounding_box(
          label, 0, 0, self._groundtruth_padded_size[0],
          self._groundtruth_padded_size[1])

    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)

    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import tensorflow as tf
from official.vision.dataloaders import decoder
def _generate_source_id(image_bytes):
  """Derives a deterministic string id by hashing the encoded image bytes.

  The bucket count is capped at 2**22 - 1 because float32 carries only 23
  mantissa bits, so ids stay exactly representable if they are ever
  round-tripped through float32.
  """
  num_buckets = 2 ** 22 - 1
  bucket = tf.strings.to_hash_bucket_fast(image_bytes, num_buckets)
  return tf.strings.as_string(bucket)
class TfExampleDecoder(decoder.Decoder):
  """Tensorflow Example proto decoder."""

  def __init__(self,
               include_mask=False,
               regenerate_source_id=False,
               mask_binarize_threshold=None):
    """Initializes the decoder.

    Args:
      include_mask: `bool`, if True, also parse and decode per-instance PNG
        segmentation masks from 'image/object/mask'.
      regenerate_source_id: `bool`, if True, always derive `source_id` by
        hashing the encoded image bytes instead of reading
        'image/source_id' from the example.
      mask_binarize_threshold: optional `float`; when set, decoded masks are
        binarized to {0.0, 1.0} using this threshold.
    """
    self._include_mask = include_mask
    self._regenerate_source_id = regenerate_source_id
    # Features expected in every detection tf.Example. Per-object fields are
    # variable-length (one entry per object in the image).
    self._keys_to_features = {
        'image/encoded': tf.io.FixedLenFeature((), tf.string),
        'image/height': tf.io.FixedLenFeature((), tf.int64),
        'image/width': tf.io.FixedLenFeature((), tf.int64),
        'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
        'image/object/class/label': tf.io.VarLenFeature(tf.int64),
        'image/object/area': tf.io.VarLenFeature(tf.float32),
        'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
    }
    self._mask_binarize_threshold = mask_binarize_threshold
    if include_mask:
      self._keys_to_features.update({
          'image/object/mask': tf.io.VarLenFeature(tf.string),
      })
    if not regenerate_source_id:
      # Only request 'image/source_id' when it may be used; it has no default
      # value, so parsing would fail on examples missing the key.
      self._keys_to_features.update({
          'image/source_id': tf.io.FixedLenFeature((), tf.string),
      })

  def _decode_image(self, parsed_tensors):
    """Decodes the image and set its static shape."""
    image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
    image.set_shape([None, None, 3])
    return image

  def _decode_boxes(self, parsed_tensors):
    """Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    return tf.stack([ymin, xmin, ymax, xmax], axis=-1)

  def _decode_classes(self, parsed_tensors):
    """Returns the per-object class labels unchanged."""
    return parsed_tensors['image/object/class/label']

  def _decode_areas(self, parsed_tensors):
    """Returns object areas, computing them from the boxes when absent."""
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32)
    width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32)
    # Prefer the stored areas when any are present; otherwise derive them from
    # the box corners scaled by the image size (the height * width scaling
    # implies box coordinates are normalized to [0, 1] — assumption, confirm
    # against the dataset format).
    return tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
        lambda: parsed_tensors['image/object/area'],
        lambda: (xmax - xmin) * (ymax - ymin) * height * width)

  def _decode_masks(self, parsed_tensors):
    """Decode a set of PNG masks to the tf.float32 tensors."""
    def _decode_png_mask(png_bytes):
      # One PNG per object; squeeze the trailing channel axis to get [H, W].
      mask = tf.squeeze(
          tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
      mask = tf.cast(mask, dtype=tf.float32)
      mask.set_shape([None, None])
      return mask

    height = parsed_tensors['image/height']
    width = parsed_tensors['image/width']
    masks = parsed_tensors['image/object/mask']
    # With zero objects, return an empty [0, height, width] tensor instead of
    # mapping the decoder over an empty string tensor.
    return tf.cond(
        pred=tf.greater(tf.size(input=masks), 0),
        true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
        false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))

  def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
      serialized_example: a single serialized tf.Example string.

    Returns:
      decoded_tensors: a dictionary of tensors with the following fields:
        - source_id: a string scalar tensor.
        - image: a uint8 tensor of shape [None, None, 3].
        - height: an integer scalar tensor.
        - width: an integer scalar tensor.
        - groundtruth_classes: a int64 tensor of shape [None].
        - groundtruth_is_crowd: a bool tensor of shape [None].
        - groundtruth_area: a float32 tensor of shape [None].
        - groundtruth_boxes: a float32 tensor of shape [None, 4].
        - groundtruth_instance_masks: a float32 tensor of shape
            [None, None, None].
        - groundtruth_instance_masks_png: a string tensor of shape [None].
    """
    parsed_tensors = tf.io.parse_single_example(
        serialized=serialized_example, features=self._keys_to_features)
    # Densify every VarLen feature: '' for string tensors, 0 for numeric ones.
    for k in parsed_tensors:
      if isinstance(parsed_tensors[k], tf.SparseTensor):
        if parsed_tensors[k].dtype == tf.string:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value='')
        else:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value=0)

    if self._regenerate_source_id:
      source_id = _generate_source_id(parsed_tensors['image/encoded'])
    else:
      # Fall back to a hashed id when the stored source_id is an empty string.
      source_id = tf.cond(
          tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0),
          lambda: parsed_tensors['image/source_id'],
          lambda: _generate_source_id(parsed_tensors['image/encoded']))
    image = self._decode_image(parsed_tensors)
    boxes = self._decode_boxes(parsed_tensors)
    classes = self._decode_classes(parsed_tensors)
    areas = self._decode_areas(parsed_tensors)

    # Default is_crowd to all-False when the feature is absent or empty.
    is_crowds = tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
        lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
        lambda: tf.zeros_like(classes, dtype=tf.bool))
    if self._include_mask:
      masks = self._decode_masks(parsed_tensors)
      if self._mask_binarize_threshold is not None:
        masks = tf.cast(masks > self._mask_binarize_threshold, tf.float32)

    decoded_tensors = {
        'source_id': source_id,
        'image': image,
        'height': parsed_tensors['image/height'],
        'width': parsed_tensors['image/width'],
        'groundtruth_classes': classes,
        'groundtruth_is_crowd': is_crowds,
        'groundtruth_area': areas,
        'groundtruth_boxes': boxes,
    }
    if self._include_mask:
      decoded_tensors.update({
          'groundtruth_instance_masks': masks,
          'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
      })
    return decoded_tensors
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment