Unverified commit 0225b135 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 7479dbb8 4c571a3c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Factory for getting TF-Vision input readers."""
from official.common import dataset_fn as dataset_fn_util
from official.core import config_definitions as cfg
from official.core import input_reader as core_input_reader
from official.vision.dataloaders import input_reader as vision_input_reader
def input_reader_generator(params: cfg.DataConfig,
**kwargs) -> core_input_reader.InputReader:
"""Instantiates an input reader class according to the params.
Args:
params: A config_definitions.DataConfig object.
**kwargs: Additional arguments passed to input reader initialization.
Returns:
An InputReader object.
"""
if params.is_training and params.get('pseudo_label_data', False):
return vision_input_reader.CombinationDatasetInputReader(
params,
pseudo_label_dataset_fn=dataset_fn_util.pick_dataset_fn(
params.pseudo_label_data.file_type),
**kwargs)
else:
return core_input_reader.InputReader(params, **kwargs)
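# Example usage (a minimal sketch; `my_params` is assumed to be a fully
# populated `cfg.DataConfig`, and `my_decoder`/`my_parser` are hypothetical
# placeholders defined elsewhere):
#
#   import tensorflow as tf
#
#   reader = input_reader_generator(
#       my_params,
#       dataset_fn=tf.data.TFRecordDataset,
#       decoder_fn=my_decoder.decode,
#       parser_fn=my_parser.parse_fn(my_params.is_training))
#   dataset = reader.read()  # A tf.data.Dataset yielding (image, labels).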
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Mask R-CNN."""
# Import libraries
import tensorflow as tf
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
rpn_match_threshold=0.7,
rpn_unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5,
aug_rand_hflip=False,
aug_scale_min=1.0,
aug_scale_max=1.0,
skip_crowd_during_training=True,
max_num_instances=100,
include_mask=False,
mask_crop_size=112,
dtype='float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divisible by the largest feature stride 2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scale [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect ratio
anchors added on each level. The number indicates the ratio of width to
height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: `float` number representing the scale of size of the base
anchor to the feature stride 2^level.
rpn_match_threshold: `float` number between 0 and 1, the IoU threshold at
or above which an anchor is assigned a positive RPN label.
rpn_unmatched_threshold: `float` number between 0 and 1, the IoU threshold
below which an anchor is assigned a negative RPN label.
rpn_batch_size_per_im: `int` number of anchors sampled per image for
computing the RPN loss.
rpn_fg_fraction: `float` fraction of the sampled anchors that are labeled
as foreground (positive).
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
skip_crowd_during_training: `bool`, if True, skip annotations where
`is_crowd` equals 1.
max_num_instances: `int` number of maximum number of instances in an
image. The groundtruth data will be padded to `max_num_instances`.
include_mask: a bool indicating whether to parse the mask groundtruth.
mask_crop_size: the size to which the groundtruth mask is cropped.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
"""
self._max_num_instances = max_num_instances
self._skip_crowd_during_training = skip_crowd_during_training
# Anchor.
self._output_size = output_size
self._min_level = min_level
self._max_level = max_level
self._num_scales = num_scales
self._aspect_ratios = aspect_ratios
self._anchor_size = anchor_size
# Target assigning.
self._rpn_match_threshold = rpn_match_threshold
self._rpn_unmatched_threshold = rpn_unmatched_threshold
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._rpn_fg_fraction = rpn_fg_fraction
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
self._aug_scale_min = aug_scale_min
self._aug_scale_max = aug_scale_max
# Mask.
self._include_mask = include_mask
self._mask_crop_size = mask_crop_size
# Image output dtype.
self._dtype = dtype
def _parse_train_data(self, data):
"""Parses data for training.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
image: image tensor that is preprocessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for training. The following describes
{key: value} pairs in the dictionary.
image_info: a 2D `Tensor` that encodes the information of the image and
the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width],
[y_scale, x_scale], [y_offset, x_offset]].
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, 4] representing anchor boxes at each level.
rpn_score_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
rpn_box_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, anchors_per_location * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
gt_boxes: Groundtruth bounding box annotations. The box is represented
in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
image that is fed to the network. The tensor is padded with -1 to
the fixed dimension [self._max_num_instances, 4].
gt_classes: Groundtruth classes annotations. The tensor is padded
with -1 to the fixed dimension [self._max_num_instances].
gt_masks: Groundtruth masks cropped by the bounding box and
resized to a fixed size determined by mask_crop_size.
"""
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
if self._include_mask:
masks = data['groundtruth_instance_masks']
is_crowds = data['groundtruth_is_crowd']
# Skips annotations with `is_crowd` = True.
if self._skip_crowd_during_training:
num_groundtruths = tf.shape(classes)[0]
with tf.control_dependencies([num_groundtruths, is_crowds]):
indices = tf.cond(
tf.greater(tf.size(is_crowds), 0),
lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
classes = tf.gather(classes, indices)
boxes = tf.gather(boxes, indices)
if self._include_mask:
masks = tf.gather(masks, indices)
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Flips image randomly during training.
if self._aug_rand_hflip:
if self._include_mask:
image, boxes, masks = preprocess_ops.random_horizontal_flip(
image, boxes, masks)
else:
image, boxes, _ = preprocess_ops.random_horizontal_flip(
image, boxes)
# Converts boxes from normalized coordinates to pixel coordinates.
# Now the coordinates of boxes are w.r.t. the original image.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(
self._output_size, 2 ** self._max_level),
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
# Now the coordinates of boxes are w.r.t the scaled image.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(
boxes, image_scale, image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
if self._include_mask:
masks = tf.gather(masks, indices)
# Transfers boxes to the original image space and normalizes them.
cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
num_masks = tf.shape(masks)[0]
masks = tf.image.crop_and_resize(
tf.expand_dims(masks, axis=-1),
cropped_boxes,
box_indices=tf.range(num_masks, dtype=tf.int32),
crop_size=[self._mask_crop_size, self._mask_crop_size],
method='bilinear')
masks = tf.squeeze(masks, axis=-1)
# Assigns anchor targets.
# Note that after the target assignment, box targets are absolute pixel
# offsets w.r.t. the scaled image.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.RpnAnchorLabeler(
self._rpn_match_threshold,
self._rpn_unmatched_threshold,
self._rpn_batch_size_per_im,
self._rpn_fg_fraction)
rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
anchor_boxes, boxes,
tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))
# Casts input image to self._dtype
image = tf.cast(image, dtype=self._dtype)
# Packs labels for model_fn outputs.
labels = {
'anchor_boxes':
anchor_boxes,
'image_info':
image_info,
'rpn_score_targets':
rpn_score_targets,
'rpn_box_targets':
rpn_box_targets,
'gt_boxes':
preprocess_ops.clip_or_pad_to_fixed_size(boxes,
self._max_num_instances,
-1),
'gt_classes':
preprocess_ops.clip_or_pad_to_fixed_size(classes,
self._max_num_instances,
-1),
}
if self._include_mask:
labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
masks, self._max_num_instances, -1)
return image, labels
def _parse_eval_data(self, data):
"""Parses data for evaluation.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
A tuple of (image, labels) where
image: image tensor that is preprocessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for evaluation. The following
describes {key: value} pairs in the dictionary.
source_ids: Source image id. Default value -1 if the source id is
empty in the groundtruth annotation.
image_info: a 2D `Tensor` that encodes the information of the image
and the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width],
[y_scale, x_scale], [y_offset, x_offset]].
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, 4] representing anchor boxes at each
level.
"""
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(
self._output_size, 2 ** self._max_level),
aug_scale_min=1.0,
aug_scale_max=1.0)
image_height, image_width, _ = image.get_shape().as_list()
# Casts input image to self._dtype
image = tf.cast(image, dtype=self._dtype)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape)
# Computes anchor boxes.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
labels = {
'image_info': image_info,
'anchor_boxes': anchor_boxes,
}
groundtruths = {
'source_id': data['source_id'],
'height': data['height'],
'width': data['width'],
'num_detections': tf.shape(data['groundtruth_classes'])[0],
'boxes': boxes,
'classes': data['groundtruth_classes'],
'areas': data['groundtruth_area'],
'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
}
groundtruths['source_id'] = utils.process_source_id(
groundtruths['source_id'])
groundtruths = utils.pad_groundtruths_to_fixed_size(
groundtruths, self._max_num_instances)
labels['groundtruths'] = groundtruths
return image, labels
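# Example pipeline (a sketch, not part of this module; the record path, the
# import of `tf_example_decoder`, and the hyperparameters below are
# illustrative assumptions):
#
#   from official.vision.dataloaders import tf_example_decoder
#   decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
#   train_parser = Parser(
#       output_size=[1024, 1024], min_level=2, max_level=6, num_scales=1,
#       aspect_ratios=[0.5, 1.0, 2.0], anchor_size=8.0, include_mask=True)
#   dataset = (tf.data.TFRecordDataset(['/path/to/train.tfrecord'])
#              .map(decoder.decode)
#              .map(train_parser.parse_fn(is_training=True)))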
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The generic parser interface."""
import abc
class Parser(metaclass=abc.ABCMeta):
"""Parses data and produces tensors to be consumed by models."""
@abc.abstractmethod
def _parse_train_data(self, decoded_tensors):
"""Generates images and labels that are usable for model training.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
@abc.abstractmethod
def _parse_eval_data(self, decoded_tensors):
"""Generates images and labels that are usable for model evaluation.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
def parse_fn(self, is_training):
"""Returns a parse fn that reads and parses raw tensors from the decoder.
Args:
is_training: a `bool` to indicate whether it is in training mode.
Returns:
parse: a `callable` that takes the decoded tensors and generates the
(images, labels) tuple, where labels is a dict of Tensors that contains
labels.
"""
def parse(decoded_tensors):
"""Parses the serialized example data."""
if is_training:
return self._parse_train_data(decoded_tensors)
else:
return self._parse_eval_data(decoded_tensors)
return parse
@classmethod
def inference_fn(cls, inputs):
"""Parses inputs for predictions.
Args:
inputs: A Tensor, or dictionary of Tensors.
Returns:
processed_inputs: An input tensor to the model.
"""
pass
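# A minimal concrete subclass (a sketch; the tensor keys and the use of
# TensorFlow below are illustrative assumptions, not part of this interface):
#
#   import tensorflow as tf
#
#   class MyParser(Parser):
#
#     def _parse_train_data(self, decoded_tensors):
#       image = tf.image.convert_image_dtype(decoded_tensors['image'],
#                                            tf.float32)
#       return image, {'classes': decoded_tensors['label']}
#
#     def _parse_eval_data(self, decoded_tensors):
#       return self._parse_train_data(decoded_tensors)
#
#   # `parse_fn(is_training)` dispatches to the matching method:
#   # dataset = dataset.map(MyParser().parse_fn(is_training=True))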
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for RetinaNet.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
# Import libraries
from absl import logging
import tensorflow as tf
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import augment
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
match_threshold=0.5,
unmatched_threshold=0.5,
aug_type=None,
aug_rand_hflip=False,
aug_scale_min=1.0,
aug_scale_max=1.0,
use_autoaugment=False,
autoaugment_policy_name='v0',
skip_crowd_during_training=True,
max_num_instances=100,
dtype='bfloat16',
mode=None):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divisible by the largest feature stride 2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added on each
level. For instance, num_scales=2 adds one additional intermediate
anchor scale [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect ratio
anchors added on each level. The number indicates the ratio of width to
height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: `float` number representing the scale of size of the base
anchor to the feature stride 2^level.
match_threshold: `float` number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
aug_type: An optional augmentation config object for choosing between
AutoAugment and RandAugment.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
during training.
autoaugment_policy_name: `string` that specifies the name of the
AutoAugment policy that will be used during training.
skip_crowd_during_training: `bool`, if True, skip annotations where
`is_crowd` equals 1.
max_num_instances: `int` number of maximum number of instances in an
image. The groundtruth data will be padded to `max_num_instances`.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with groundtruths in the outputs.
"""
self._mode = mode
self._max_num_instances = max_num_instances
self._skip_crowd_during_training = skip_crowd_during_training
# Anchor.
self._output_size = output_size
self._min_level = min_level
self._max_level = max_level
self._num_scales = num_scales
self._aspect_ratios = aspect_ratios
self._anchor_size = anchor_size
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
self._aug_scale_min = aug_scale_min
self._aug_scale_max = aug_scale_max
# Data augmentation with AutoAugment or RandAugment.
self._augmenter = None
if aug_type is not None:
if aug_type.type == 'autoaug':
logging.info('Using AutoAugment.')
self._augmenter = augment.AutoAugment(
augmentation_name=aug_type.autoaug.augmentation_name,
cutout_const=aug_type.autoaug.cutout_const,
translate_const=aug_type.autoaug.translate_const)
elif aug_type.type == 'randaug':
logging.info('Using RandAugment.')
self._augmenter = augment.RandAugment.build_for_detection(
num_layers=aug_type.randaug.num_layers,
magnitude=aug_type.randaug.magnitude,
cutout_const=aug_type.randaug.cutout_const,
translate_const=aug_type.randaug.translate_const,
prob_to_apply=aug_type.randaug.prob_to_apply,
exclude_ops=aug_type.randaug.exclude_ops)
else:
raise ValueError(f'Augmentation policy {aug_type.type} not supported.')
# Deprecated. Data Augmentation with AutoAugment.
self._use_autoaugment = use_autoaugment
self._autoaugment_policy_name = autoaugment_policy_name
# Data type.
self._dtype = dtype
def _parse_train_data(self, data):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
# If not empty, `attributes` is a dict of (name, ground_truth) pairs.
# `ground_truth` of attributes is assumed to be of shape [N, attribute_size].
# TODO(xianzhi): support parsing attributes weights.
attributes = data.get('groundtruth_attributes', {})
is_crowds = data['groundtruth_is_crowd']
# Skips annotations with `is_crowd` = True.
if self._skip_crowd_during_training:
num_groundtruths = tf.shape(input=classes)[0]
with tf.control_dependencies([num_groundtruths, is_crowds]):
indices = tf.cond(
pred=tf.greater(tf.size(input=is_crowds), 0),
true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
classes = tf.gather(classes, indices)
boxes = tf.gather(boxes, indices)
for k, v in attributes.items():
attributes[k] = tf.gather(v, indices)
# Gets original image.
image = data['image']
# Applies AutoAugment or RandAugment if configured.
if self._augmenter is not None:
image, boxes = self._augmenter.distort_with_boxes(image, boxes)
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Flips image randomly during training.
if self._aug_rand_hflip:
image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
for k, v in attributes.items():
attributes[k] = tf.gather(v, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, att_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)
# Casts input image to desired data type.
image = tf.cast(image, dtype=self._dtype)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
}
if att_targets:
labels['attribute_targets'] = att_targets
return image, labels
def _parse_eval_data(self, data):
"""Parses data for training and evaluation."""
groundtruths = {}
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
# If not empty, `attributes` is a dict of (name, ground_truth) pairs.
# `ground_truth` of attributes is assumed to be of shape [N, attribute_size].
# TODO(xianzhi): support parsing attributes weights.
attributes = data.get('groundtruth_attributes', {})
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=1.0,
aug_scale_max=1.0)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
for k, v in attributes.items():
attributes[k] = tf.gather(v, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, att_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)
# Casts input image to desired data type.
image = tf.cast(image, dtype=self._dtype)
# Sets up groundtruth data for evaluation.
groundtruths = {
'source_id': data['source_id'],
'height': data['height'],
'width': data['width'],
'num_detections': tf.shape(data['groundtruth_classes'])[0],
'image_info': image_info,
'boxes': box_ops.denormalize_boxes(
data['groundtruth_boxes'], image_shape),
'classes': data['groundtruth_classes'],
'areas': data['groundtruth_area'],
'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
}
if 'groundtruth_attributes' in data:
groundtruths['attributes'] = data['groundtruth_attributes']
groundtruths['source_id'] = utils.process_source_id(
groundtruths['source_id'])
groundtruths = utils.pad_groundtruths_to_fixed_size(
groundtruths, self._max_num_instances)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
'groundtruths': groundtruths,
}
if att_targets:
labels['attribute_targets'] = att_targets
return image, labels
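# Example construction with RandAugment enabled (a sketch; `my_randaug_config`
# is a hypothetical config object exposing the fields read in __init__ above,
# with `my_randaug_config.type == 'randaug'`):
#
#   train_parser = Parser(
#       output_size=[640, 640], min_level=3, max_level=7, num_scales=3,
#       aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0,
#       aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.2,
#       aug_type=my_randaug_config)
#   train_fn = train_parser.parse_fn(is_training=True)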
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for segmentation datasets."""
import tensorflow as tf
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import preprocess_ops
class Decoder(decoder.Decoder):
"""A tf.Example decoder for segmentation task."""
def __init__(self):
self._keys_to_features = {
'image/encoded': tf.io.FixedLenFeature((), tf.string, default_value=''),
'image/height': tf.io.FixedLenFeature((), tf.int64, default_value=0),
'image/width': tf.io.FixedLenFeature((), tf.int64, default_value=0),
'image/segmentation/class/encoded':
tf.io.FixedLenFeature((), tf.string, default_value='')
}
def decode(self, serialized_example):
return tf.io.parse_single_example(
serialized_example, self._keys_to_features)
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors.
"""
def __init__(self,
output_size,
crop_size=None,
resize_eval_groundtruth=True,
groundtruth_padded_size=None,
ignore_label=255,
aug_rand_hflip=False,
preserve_aspect_ratio=True,
aug_scale_min=1.0,
aug_scale_max=1.0,
dtype='float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divisible by the largest feature stride 2^max_level.
crop_size: `Tensor` or `list` for [height, width] of the crop. If
specified a training crop of size crop_size is returned. This is useful
for cropping original images during training while evaluating on
original image sizes.
resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are
resized to output_size.
groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
resize_eval_groundtruth is set to False, the groundtruth masks are
padded to this size.
ignore_label: `int`, pixels with this label are not used for training
and evaluation.
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved,
otherwise, the image is resized to output_size.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
"""
self._output_size = output_size
self._crop_size = crop_size
self._resize_eval_groundtruth = resize_eval_groundtruth
if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
'specified when resize_eval_groundtruth is False.')
self._groundtruth_padded_size = groundtruth_padded_size
self._ignore_label = ignore_label
self._preserve_aspect_ratio = preserve_aspect_ratio
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
self._aug_scale_min = aug_scale_min
self._aug_scale_max = aug_scale_max
# dtype.
self._dtype = dtype
def _prepare_image_and_label(self, data):
"""Prepare normalized image and label."""
image = tf.io.decode_image(data['image/encoded'], channels=3)
label = tf.io.decode_image(data['image/segmentation/class/encoded'],
channels=1)
height = data['image/height']
width = data['image/width']
image = tf.reshape(image, (height, width, 3))
label = tf.reshape(label, (1, height, width))
label = tf.cast(label, tf.float32)
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
if not self._preserve_aspect_ratio:
label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
image = tf.image.resize(image, self._output_size, method='bilinear')
label = tf.image.resize(label, self._output_size, method='nearest')
label = tf.reshape(label[:, :, -1], [1] + self._output_size)
return image, label
def _parse_train_data(self, data):
"""Parses data for training and evaluation."""
image, label = self._prepare_image_and_label(data)
if self._crop_size:
label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
# If output_size is specified, resize image, and label to desired
# output_size.
if self._output_size:
image = tf.image.resize(image, self._output_size, method='bilinear')
label = tf.image.resize(label, self._output_size, method='nearest')
image_mask = tf.concat([image, label], axis=2)
image_mask_crop = tf.image.random_crop(image_mask,
self._crop_size + [4])
image = image_mask_crop[:, :, :-1]
label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)
# Flips image randomly during training.
if self._aug_rand_hflip:
image, _, label = preprocess_ops.random_horizontal_flip(
image, masks=label)
train_image_size = self._crop_size if self._crop_size else self._output_size
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
train_image_size,
train_image_size,
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
# Pad label and make sure the padded region is assigned the ignore label.
# The label is first offset by +1 and then padded with 0.
label += 1
label = tf.expand_dims(label, axis=3)
label = preprocess_ops.resize_and_crop_masks(
label, image_scale, train_image_size, offset)
label -= 1
label = tf.where(tf.equal(label, -1),
self._ignore_label * tf.ones_like(label), label)
label = tf.squeeze(label, axis=0)
valid_mask = tf.not_equal(label, self._ignore_label)
labels = {
'masks': label,
'valid_masks': valid_mask,
'image_info': image_info,
}
# Casts image to self._dtype.
image = tf.cast(image, dtype=self._dtype)
return image, labels
def _parse_eval_data(self, data):
"""Parses data for training and evaluation."""
image, label = self._prepare_image_and_label(data)
# The label is first offset by +1 and then padded with 0.
label += 1
label = tf.expand_dims(label, axis=3)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image, self._output_size, self._output_size)
if self._resize_eval_groundtruth:
# Resizes eval masks to match input image sizes. In that case, mean IoU
# is computed on output_size not the original size of the images.
image_scale = image_info[2, :]
offset = image_info[3, :]
label = preprocess_ops.resize_and_crop_masks(label, image_scale,
self._output_size, offset)
else:
label = tf.image.pad_to_bounding_box(
label, 0, 0, self._groundtruth_padded_size[0],
self._groundtruth_padded_size[1])
label -= 1
label = tf.where(tf.equal(label, -1),
self._ignore_label * tf.ones_like(label), label)
label = tf.squeeze(label, axis=0)
valid_mask = tf.not_equal(label, self._ignore_label)
labels = {
'masks': label,
'valid_masks': valid_mask,
'image_info': image_info
}
# Casts image to self._dtype.
image = tf.cast(image, dtype=self._dtype)
return image, labels
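# Example wiring of the segmentation Decoder and Parser (a sketch; the record
# path is hypothetical):
#
#   decoder = Decoder()
#   seg_parser = Parser(
#       output_size=[512, 512], crop_size=[512, 512], ignore_label=255,
#       aug_rand_hflip=True, aug_scale_min=0.5, aug_scale_max=2.0)
#   dataset = (tf.data.TFRecordDataset(['/path/to/seg.tfrecord'])
#              .map(decoder.decode)
#              .map(seg_parser.parse_fn(is_training=True)))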
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import tensorflow as tf
from official.vision.dataloaders import decoder
def _generate_source_id(image_bytes):
# Hashing using 22 bits since float32 has only 23 mantissa bits.
return tf.strings.as_string(
tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 22 - 1))
class TfExampleDecoder(decoder.Decoder):
"""Tensorflow Example proto decoder."""
def __init__(self,
include_mask=False,
regenerate_source_id=False,
mask_binarize_threshold=None):
self._include_mask = include_mask
self._regenerate_source_id = regenerate_source_id
self._keys_to_features = {
'image/encoded': tf.io.FixedLenFeature((), tf.string),
'image/height': tf.io.FixedLenFeature((), tf.int64),
'image/width': tf.io.FixedLenFeature((), tf.int64),
'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
'image/object/class/label': tf.io.VarLenFeature(tf.int64),
'image/object/area': tf.io.VarLenFeature(tf.float32),
'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
}
self._mask_binarize_threshold = mask_binarize_threshold
if include_mask:
self._keys_to_features.update({
'image/object/mask': tf.io.VarLenFeature(tf.string),
})
if not regenerate_source_id:
self._keys_to_features.update({
'image/source_id': tf.io.FixedLenFeature((), tf.string),
})
def _decode_image(self, parsed_tensors):
"""Decodes the image and set its static shape."""
image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
image.set_shape([None, None, 3])
return image
def _decode_boxes(self, parsed_tensors):
"""Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
return tf.stack([ymin, xmin, ymax, xmax], axis=-1)
def _decode_classes(self, parsed_tensors):
return parsed_tensors['image/object/class/label']
def _decode_areas(self, parsed_tensors):
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32)
width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32)
return tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
lambda: parsed_tensors['image/object/area'],
lambda: (xmax - xmin) * (ymax - ymin) * height * width)
def _decode_masks(self, parsed_tensors):
"""Decode a set of PNG masks to the tf.float32 tensors."""
def _decode_png_mask(png_bytes):
mask = tf.squeeze(
tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
mask = tf.cast(mask, dtype=tf.float32)
mask.set_shape([None, None])
return mask
height = parsed_tensors['image/height']
width = parsed_tensors['image/width']
masks = parsed_tensors['image/object/mask']
return tf.cond(
pred=tf.greater(tf.size(input=masks), 0),
true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))
def decode(self, serialized_example):
"""Decode the serialized example.
Args:
serialized_example: a single serialized tf.Example string.
Returns:
decoded_tensors: a dictionary of tensors with the following fields:
- source_id: a string scalar tensor.
- image: a uint8 tensor of shape [None, None, 3].
- height: an integer scalar tensor.
- width: an integer scalar tensor.
- groundtruth_classes: an int64 tensor of shape [None].
- groundtruth_is_crowd: a bool tensor of shape [None].
- groundtruth_area: a float32 tensor of shape [None].
- groundtruth_boxes: a float32 tensor of shape [None, 4].
- groundtruth_instance_masks: a float32 tensor of shape
[None, None, None].
- groundtruth_instance_masks_png: a string tensor of shape [None].
"""
parsed_tensors = tf.io.parse_single_example(
serialized=serialized_example, features=self._keys_to_features)
for k in parsed_tensors:
if isinstance(parsed_tensors[k], tf.SparseTensor):
if parsed_tensors[k].dtype == tf.string:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value='')
else:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value=0)
if self._regenerate_source_id:
source_id = _generate_source_id(parsed_tensors['image/encoded'])
else:
source_id = tf.cond(
tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0),
lambda: parsed_tensors['image/source_id'],
lambda: _generate_source_id(parsed_tensors['image/encoded']))
image = self._decode_image(parsed_tensors)
boxes = self._decode_boxes(parsed_tensors)
classes = self._decode_classes(parsed_tensors)
areas = self._decode_areas(parsed_tensors)
is_crowds = tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
lambda: tf.zeros_like(classes, dtype=tf.bool))
if self._include_mask:
masks = self._decode_masks(parsed_tensors)
if self._mask_binarize_threshold is not None:
masks = tf.cast(masks > self._mask_binarize_threshold, tf.float32)
decoded_tensors = {
'source_id': source_id,
'image': image,
'height': parsed_tensors['image/height'],
'width': parsed_tensors['image/width'],
'groundtruth_classes': classes,
'groundtruth_is_crowd': is_crowds,
'groundtruth_area': areas,
'groundtruth_boxes': boxes,
}
if self._include_mask:
decoded_tensors.update({
'groundtruth_instance_masks': masks,
'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
})
return decoded_tensors
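# Example usage (a sketch; `serialized` is assumed to be a serialized
# tf.train.Example carrying the fields listed in `decode` above):
#
#   decoder = TfExampleDecoder(include_mask=True, mask_binarize_threshold=0.5)
#   decoded = decoder.decode(tf.convert_to_tensor(serialized))
#   # decoded['groundtruth_boxes'] is [N, 4] in normalized
#   # [ymin, xmin, ymax, xmax] order; decoded['groundtruth_instance_masks']
#   # is [N, height, width] float32, binarized at 0.5 here.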
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tf_example_decoder.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.dataloaders import tf_example_decoder
from official.vision.dataloaders import tfexample_utils
class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
(100, 100, 0, True),
(100, 100, 1, True),
(100, 100, 2, True),
(100, 100, 0, False),
(100, 100, 1, False),
(100, 100, 2, False),
)
def test_result_shape(self,
image_height,
image_width,
num_instances,
regenerate_source_id):
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=True, regenerate_source_id=regenerate_source_id)
serialized_example = tfexample_utils.create_detection_test_example(
image_height=image_height,
image_width=image_width,
image_channel=3,
num_instances=num_instances).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
if not regenerate_source_id:
self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [
tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
for m in list(mask_content)
]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[tfexample_utils.DUMP_SOURCE_ID]))),
'image/height': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/is_crowd': (tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
def test_handling_missing_fields(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [
tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
for m in list(mask_content)
]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[tfexample_utils.DUMP_SOURCE_ID]))),
'image/height': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/mask': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[False, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import csv
# Import libraries
import tensorflow as tf
from official.vision.dataloaders import tf_example_decoder
class TfExampleDecoderLabelMap(tf_example_decoder.TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, label_map, include_mask=False, regenerate_source_id=False,
mask_binarize_threshold=None):
super(TfExampleDecoderLabelMap, self).__init__(
include_mask=include_mask, regenerate_source_id=regenerate_source_id,
mask_binarize_threshold=mask_binarize_threshold)
self._keys_to_features.update({
'image/object/class/text': tf.io.VarLenFeature(tf.string),
})
name_to_id = self._process_label_map(label_map)
self._name_to_id_table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(
keys=tf.constant(list(name_to_id.keys()), dtype=tf.string),
values=tf.constant(list(name_to_id.values()), dtype=tf.int64)),
default_value=-1)
def _process_label_map(self, label_map):
if label_map.endswith('.csv'):
name_to_id = self._process_csv(label_map)
else:
raise ValueError('The label map file is in an incorrect format.')
return name_to_id
def _process_csv(self, label_map):
name_to_id = {}
with tf.io.gfile.GFile(label_map, 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if len(row) != 2:
raise ValueError('Each row of the csv label map file must be in '
'`id,name` format. length = {}'.format(len(row)))
id_index = int(row[0])
name = row[1]
name_to_id[name] = id_index
return name_to_id
def _decode_classes(self, parsed_tensors):
return self._name_to_id_table.lookup(
parsed_tensors['image/object/class/text'])
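# Example usage (a sketch; the CSV path and its contents are hypothetical):
#
#   # /tmp/label_map.csv:
#   #   0,background
#   #   1,person
#   decoder = TfExampleDecoderLabelMap('/tmp/label_map.csv', include_mask=False)
#   decoded = decoder.decode(serialized_example)
#   # decoded['groundtruth_classes'] holds ids looked up from the class name
#   # strings; names missing from the label map get the default value -1.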
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tf_example_label_map_decoder.py."""
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.dataloaders import tf_example_label_map_decoder
from official.vision.dataloaders import tfexample_utils
LABEL_MAP_CSV_CONTENT = '0,class_0\n1,class_1\n2,class_2'
class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
(100, 100, 0),
(100, 100, 1),
(100, 100, 2),
)
def test_result_shape(self, image_height, image_width, num_instances):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
serialized_example = tfexample_utils.create_detection_test_example(
image_height=image_height,
image_width=image_width,
image_channel=3,
num_instances=num_instances).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [b'class_2', b'class_0']
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [
tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
for m in list(mask_content)
]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[tfexample_utils.DUMP_SOURCE_ID]))),
'image/height': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/text': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels))),
'image/object/is_crowd': (tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[2, 0], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS Classification decoders."""
import tensorflow as tf
from official.vision.dataloaders import decoder
class ClassificationDecoder(decoder.Decoder):
"""A tf.Example decoder for tfds classification datasets."""
def decode(self, serialized_example):
sample_dict = {
'image/encoded':
tf.io.encode_jpeg(serialized_example['image'], quality=100),
'image/class/label':
serialized_example['label'],
}
return sample_dict
TFDS_ID_TO_DECODER_MAP = {
    'cifar10': ClassificationDecoder,
    'cifar100': ClassificationDecoder,
    'imagenet2012': ClassificationDecoder,
}
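# Example usage (an illustrative sketch, not part of the module API; assumes a
# TFDS-style feature dict with a uint8 'image' and an int64 'label'):
#
#   decoder = TFDS_ID_TO_DECODER_MAP['cifar10']()
#   sample = decoder.decode({
#       'image': tf.zeros([32, 32, 3], dtype=tf.uint8),
#       'label': tf.constant(1, dtype=tf.int64),
#   })
#   # sample['image/encoded'] holds JPEG bytes; sample['image/class/label']
#   # passes the label through unchanged.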
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS detection decoders."""
import tensorflow as tf
from official.vision.dataloaders import decoder
class MSCOCODecoder(decoder.Decoder):
"""A tf.Example decoder for tfds coco datasets."""
def decode(self, serialized_example):
"""Decode the serialized example.
Args:
      serialized_example: a dictionary of features produced by tfds.
Returns:
decoded_tensors: a dictionary of tensors with the following fields:
- source_id: a string scalar tensor.
- image: a uint8 tensor of shape [None, None, 3].
- height: an integer scalar tensor.
- width: an integer scalar tensor.
        - groundtruth_classes: an int64 tensor of shape [None].
- groundtruth_is_crowd: a bool tensor of shape [None].
- groundtruth_area: a float32 tensor of shape [None].
- groundtruth_boxes: a float32 tensor of shape [None, 4].
"""
decoded_tensors = {
'source_id': tf.strings.as_string(serialized_example['image/id']),
'image': serialized_example['image'],
'height': tf.cast(tf.shape(serialized_example['image'])[0], tf.int64),
'width': tf.cast(tf.shape(serialized_example['image'])[1], tf.int64),
'groundtruth_classes': serialized_example['objects']['label'],
'groundtruth_is_crowd': serialized_example['objects']['is_crowd'],
'groundtruth_area': tf.cast(
serialized_example['objects']['area'], tf.float32),
'groundtruth_boxes': serialized_example['objects']['bbox'],
}
return decoded_tensors
TFDS_ID_TO_DECODER_MAP = {
'coco/2017': MSCOCODecoder,
'coco/2014': MSCOCODecoder,
'coco': MSCOCODecoder
}
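# Example usage (an illustrative sketch; the nested dict mirrors the TFDS
# 'coco/2017' schema that MSCOCODecoder.decode expects):
#
#   decoder = TFDS_ID_TO_DECODER_MAP['coco/2017']()
#   decoded = decoder.decode({
#       'image/id': tf.constant(1, dtype=tf.int64),
#       'image': tf.zeros([64, 64, 3], dtype=tf.uint8),
#       'objects': {
#           'label': tf.constant([17], dtype=tf.int64),
#           'is_crowd': tf.constant([False]),
#           'area': tf.constant([100.0]),
#           'bbox': tf.constant([[0.1, 0.1, 0.5, 0.5]], dtype=tf.float32),
#       },
#   })
#   # decoded['source_id'] == b'1'; decoded['height'] == 64.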
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS factory functions."""
from official.vision.dataloaders import decoder as base_decoder
from official.vision.dataloaders import tfds_classification_decoders
from official.vision.dataloaders import tfds_detection_decoders
from official.vision.dataloaders import tfds_segmentation_decoders
def get_classification_decoder(tfds_name: str) -> base_decoder.Decoder:
"""Gets classification decoder.
Args:
    tfds_name: `str`, name of the tfds classification dataset.
Returns:
`base_decoder.Decoder` instance.
Raises:
ValueError if the tfds_name doesn't exist in the available decoders.
"""
if tfds_name in tfds_classification_decoders.TFDS_ID_TO_DECODER_MAP:
decoder = tfds_classification_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]()
else:
raise ValueError(
f'TFDS Classification {tfds_name} is not supported')
return decoder
def get_detection_decoder(tfds_name: str) -> base_decoder.Decoder:
"""Gets detection decoder.
Args:
    tfds_name: `str`, name of the tfds detection dataset.
Returns:
`base_decoder.Decoder` instance.
Raises:
ValueError if the tfds_name doesn't exist in the available decoders.
"""
if tfds_name in tfds_detection_decoders.TFDS_ID_TO_DECODER_MAP:
decoder = tfds_detection_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]()
else:
raise ValueError(f'TFDS Detection {tfds_name} is not supported')
return decoder
def get_segmentation_decoder(tfds_name: str) -> base_decoder.Decoder:
"""Gets segmentation decoder.
Args:
    tfds_name: `str`, name of the tfds segmentation dataset.
Returns:
`base_decoder.Decoder` instance.
Raises:
ValueError if the tfds_name doesn't exist in the available decoders.
"""
if tfds_name in tfds_segmentation_decoders.TFDS_ID_TO_DECODER_MAP:
decoder = tfds_segmentation_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]()
else:
raise ValueError(f'TFDS Segmentation {tfds_name} is not supported')
return decoder
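# Example usage (an illustrative sketch): look up a decoder by TFDS dataset
# name; unsupported names raise ValueError.
#
#   decoder = get_detection_decoder('coco/2017')  # an MSCOCODecoder instance
#   try:
#     get_detection_decoder('pascal')
#   except ValueError as e:
#     print(e)  # TFDS Detection pascal is not supported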
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfds factory functions."""
from absl.testing import parameterized
import tensorflow as tf
from official.vision.dataloaders import decoder as base_decoder
from official.vision.dataloaders import tfds_factory
class TFDSFactoryTest(tf.test.TestCase, parameterized.TestCase):
def _create_test_example(self):
serialized_example = {
'image': tf.ones(shape=(100, 100, 3), dtype=tf.uint8),
'label': 1,
'image/id': 0,
'objects': {
'label': 1,
'is_crowd': 0,
'area': 0.5,
'bbox': [0.1, 0.2, 0.3, 0.4]
},
'segmentation_label': tf.ones((100, 100, 1), dtype=tf.uint8),
'image_left': tf.ones(shape=(100, 100, 3), dtype=tf.uint8)
}
return serialized_example
@parameterized.parameters(
('imagenet2012'),
('cifar10'),
('cifar100'),
)
def test_classification_decoder(self, tfds_name):
decoder = tfds_factory.get_classification_decoder(tfds_name)
self.assertIsInstance(decoder, base_decoder.Decoder)
decoded_tensor = decoder.decode(self._create_test_example())
self.assertLen(decoded_tensor, 2)
self.assertIn('image/encoded', decoded_tensor)
self.assertIn('image/class/label', decoded_tensor)
@parameterized.parameters(
('flowers'),
('coco'),
)
  def test_doesnt_exist_classification_decoder(self, tfds_name):
with self.assertRaises(ValueError):
_ = tfds_factory.get_classification_decoder(tfds_name)
@parameterized.parameters(
('coco'),
('coco/2014'),
('coco/2017'),
)
def test_detection_decoder(self, tfds_name):
decoder = tfds_factory.get_detection_decoder(tfds_name)
self.assertIsInstance(decoder, base_decoder.Decoder)
decoded_tensor = decoder.decode(self._create_test_example())
self.assertLen(decoded_tensor, 8)
self.assertIn('image', decoded_tensor)
self.assertIn('source_id', decoded_tensor)
self.assertIn('height', decoded_tensor)
self.assertIn('width', decoded_tensor)
self.assertIn('groundtruth_classes', decoded_tensor)
self.assertIn('groundtruth_is_crowd', decoded_tensor)
self.assertIn('groundtruth_area', decoded_tensor)
self.assertIn('groundtruth_boxes', decoded_tensor)
@parameterized.parameters(
('pascal'),
('cityscapes'),
)
  def test_doesnt_exist_detection_decoder(self, tfds_name):
with self.assertRaises(ValueError):
_ = tfds_factory.get_detection_decoder(tfds_name)
@parameterized.parameters(
('cityscapes'),
('cityscapes/semantic_segmentation'),
('cityscapes/semantic_segmentation_extra'),
)
def test_segmentation_decoder(self, tfds_name):
decoder = tfds_factory.get_segmentation_decoder(tfds_name)
self.assertIsInstance(decoder, base_decoder.Decoder)
decoded_tensor = decoder.decode(self._create_test_example())
self.assertLen(decoded_tensor, 4)
self.assertIn('image/encoded', decoded_tensor)
self.assertIn('image/segmentation/class/encoded', decoded_tensor)
self.assertIn('image/height', decoded_tensor)
self.assertIn('image/width', decoded_tensor)
@parameterized.parameters(
('coco'),
('imagenet'),
)
  def test_doesnt_exist_segmentation_decoder(self, tfds_name):
with self.assertRaises(ValueError):
_ = tfds_factory.get_segmentation_decoder(tfds_name)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS Semantic Segmentation decoders."""
import tensorflow as tf
from official.vision.dataloaders import decoder
class CityScapesDecoder(decoder.Decoder):
"""A tf.Example decoder for tfds cityscapes datasets."""
def __init__(self):
# Original labels to trainable labels map, 255 is the ignore class.
self._label_map = {
-1: 255,
0: 255,
1: 255,
2: 255,
3: 255,
4: 255,
5: 255,
6: 255,
7: 0,
8: 1,
9: 255,
10: 255,
11: 2,
12: 3,
13: 4,
14: 255,
15: 255,
16: 255,
17: 5,
18: 255,
19: 6,
20: 7,
21: 8,
22: 9,
23: 10,
24: 11,
25: 12,
26: 13,
27: 14,
28: 15,
29: 255,
30: 255,
31: 16,
32: 17,
33: 18,
}
def decode(self, serialized_example):
    # Convert labels according to self._label_map.
label = serialized_example['segmentation_label']
for original_label in self._label_map:
label = tf.where(label == original_label,
self._label_map[original_label] * tf.ones_like(label),
label)
sample_dict = {
'image/encoded':
tf.io.encode_jpeg(serialized_example['image_left'], quality=100),
'image/height': serialized_example['image_left'].shape[0],
'image/width': serialized_example['image_left'].shape[1],
'image/segmentation/class/encoded':
tf.io.encode_png(label),
}
return sample_dict
TFDS_ID_TO_DECODER_MAP = {
    'cityscapes': CityScapesDecoder,
    'cityscapes/semantic_segmentation': CityScapesDecoder,
    'cityscapes/semantic_segmentation_extra': CityScapesDecoder,
}
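# The per-key tf.where loop in decode() can equivalently be written as a
# single table lookup; a minimal sketch (an alternative, not the module's
# code), assuming int32 labels in [-1, 33]:
#
#   decoder = CityScapesDecoder()
#   table = tf.constant([decoder._label_map[i] for i in range(-1, 34)],
#                       dtype=tf.int32)
#   remapped = tf.gather(table, label + 1)  # shift so key -1 lands in slot 0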
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utility functions to create tf.Example and tf.SequnceExample for test.
Example:video classification end-to-end test
i.e. from reading input file to train and eval.
```python
class FooTrainTest(tf.test.TestCase):
def setUp(self):
    super().setUp()
# Write the fake tf.train.SequenceExample to file for test.
data_dir = os.path.join(self.get_temp_dir(), 'data')
tf.io.gfile.makedirs(data_dir)
self._data_path = os.path.join(data_dir, 'data.tfrecord')
examples = [
tfexample_utils.make_video_test_example(
image_shape=(36, 36, 3),
audio_shape=(20, 128),
label=random.randint(0, 100)) for _ in range(2)
]
tfexample_utils.dump_to_tfrecord(self._data_path, tf_examples=examples)
def test_foo(self):
dataset = tf.data.TFRecordDataset(self._data_path)
...
```
"""
import io
from typing import Sequence, Union
import numpy as np
from PIL import Image
import tensorflow as tf
IMAGE_KEY = 'image/encoded'
CLASSIFICATION_LABEL_KEY = 'image/class/label'
DISTILLATION_LABEL_KEY = 'image/class/soft_labels'
LABEL_KEY = 'clip/label/index'
AUDIO_KEY = 'features/audio'
DUMP_SOURCE_ID = b'123'
def encode_image(image_array: np.ndarray, fmt: str) -> bytes:
  """Encodes an image array into bytes in the given format."""
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
def make_image_bytes(shape: Sequence[int], fmt: str = 'JPEG') -> bytes:
"""Generates image and return bytes in specified format."""
random_image = np.random.randint(0, 256, size=shape, dtype=np.uint8)
return encode_image(random_image, fmt=fmt)
def put_int64_to_context(seq_example: tf.train.SequenceExample,
label: int = 0,
key: str = LABEL_KEY):
"""Puts int64 to SequenceExample context with key."""
seq_example.context.feature[key].int64_list.value[:] = [label]
def put_bytes_list_to_feature(seq_example: tf.train.SequenceExample,
raw_image_bytes: bytes,
key: str = IMAGE_KEY,
repeat_num: int = 2):
"""Puts bytes list to SequenceExample context with key."""
for _ in range(repeat_num):
seq_example.feature_lists.feature_list.get_or_create(
key).feature.add().bytes_list.value[:] = [raw_image_bytes]
def put_float_list_to_feature(seq_example: tf.train.SequenceExample,
value: Sequence[Sequence[float]], key: str):
"""Puts float list to SequenceExample context with key."""
for s in value:
seq_example.feature_lists.feature_list.get_or_create(
key).feature.add().float_list.value[:] = s
def make_video_test_example(image_shape: Sequence[int] = (263, 320, 3),
audio_shape: Sequence[int] = (10, 256),
label: int = 42):
"""Generates data for testing video models (inc. RGB, audio, & label)."""
raw_image_bytes = make_image_bytes(shape=image_shape)
random_audio = np.random.normal(size=audio_shape).tolist()
seq_example = tf.train.SequenceExample()
put_int64_to_context(seq_example, label=label, key=LABEL_KEY)
put_bytes_list_to_feature(
seq_example, raw_image_bytes, key=IMAGE_KEY, repeat_num=4)
put_float_list_to_feature(seq_example, value=random_audio, key=AUDIO_KEY)
return seq_example
def dump_to_tfrecord(record_file: str,
tf_examples: Sequence[Union[tf.train.Example,
tf.train.SequenceExample]]):
"""Writes serialized Example to TFRecord file with path."""
with tf.io.TFRecordWriter(record_file) as writer:
for tf_example in tf_examples:
writer.write(tf_example.SerializeToString())
def _encode_image(image_array: np.ndarray, fmt: str) -> bytes:
  """Util function to encode an image; delegates to `encode_image` above."""
  return encode_image(image_array, fmt=fmt)
def create_classification_example(
image_height: int,
image_width: int,
image_format: str = 'JPEG',
is_multilabel: bool = False) -> tf.train.Example:
"""Creates image and labels for image classification input pipeline."""
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt=image_format)
labels = [0, 1] if is_multilabel else [0]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
IMAGE_KEY: (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
CLASSIFICATION_LABEL_KEY: (tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
})).SerializeToString()
return serialized_example
def create_distillation_example(
image_height: int,
image_width: int,
num_labels: int,
image_format: str = 'JPEG') -> tf.train.Example:
"""Creates image and labels for image classification with distillation."""
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt=image_format)
soft_labels = [0.6] * num_labels
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
IMAGE_KEY: (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
            DISTILLATION_LABEL_KEY: (tf.train.Feature(
float_list=tf.train.FloatList(value=soft_labels))),
})).SerializeToString()
return serialized_example
def create_3d_image_test_example(image_height: int, image_width: int,
image_volume: int,
image_channel: int) -> tf.train.Example:
"""Creates 3D image and label."""
images = np.random.rand(image_height, image_width, image_volume,
image_channel)
images = images.astype(np.float32)
labels = np.random.randint(
low=2, size=(image_height, image_width, image_volume, image_channel))
labels = labels.astype(np.float32)
feature = {
IMAGE_KEY: (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[images.tobytes()]))),
CLASSIFICATION_LABEL_KEY: (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[labels.tobytes()])))
}
return tf.train.Example(features=tf.train.Features(feature=feature))
def create_detection_test_example(image_height: int, image_width: int,
image_channel: int,
num_instances: int) -> tf.train.Example:
"""Creates and returns a test example containing box and mask annotations.
Args:
image_height: The height of test image.
image_width: The width of test image.
image_channel: The channel of test image.
num_instances: The number of object instances per image.
Returns:
A tf.train.Example for testing.
"""
image = make_image_bytes([image_height, image_width, image_channel])
if num_instances == 0:
xmins = []
xmaxs = []
ymins = []
ymaxs = []
labels = []
areas = []
is_crowds = []
masks = []
labels_text = []
else:
xmins = list(np.random.rand(num_instances))
xmaxs = list(np.random.rand(num_instances))
ymins = list(np.random.rand(num_instances))
ymaxs = list(np.random.rand(num_instances))
labels_text = [b'class_1'] * num_instances
labels = list(np.random.randint(100, size=num_instances))
areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
is_crowds = [0] * num_instances
masks = []
for _ in range(num_instances):
mask = make_image_bytes([image_height, image_width], fmt='PNG')
masks.append(mask)
return tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/class/text': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels_text))),
'image/object/is_crowd': (tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
}))
def create_segmentation_test_example(image_height: int, image_width: int,
image_channel: int) -> tf.train.Example:
"""Creates and returns a test example containing mask annotations.
Args:
image_height: The height of test image.
image_width: The width of test image.
image_channel: The channel of test image.
Returns:
A tf.train.Example for testing.
"""
image = make_image_bytes([image_height, image_width, image_channel])
mask = make_image_bytes([image_height, image_width], fmt='PNG')
return tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/segmentation/class/encoded': (tf.train.Feature(
bytes_list=tf.train.BytesList(value=[mask]))),
'image/height': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width])))
}))
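# Round-trip sketch (illustrative; '/tmp/example.tfrecord' is a placeholder
# path): write a generated detection example to a TFRecord file and read it
# back with tf.data.
#
#   example = create_detection_test_example(
#       image_height=100, image_width=100, image_channel=3, num_instances=2)
#   dump_to_tfrecord('/tmp/example.tfrecord', tf_examples=[example])
#   serialized = next(iter(tf.data.TFRecordDataset('/tmp/example.tfrecord')))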
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data loader utils."""
from typing import Dict
# Import libraries
import tensorflow as tf
from official.vision.ops import preprocess_ops
def process_source_id(source_id: tf.Tensor) -> tf.Tensor:
"""Processes source_id to the right format.
Args:
source_id: A `tf.Tensor` that contains the source ID. It can be empty.
Returns:
A formatted source ID.
"""
if source_id.dtype == tf.string:
source_id = tf.strings.to_number(source_id, tf.int64)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int64),
false_fn=lambda: tf.identity(source_id))
return source_id
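# Example behavior (an illustrative sketch): string IDs are parsed to int64,
# and an empty source_id falls back to the sentinel -1.
#
#   process_source_id(tf.constant('123'))                 # -> 123 (tf.int64)
#   process_source_id(tf.constant([], dtype=tf.string))   # -> -1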
def pad_groundtruths_to_fixed_size(groundtruths: Dict[str, tf.Tensor],
size: int) -> Dict[str, tf.Tensor]:
"""Pads the first dimension of groundtruths labels to the fixed size.
Args:
groundtruths: A dictionary of {`str`: `tf.Tensor`} that contains groundtruth
annotations of `boxes`, `is_crowds`, `areas` and `classes`.
size: An `int` that specifies the expected size of the first dimension of
padded tensors.
Returns:
A dictionary of the same keys as input and padded tensors as values.
"""
groundtruths['boxes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['boxes'], size, -1)
groundtruths['is_crowds'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['is_crowds'], size, 0)
groundtruths['areas'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['areas'], size, -1)
groundtruths['classes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['classes'], size, -1)
if 'attributes' in groundtruths:
for k, v in groundtruths['attributes'].items():
groundtruths['attributes'][k] = preprocess_ops.clip_or_pad_to_fixed_size(
v, size, -1)
return groundtruths
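# Example (an illustrative sketch): padding a single instance out to size 3
# appends filler rows of -1 ('boxes', 'areas', 'classes') and 0 ('is_crowds').
#
#   padded = pad_groundtruths_to_fixed_size(
#       {'boxes': tf.constant([[0.1, 0.2, 0.3, 0.4]]),
#        'is_crowds': tf.constant([[0]]),
#        'areas': tf.constant([[0.5]]),
#        'classes': tf.constant([[1]])},
#       size=3)
#   # padded['boxes'].shape == (3, 4)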
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for dataloader utils functions."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.dataloaders import utils
class UtilsTest(tf.test.TestCase, parameterized.TestCase):
def test_process_empty_source_id(self):
source_id = tf.constant([], dtype=tf.int64)
source_id = tf.strings.as_string(source_id)
self.assertEqual(-1, utils.process_source_id(source_id=source_id))
@parameterized.parameters(
([128, 256], [128, 256]),
([128, 32, 16], [128, 32, 16]),
)
def test_process_source_id(self, source_id, expected_result):
source_id = tf.constant(source_id, dtype=tf.int64)
source_id = tf.strings.as_string(source_id)
self.assertSequenceAlmostEqual(expected_result,
utils.process_source_id(source_id=source_id))
@parameterized.parameters(
([[10, 20, 30, 40]], [[100]], [[0]], 10, None),
([[0.1, 0.2, 0.5, 0.6]], [[0.5]], [[1]], 2, [[1.0, 2.0]]),
)
def test_pad_groundtruths_to_fixed_size(self, boxes, area, classes, size,
attributes):
groundtruths = {}
groundtruths['boxes'] = tf.constant(boxes)
groundtruths['is_crowds'] = tf.constant([[0]])
groundtruths['areas'] = tf.constant(area)
groundtruths['classes'] = tf.constant(classes)
if attributes:
groundtruths['attributes'] = {'depth': tf.constant(attributes)}
actual_result = utils.pad_groundtruths_to_fixed_size(
groundtruths=groundtruths, size=size)
# Check that the first dimension is padded to the expected size.
for key in actual_result:
if key == 'attributes':
for _, v in actual_result[key].items():
pad_shape = v.shape[0]
self.assertEqual(size, pad_shape)
else:
pad_shape = actual_result[key].shape[0]
self.assertEqual(size, pad_shape)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Parser for video and label datasets."""
from typing import Dict, Optional, Tuple, Union
from absl import logging
import tensorflow as tf
from official.vision.configs import video_classification as exp_cfg
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import augment
from official.vision.ops import preprocess_ops_3d
IMAGE_KEY = 'image/encoded'
LABEL_KEY = 'clip/label/index'
def process_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
stride: int = 1,
random_stride_range: int = 0,
num_test_clips: int = 1,
min_resize: int = 256,
crop_size: int = 224,
num_crops: int = 1,
zero_centering_image: bool = False,
min_aspect_ratio: float = 0.5,
max_aspect_ratio: float = 2,
min_area_ratio: float = 0.49,
max_area_ratio: float = 1.0,
augmenter: Optional[augment.ImageAugment] = None,
seed: Optional[int] = None) -> tf.Tensor:
"""Processes a serialized image tensor.
Args:
image: Input Tensor of shape [timesteps] and type tf.string of serialized
frames.
is_training: Whether or not in training mode. If True, random sample, crop
and left right flip is used.
num_frames: Number of frames per subclip.
stride: Temporal stride to sample frames.
random_stride_range: An int indicating the min and max bounds to uniformly
sample different strides from the video. E.g., a value of 1 with stride=2
will uniformly sample a stride in {1, 2, 3} for each video in a batch.
      Only used during training, for the purposes of frame-rate augmentation.
Defaults to 0, which disables random sampling.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
      are aggregated in the batch dimension.
min_resize: Frames are resized so that min(height, width) is min_resize.
crop_size: Final size of the frame after cropping the resized frames. Both
height and width are the same.
num_crops: Number of crops to perform on the resized frames.
zero_centering_image: If True, frames are normalized to values in [-1, 1].
If False, values in [0, 1].
min_aspect_ratio: The minimum aspect range for cropping.
max_aspect_ratio: The maximum aspect range for cropping.
min_area_ratio: The minimum area range for cropping.
max_area_ratio: The maximum area range for cropping.
augmenter: Image augmenter to distort each image.
seed: A deterministic seed to use when sampling.
Returns:
Processed frames. Tensor of shape
[num_frames * num_test_clips, crop_size, crop_size, 3].
"""
# Validate parameters.
if is_training and num_test_clips != 1:
logging.warning(
'`num_test_clips` %d is ignored since `is_training` is `True`.',
num_test_clips)
if random_stride_range < 0:
raise ValueError('Random stride range should be >= 0, got {}'.format(
random_stride_range))
# Temporal sampler.
if is_training:
if random_stride_range > 0:
# Uniformly sample different frame-rates
stride = tf.random.uniform(
[],
tf.maximum(stride - random_stride_range, 1),
stride + random_stride_range,
dtype=tf.int32)
# Sample random clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride,
seed)
elif num_test_clips > 1:
# Sample linspace clips.
image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips,
num_frames, stride)
else:
# Sample middle clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride)
# Decode JPEG string to tf.uint8.
if image.dtype == tf.string:
image = preprocess_ops_3d.decode_jpeg(image, 3)
if is_training:
# Standard image data augmentation: random resized crop and random flip.
image = preprocess_ops_3d.random_crop_resize(
image, crop_size, crop_size, num_frames, 3,
(min_aspect_ratio, max_aspect_ratio),
(min_area_ratio, max_area_ratio))
image = preprocess_ops_3d.random_flip_left_right(image, seed)
if augmenter is not None:
image = augmenter.distort(image)
else:
# Resize images (resize happens only if necessary to save compute).
image = preprocess_ops_3d.resize_smallest(image, min_resize)
  # Crop the frames.
image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False,
num_crops)
  # Cast the frames to float32, normalizing according to zero_centering_image.
return preprocess_ops_3d.normalize_image(image, zero_centering_image)
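# Example (an illustrative sketch; `frames` stands in for a [timesteps]
# tf.string tensor of JPEG-encoded frames, e.g. from the Decoder below):
#
#   clip = process_image(frames, is_training=False, num_frames=8,
#                        min_resize=256, crop_size=224)
#   # clip.shape == [8, 224, 224, 3], float32 in [0, 1] by default.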
def postprocess_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
num_test_clips: int = 1,
num_test_crops: int = 1) -> tf.Tensor:
"""Processes a batched Tensor of frames.
The same parameters used in process should be used here.
Args:
image: Input Tensor of shape [batch, timesteps, height, width, 3].
is_training: Whether or not in training mode. If True, random sample, crop
and left right flip is used.
num_frames: Number of frames per subclip.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
      are aggregated in the batch dimension.
num_test_crops: Number of test crops (1 by default). If more than 1, there
are multiple crops for each clip at test time. If 1, there is a single
      central crop. The crops are aggregated in the batch dimension.
Returns:
Processed frames. Tensor of shape
[batch * num_test_clips * num_test_crops, num_frames, height, width, 3].
"""
num_views = num_test_clips * num_test_crops
if num_views > 1 and not is_training:
    # In this case, multiple views are merged together in the batch dimension,
    # which will be batch * num_views.
image = tf.reshape(image, [-1, num_frames] + image.shape[2:].as_list())
return image
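# Example (an illustrative sketch): at eval time with 4 test clips, a batch of
# 2 videos arrives as [2, 4 * num_frames, H, W, 3] and is unfolded so that the
# leading dimension becomes batch * num_views.
#
#   out = postprocess_image(tf.zeros([2, 32, 64, 64, 3]), is_training=False,
#                           num_frames=8, num_test_clips=4)
#   # out.shape == [8, 8, 64, 64, 3]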
def process_label(label: tf.Tensor,
one_hot_label: bool = True,
num_classes: Optional[int] = None) -> tf.Tensor:
"""Processes label Tensor."""
# Validate parameters.
if one_hot_label and not num_classes:
raise ValueError(
'`num_classes` should be given when requesting one hot label.')
# Cast to tf.int32.
label = tf.cast(label, dtype=tf.int32)
if one_hot_label:
# Replace label index by one hot representation.
label = tf.one_hot(label, num_classes)
if len(label.shape.as_list()) > 1:
label = tf.reduce_sum(label, axis=0)
if num_classes == 1:
# The trick for single label.
label = 1 - label
return label
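# Example (an illustrative sketch): multiple label indices are merged into a
# single multi-hot vector.
#
#   process_label(tf.constant([2, 5]), one_hot_label=True, num_classes=10)
#   # -> a length-10 float vector with ones at indices 2 and 5.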
class Decoder(decoder.Decoder):
"""A tf.Example decoder for classification task."""
def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
self._context_description = {
# One integer stored in context.
label_key: tf.io.VarLenFeature(tf.int64),
}
self._sequence_description = {
# Each image is a string encoding JPEG.
image_key: tf.io.FixedLenSequenceFeature((), tf.string),
}
def add_feature(self, feature_name: str,
feature_type: Union[tf.io.VarLenFeature,
tf.io.FixedLenFeature,
tf.io.FixedLenSequenceFeature]):
self._sequence_description[feature_name] = feature_type
def add_context(self, feature_name: str,
feature_type: Union[tf.io.VarLenFeature,
tf.io.FixedLenFeature,
tf.io.FixedLenSequenceFeature]):
self._context_description[feature_name] = feature_type
def decode(self, serialized_example):
"""Parses a single tf.Example into image and label tensors."""
result = {}
context, sequences = tf.io.parse_single_sequence_example(
serialized_example, self._context_description,
self._sequence_description)
result.update(context)
result.update(sequences)
for key, value in result.items():
if isinstance(value, tf.SparseTensor):
result[key] = tf.sparse.to_dense(value)
return result
class VideoTfdsDecoder(decoder.Decoder):
"""A tf.SequenceExample decoder for tfds video classification datasets."""
def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
self._image_key = image_key
self._label_key = label_key
def decode(self, features):
"""Decode the TFDS FeatureDict.
Args:
features: features from TFDS video dataset.
See https://www.tensorflow.org/datasets/catalog/ucf101 for example.
Returns:
Dict of tensors.
"""
sample_dict = {
self._image_key: features['video'],
self._label_key: features['label'],
}
return sample_dict
class Parser(parser.Parser):
"""Parses a video and label dataset."""
def __init__(self,
input_params: exp_cfg.DataConfig,
image_key: str = IMAGE_KEY,
label_key: str = LABEL_KEY):
self._num_frames = input_params.feature_shape[0]
self._stride = input_params.temporal_stride
self._random_stride_range = input_params.random_stride_range
self._num_test_clips = input_params.num_test_clips
self._min_resize = input_params.min_image_size
self._crop_size = input_params.feature_shape[1]
self._num_crops = input_params.num_test_crops
self._one_hot_label = input_params.one_hot
self._num_classes = input_params.num_classes
self._image_key = image_key
self._label_key = label_key
self._dtype = tf.dtypes.as_dtype(input_params.dtype)
self._output_audio = input_params.output_audio
self._min_aspect_ratio = input_params.aug_min_aspect_ratio
self._max_aspect_ratio = input_params.aug_max_aspect_ratio
self._min_area_ratio = input_params.aug_min_area_ratio
self._max_area_ratio = input_params.aug_max_area_ratio
if self._output_audio:
self._audio_feature = input_params.audio_feature
self._audio_shape = input_params.audio_feature_shape
self._augmenter = None
if input_params.aug_type is not None:
aug_type = input_params.aug_type
if aug_type == 'autoaug':
logging.info('Using AutoAugment.')
self._augmenter = augment.AutoAugment()
elif aug_type == 'randaug':
logging.info('Using RandAugment.')
self._augmenter = augment.RandAugment()
else:
raise ValueError('Augmentation policy {} is not supported.'.format(
aug_type))
def _parse_train_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for training."""
# Process image and label.
image = decoded_tensors[self._image_key]
image = process_image(
image=image,
is_training=True,
num_frames=self._num_frames,
stride=self._stride,
random_stride_range=self._random_stride_range,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size,
min_aspect_ratio=self._min_aspect_ratio,
max_aspect_ratio=self._max_aspect_ratio,
min_area_ratio=self._min_area_ratio,
max_area_ratio=self._max_area_ratio,
augmenter=self._augmenter)
image = tf.cast(image, dtype=self._dtype)
features = {'image': image}
label = decoded_tensors[self._label_key]
label = process_label(label, self._one_hot_label, self._num_classes)
if self._output_audio:
audio = decoded_tensors[self._audio_feature]
audio = tf.cast(audio, dtype=self._dtype)
# TODO(yeqing): synchronize audio/video sampling. Especially randomness.
audio = preprocess_ops_3d.sample_sequence(
audio, self._audio_shape[0], random=False, stride=1)
audio = tf.ensure_shape(audio, self._audio_shape)
features['audio'] = audio
return features, label
def _parse_eval_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for evaluation."""
image = decoded_tensors[self._image_key]
image = process_image(
image=image,
is_training=False,
num_frames=self._num_frames,
stride=self._stride,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size,
num_crops=self._num_crops)
image = tf.cast(image, dtype=self._dtype)
features = {'image': image}
label = decoded_tensors[self._label_key]
label = process_label(label, self._one_hot_label, self._num_classes)
if self._output_audio:
audio = decoded_tensors[self._audio_feature]
audio = tf.cast(audio, dtype=self._dtype)
audio = preprocess_ops_3d.sample_sequence(
audio, self._audio_shape[0], random=False, stride=1)
audio = tf.ensure_shape(audio, self._audio_shape)
features['audio'] = audio
return features, label
class PostBatchProcessor(object):
"""Processes a video and label dataset which is batched."""
def __init__(self, input_params: exp_cfg.DataConfig):
self._is_training = input_params.is_training
self._num_frames = input_params.feature_shape[0]
self._num_test_clips = input_params.num_test_clips
self._num_test_crops = input_params.num_test_crops
def __call__(self, features: Dict[str, tf.Tensor],
label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses a single tf.Example into image and label tensors."""
for key in ['image']:
if key in features:
features[key] = postprocess_image(
image=features[key],
is_training=self._is_training,
num_frames=self._num_frames,
num_test_clips=self._num_test_clips,
num_test_crops=self._num_test_crops)
return features, label
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video_input.py."""
import io
# Import libraries
import numpy as np
from PIL import Image
import tensorflow as tf
import tensorflow_datasets as tfds
from official.vision.configs import video_classification as exp_cfg
from official.vision.dataloaders import video_input
AUDIO_KEY = 'features/audio'
def fake_seq_example():
# Create fake data.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
label = 42
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
seq_example = tf.train.SequenceExample()
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [
label
]
random_audio = np.random.normal(size=(10, 256)).tolist()
for s in random_audio:
seq_example.feature_lists.feature_list.get_or_create(
AUDIO_KEY).feature.add().float_list.value[:] = s
return seq_example, label
class DecoderTest(tf.test.TestCase):
"""A tf.SequenceExample decoder for the video classification task."""
def test_decoder(self):
decoder = video_input.Decoder()
seq_example, label = fake_seq_example()
serialized_example = seq_example.SerializeToString()
decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertCountEqual([video_input.IMAGE_KEY, video_input.LABEL_KEY],
results.keys())
self.assertEqual(label, results[video_input.LABEL_KEY])
def test_decode_audio(self):
decoder = video_input.Decoder()
decoder.add_feature(AUDIO_KEY, tf.io.VarLenFeature(dtype=tf.float32))
seq_example, label = fake_seq_example()
serialized_example = seq_example.SerializeToString()
decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertCountEqual(
[video_input.IMAGE_KEY, video_input.LABEL_KEY, AUDIO_KEY],
results.keys())
self.assertEqual(label, results[video_input.LABEL_KEY])
self.assertEqual(results[AUDIO_KEY].shape, (10, 256))
def test_tfds_decode(self):
with tfds.testing.mock_data(num_examples=1):
dataset = tfds.load('ucf101', split='train').take(1)
data = next(iter(dataset))
decoder = video_input.VideoTfdsDecoder()
decoded_tensors = decoder.decode(data)
self.assertContainsSubset([video_input.LABEL_KEY, video_input.IMAGE_KEY],
decoded_tensors.keys())
class VideoAndLabelParserTest(tf.test.TestCase):
def test_video_input(self):
params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)
params.min_image_size = 224
decoder = video_input.Decoder()
parser = video_input.Parser(params).parse_fn(params.is_training)
seq_example, label = fake_seq_example()
input_tensor = tf.constant(seq_example.SerializeToString())
decoded_tensors = decoder.decode(input_tensor)
output_tensor = parser(decoded_tensors)
image_features, label = output_tensor
image = image_features['image']
self.assertAllEqual(image.shape, (2, 224, 224, 3))
self.assertAllEqual(label.shape, (600,))
def test_video_audio_input(self):
params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)
params.min_image_size = 224
params.output_audio = True
params.audio_feature = AUDIO_KEY
params.audio_feature_shape = (15, 256)
decoder = video_input.Decoder()
decoder.add_feature(params.audio_feature,
tf.io.VarLenFeature(dtype=tf.float32))
parser = video_input.Parser(params).parse_fn(params.is_training)
seq_example, label = fake_seq_example()
input_tensor = tf.constant(seq_example.SerializeToString())
decoded_tensors = decoder.decode(input_tensor)
output_tensor = parser(decoded_tensors)
features, label = output_tensor
image = features['image']
audio = features['audio']
self.assertAllEqual(image.shape, (2, 224, 224, 3))
self.assertAllEqual(label.shape, (600,))
self.assertEqual(audio.shape, (15, 256))
def test_video_input_random_stride(self):
params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)
params.min_image_size = 224
params.temporal_stride = 2
params.random_stride_range = 1
decoder = video_input.Decoder()
parser = video_input.Parser(params).parse_fn(params.is_training)
seq_example, label = fake_seq_example()
input_tensor = tf.constant(seq_example.SerializeToString())
decoded_tensors = decoder.decode(input_tensor)
output_tensor = parser(decoded_tensors)
image_features, label = output_tensor
image = image_features['image']
self.assertAllEqual(image.shape, (2, 224, 224, 3))
self.assertAllEqual(label.shape, (600,))
def test_video_input_augmentation_returns_shape(self):
params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)
params.min_image_size = 224
params.temporal_stride = 2
params.aug_type = 'autoaug'
decoder = video_input.Decoder()
parser = video_input.Parser(params).parse_fn(params.is_training)
seq_example, label = fake_seq_example()
input_tensor = tf.constant(seq_example.SerializeToString())
decoded_tensors = decoder.decode(input_tensor)
output_tensor = parser(decoded_tensors)
image_features, label = output_tensor
image = image_features['image']
self.assertAllEqual(image.shape, (2, 224, 224, 3))
self.assertAllEqual(label.shape, (600,))
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.