ModelZoo / ResNet50_tensorflow · Commits

Commit 0225b135 (unverified), authored Mar 05, 2022 by Srihari Humbarwadi; committed by GitHub on Mar 05, 2022

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

Parents: 7479dbb8, 4c571a3c
Changes: 332 files in the full merge; showing 20 changed files below, with 3,116 additions and 0 deletions (+3116, -0).
official/vision/dataloaders/input_reader_factory.py                +44    -0
official/vision/dataloaders/maskrcnn_input.py                      +345   -0
official/vision/dataloaders/parser.py                              +81    -0
official/vision/dataloaders/retinanet_input.py                     +328   -0
official/vision/dataloaders/segmentation_input.py                  +218   -0
official/vision/dataloaders/tf_example_decoder.py                  +176   -0
official/vision/dataloaders/tf_example_decoder_test.py             +267   -0
official/vision/dataloaders/tf_example_label_map_decoder.py        +67    -0
official/vision/dataloaders/tf_example_label_map_decoder_test.py   +188   -0
official/vision/dataloaders/tfds_classification_decoders.py        +38    -0
official/vision/dataloaders/tfds_detection_decoders.py             +60    -0
official/vision/dataloaders/tfds_factory.py                        +71    -0
official/vision/dataloaders/tfds_factory_test.py                   +114   -0
official/vision/dataloaders/tfds_segmentation_decoders.py          +86    -0
official/vision/dataloaders/tfexample_utils.py                     +291   -0
official/vision/dataloaders/utils.py                               +69    -0
official/vision/dataloaders/utils_test.py                          +71    -0
official/vision/dataloaders/video_input.py                         +393   -0
official/vision/dataloaders/video_input_test.py                    +195   -0
official/vision/evaluation/__init__.py                             +14    -0
official/vision/dataloaders/input_reader_factory.py  (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Factory for getting TF-Vision input readers."""

from official.common import dataset_fn as dataset_fn_util
from official.core import config_definitions as cfg
from official.core import input_reader as core_input_reader
from official.vision.dataloaders import input_reader as vision_input_reader


def input_reader_generator(params: cfg.DataConfig,
                           **kwargs) -> core_input_reader.InputReader:
  """Instantiates an input reader class according to the params.

  Args:
    params: A config_definitions.DataConfig object.
    **kwargs: Additional arguments passed to input reader initialization.

  Returns:
    An InputReader object.
  """
  if params.is_training and params.get('pseudo_label_data', False):
    return vision_input_reader.CombinationDatasetInputReader(
        params,
        pseudo_label_dataset_fn=dataset_fn_util.pick_dataset_fn(
            params.pseudo_label_data.file_type),
        **kwargs)
  else:
    return core_input_reader.InputReader(params, **kwargs)
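A usage sketch (not part of this commit): the factory only switches between the two reader classes based on the config, so a caller typically builds a `DataConfig` and passes any decoder/parser callables through `**kwargs`. The path and batch size below are placeholders, and the commented `read()` call assumes the core `InputReader.read` API.

# Minimal sketch, assuming the TensorFlow Model Garden `official` package is
# importable; input_path and global_batch_size are placeholder values.
from official.core import config_definitions as cfg
from official.vision.dataloaders import input_reader_factory

params = cfg.DataConfig(
    input_path='/path/to/train-*.tfrecord',  # placeholder
    global_batch_size=8,
    is_training=True)
reader = input_reader_factory.input_reader_generator(params)
# dataset = reader.read()  # yields a tf.data.Dataset once decoder_fn /
#                          # parser_fn are supplied via **kwargs.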
official/vision/dataloaders/maskrcnn_input.py  (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data parser and processing for Mask R-CNN."""

# Import libraries
import tensorflow as tf

from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               include_mask=False,
               mask_crop_size=112,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float` number between 0 and 1 representing the
        lower-bound threshold to assign positive labels to anchors during RPN
        target assignment.
      rpn_unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold below which anchors are assigned negative labels
        during RPN target assignment.
      rpn_batch_size_per_im: `int` number of anchors sampled per image for
        computing the RPN loss.
      rpn_fg_fraction: `float` fraction of the sampled RPN anchors that should
        be foreground.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equal to 1.
      max_num_instances: `int` maximum number of instances in an image. The
        groundtruth data will be padded to `max_num_instances`.
      include_mask: a bool to indicate whether to parse mask groundtruth.
      mask_crop_size: the size to which the groundtruth mask is cropped.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size

    # Target assigning.
    self._rpn_match_threshold = rpn_match_threshold
    self._rpn_unmatched_threshold = rpn_unmatched_threshold
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Mask.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size

    # Image output dtype.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
          in [y1, x1, y2, x2] format. The coordinates are w.r.t. the scaled
          image that is fed to the network. The tensor is padded with -1 to
          the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth class annotations. The tensor is padded with
          -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and resized to
          a fixed size determined by mask_crop_size.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = preprocess_ops.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Transfer boxes to the original image space and do normalization.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        anchor_boxes, boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': anchor_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
        'gt_boxes': preprocess_ops.clip_or_pad_to_fixed_size(
            boxes, self._max_num_instances, -1),
        'gt_classes': preprocess_ops.clip_or_pad_to_fixed_size(
            classes, self._max_num_instances, -1),
    }
    if self._include_mask:
      labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
          masks, self._max_num_instances, -1)

    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        image: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for evaluation. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height, scaled_width],
            [y_scale, x_scale], [y_offset, x_offset]].
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensors
            with shape [height_l, width_l, 4] representing anchor boxes at each
            level.
    """
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape)

    # Compute Anchor boxes.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))

    labels = {
        'image_info': image_info,
        'anchor_boxes': anchor_boxes,
    }

    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    labels['groundtruths'] = groundtruths

    return image, labels
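A hedged sketch (not part of the commit) of how this parser composes with the decoder added later in this change: the decoder turns serialized tf.Examples into the tensor dictionary the parser consumes. The anchor settings and file pattern below are illustrative placeholders, not values prescribed by this change.

# Minimal wiring sketch, assuming the `official` package is importable.
import tensorflow as tf
from official.vision.dataloaders import maskrcnn_input
from official.vision.dataloaders import tf_example_decoder

decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
maskrcnn_parser = maskrcnn_input.Parser(
    output_size=[1024, 1024],     # illustrative values
    min_level=2,
    max_level=6,
    num_scales=1,
    aspect_ratios=[0.5, 1.0, 2.0],
    anchor_size=8,
    include_mask=True)

files = tf.io.gfile.glob('/path/to/train-*.tfrecord')  # placeholder pattern
dataset = (tf.data.TFRecordDataset(files)
           .map(decoder.decode)                              # bytes -> dict
           .map(maskrcnn_parser.parse_fn(is_training=True))  # dict -> (image, labels)
           .batch(2))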
official/vision/dataloaders/parser.py  (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""The generic parser interface."""

import abc


class Parser(object):
  """Parses data and produces tensors to be consumed by models."""

  __metaclass__ = abc.ABCMeta

  @abc.abstractmethod
  def _parse_train_data(self, decoded_tensors):
    """Generates images and labels that are usable for model training.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  @abc.abstractmethod
  def _parse_eval_data(self, decoded_tensors):
    """Generates images and labels that are usable for model evaluation.

    Args:
      decoded_tensors: a dict of Tensors produced by the decoder.

    Returns:
      images: the image tensor.
      labels: a dict of Tensors that contains labels.
    """
    pass

  def parse_fn(self, is_training):
    """Returns a parse fn that reads and parses raw tensors from the decoder.

    Args:
      is_training: a `bool` to indicate whether it is in training mode.

    Returns:
      parse: a `callable` that takes the serialized example and generates the
        (images, labels) tuple where labels is a dict of Tensors that contains
        labels.
    """
    def parse(decoded_tensors):
      """Parses the serialized example data."""
      if is_training:
        return self._parse_train_data(decoded_tensors)
      else:
        return self._parse_eval_data(decoded_tensors)

    return parse

  @classmethod
  def inference_fn(cls, inputs):
    """Parses inputs for predictions.

    Args:
      inputs: A Tensor, or dictionary of Tensors.

    Returns:
      processed_inputs: An input tensor to the model.
    """
    pass
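A toy subclass, purely illustrative and not part of the commit: the only contract the interface imposes is that the two abstract methods each return an (image, labels-dict) pair, which `parse_fn` then dispatches on `is_training`.

# Hedged illustration of the Parser contract.
import tensorflow as tf
from official.vision.dataloaders import parser


class ToyParser(parser.Parser):

  def _parse_train_data(self, decoded_tensors):
    # Convert to float32 in [0, 1]; a real parser would also augment here.
    image = tf.image.convert_image_dtype(decoded_tensors['image'], tf.float32)
    return image, {'classes': decoded_tensors['groundtruth_classes']}

  def _parse_eval_data(self, decoded_tensors):
    # Reuses the training path since this toy parser has no augmentation.
    return self._parse_train_data(decoded_tensors)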
official/vision/dataloaders/retinanet_input.py  (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data parser and processing for RetinaNet.

Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""

# Import libraries
from absl import logging
import tensorflow as tf

from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import anchor
from official.vision.ops import augment
from official.vision.ops import box_ops
from official.vision.ops import preprocess_ops


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_type=None,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               use_autoaugment=False,
               autoaugment_policy_name='v0',
               skip_crowd_during_training=True,
               max_num_instances=100,
               dtype='bfloat16',
               mode=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      match_threshold: `float` number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
      aug_type: An optional Augmentation object to choose from AutoAugment and
        RandAugment.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
        during training.
      autoaugment_policy_name: `string` that specifies the name of the
        AutoAugment policy that will be used during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equal to 1.
      max_num_instances: `int` maximum number of instances in an image. The
        groundtruth data will be padded to `max_num_instances`.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction
        or prediction with groundtruths in the outputs.
    """
    self._mode = mode
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Data augmentation with AutoAugment or RandAugment.
    self._augmenter = None
    if aug_type is not None:
      if aug_type.type == 'autoaug':
        logging.info('Using AutoAugment.')
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        logging.info('Using RandAugment.')
        self._augmenter = augment.RandAugment.build_for_detection(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError(f'Augmentation policy {aug_type.type} not supported.')

    # Deprecated. Data Augmentation with AutoAugment.
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name

    # Data type.
    self._dtype = dtype

  def _parse_train_data(self, data):
    """Parses data for training."""
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})
    is_crowds = data['groundtruth_is_crowd']

    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      for k, v in attributes.items():
        attributes[k] = tf.gather(v, indices)

    # Gets original image.
    image = data['image']

    # Apply autoaug or randaug.
    if self._augmenter is not None:
      image, boxes = self._augmenter.distort_with_boxes(image, boxes)
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    groundtruths = {}
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
    # `ground_truth` of attributes is assumed in shape [N, attribute_size].
    # TODO(xianzhi): support parsing attributes weights.
    attributes = data.get('groundtruth_attributes', {})

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    for k, v in attributes.items():
      attributes[k] = tf.gather(v, indices)

    # Assigns anchors.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, att_targets, cls_weights,
     box_weights) = anchor_labeler.label_anchors(
         anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

    # Casts input image to desired data type.
    image = tf.cast(image, dtype=self._dtype)

    # Sets up groundtruth data for evaluation.
    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes']),
        'image_info': image_info,
        'boxes': box_ops.denormalize_boxes(data['groundtruth_boxes'],
                                           image_shape),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    if 'groundtruth_attributes' in data:
      groundtruths['attributes'] = data['groundtruth_attributes']
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': anchor_boxes,
        'cls_weights': cls_weights,
        'box_weights': box_weights,
        'image_info': image_info,
        'groundtruths': groundtruths,
    }
    if att_targets:
      labels['attribute_targets'] = att_targets
    return image, labels
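Both detection parsers lean on the 4x2 `image_info` matrix produced by `resize_and_crop_image`, indexing row 2 for the scale and row 3 for the offset. A hedged numeric illustration of the box-rescaling arithmetic (the row layout is the one documented in this commit's docstrings; the numbers are made up, and the clipping step of `resize_and_crop_boxes` is omitted):

# image_info rows: 0 = original size, 1 = scaled size, 2 = scale, 3 = offset.
import numpy as np

image_info = np.array([[480., 640.],   # row 0: original [height, width]
                       [384., 512.],   # row 1: scaled [height, width]
                       [0.8, 0.8],     # row 2: [y_scale, x_scale]
                       [0., 0.]])      # row 3: [y_offset, x_offset] of the crop
box = np.array([120., 160., 360., 480.])          # [y1, x1, y2, x2] in pixels
print(box * np.tile(image_info[2], 2) - np.tile(image_info[3], 2))
# -> [ 96. 128. 288. 384.]  the box in the scaled image's coordinate frame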
official/vision/dataloaders/segmentation_input.py  (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data parser and processing for segmentation datasets."""

import tensorflow as tf

from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import preprocess_ops


class Decoder(decoder.Decoder):
  """A tf.Example decoder for segmentation task."""

  def __init__(self):
    self._keys_to_features = {
        'image/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        'image/height':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/width':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/segmentation/class/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value='')
    }

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               crop_size=None,
               resize_eval_groundtruth=True,
               groundtruth_padded_size=None,
               ignore_label=255,
               aug_rand_hflip=False,
               preserve_aspect_ratio=True,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      crop_size: `Tensor` or `list` for [height, width] of the crop. If
        specified, a training crop of size crop_size is returned. This is
        useful for cropping original images during training while evaluating
        on original image sizes.
      resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are
        resized to output_size.
      groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
        resize_eval_groundtruth is set to False, the groundtruth masks are
        padded to this size.
      ignore_label: `int`, pixels labeled with this value are not used for
        training or evaluation.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved;
        otherwise, the image is resized to output_size.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    self._output_size = output_size
    self._crop_size = crop_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
                       'specified when resize_eval_groundtruth is False.')
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label
    self._preserve_aspect_ratio = preserve_aspect_ratio

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # dtype.
    self._dtype = dtype

  def _prepare_image_and_label(self, data):
    """Prepares normalized image and label."""
    image = tf.io.decode_image(data['image/encoded'], channels=3)
    label = tf.io.decode_image(data['image/segmentation/class/encoded'],
                               channels=1)
    height = data['image/height']
    width = data['image/width']
    image = tf.reshape(image, (height, width, 3))
    label = tf.reshape(label, (1, height, width))
    label = tf.cast(label, tf.float32)

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    if not self._preserve_aspect_ratio:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      image = tf.image.resize(image, self._output_size, method='bilinear')
      label = tf.image.resize(label, self._output_size, method='nearest')
      label = tf.reshape(label[:, :, -1], [1] + self._output_size)

    return image, label

  def _parse_train_data(self, data):
    """Parses data for training."""
    image, label = self._prepare_image_and_label(data)

    if self._crop_size:

      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      # If output_size is specified, resize image, and label to desired
      # output_size.
      if self._output_size:
        image = tf.image.resize(image, self._output_size, method='bilinear')
        label = tf.image.resize(label, self._output_size, method='nearest')

      image_mask = tf.concat([image, label], axis=2)
      image_mask_crop = tf.image.random_crop(image_mask,
                                             self._crop_size + [4])
      image = image_mask_crop[:, :, :-1]
      label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, _, label = preprocess_ops.random_horizontal_flip(
          image, masks=label)

    train_image_size = self._crop_size if self._crop_size else self._output_size

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        train_image_size,
        train_image_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Pad label and make sure the padded region is assigned to the ignore
    # label. The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)
    label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                 train_image_size, offset)
    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)

    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    image, label = self._prepare_image_and_label(data)
    # The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image, self._output_size, self._output_size)

    if self._resize_eval_groundtruth:
      # Resizes eval masks to match input image sizes. In that case, mean IoU
      # is computed on output_size, not the original size of the images.
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                   self._output_size, offset)
    else:
      label = tf.image.pad_to_bounding_box(
          label, 0, 0, self._groundtruth_padded_size[0],
          self._groundtruth_padded_size[1])

    label -= 1
    label = tf.where(tf.equal(label, -1),
                     self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)

    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels
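The +1 / pad-with-0 / -1 trick in both parse paths deserves a note: padding introduced by resizing and cropping is zero-valued, so shifting labels up by one before padding and back down afterwards leaves padded pixels at -1, which are then rewritten to the ignore label. A hedged toy check (standing in plain `tf.pad` for `resize_and_crop_masks`):

# Toy demonstration of the ignore-label padding trick used above.
import tensorflow as tf

ignore_label = 255
label = tf.constant([[0., 3.], [1., 2.]])      # toy 2x2 class-id mask
shifted = label + 1                            # real ids now live in [1, N+1]
padded = tf.pad(shifted, [[0, 1], [0, 1]])     # stand-in for resize/crop padding (zeros)
restored = padded - 1                          # padded region becomes -1
restored = tf.where(tf.equal(restored, -1),
                    ignore_label * tf.ones_like(restored), restored)
print(restored.numpy())  # original ids preserved; padded pixels == 255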
official/vision/dataloaders/tf_example_decoder.py  (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""TensorFlow Example proto decoder for object detection.

A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""

import tensorflow as tf

from official.vision.dataloaders import decoder


def _generate_source_id(image_bytes):
  # Hashing using 22 bits since float32 has only 23 mantissa bits.
  return tf.strings.as_string(
      tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 22 - 1))


class TfExampleDecoder(decoder.Decoder):
  """TensorFlow Example proto decoder."""

  def __init__(self,
               include_mask=False,
               regenerate_source_id=False,
               mask_binarize_threshold=None):
    self._include_mask = include_mask
    self._regenerate_source_id = regenerate_source_id
    self._keys_to_features = {
        'image/encoded': tf.io.FixedLenFeature((), tf.string),
        'image/height': tf.io.FixedLenFeature((), tf.int64),
        'image/width': tf.io.FixedLenFeature((), tf.int64),
        'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
        'image/object/class/label': tf.io.VarLenFeature(tf.int64),
        'image/object/area': tf.io.VarLenFeature(tf.float32),
        'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
    }
    self._mask_binarize_threshold = mask_binarize_threshold
    if include_mask:
      self._keys_to_features.update({
          'image/object/mask': tf.io.VarLenFeature(tf.string),
      })
    if not regenerate_source_id:
      self._keys_to_features.update({
          'image/source_id': tf.io.FixedLenFeature((), tf.string),
      })

  def _decode_image(self, parsed_tensors):
    """Decodes the image and sets its static shape."""
    image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
    image.set_shape([None, None, 3])
    return image

  def _decode_boxes(self, parsed_tensors):
    """Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    return tf.stack([ymin, xmin, ymax, xmax], axis=-1)

  def _decode_classes(self, parsed_tensors):
    return parsed_tensors['image/object/class/label']

  def _decode_areas(self, parsed_tensors):
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32)
    width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32)
    return tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
        lambda: parsed_tensors['image/object/area'],
        lambda: (xmax - xmin) * (ymax - ymin) * height * width)

  def _decode_masks(self, parsed_tensors):
    """Decode a set of PNG masks to the tf.float32 tensors."""
    def _decode_png_mask(png_bytes):
      mask = tf.squeeze(
          tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
      mask = tf.cast(mask, dtype=tf.float32)
      mask.set_shape([None, None])
      return mask

    height = parsed_tensors['image/height']
    width = parsed_tensors['image/width']
    masks = parsed_tensors['image/object/mask']
    return tf.cond(
        pred=tf.greater(tf.size(input=masks), 0),
        true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
        false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))

  def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
      serialized_example: a single serialized tf.Example string.

    Returns:
      decoded_tensors: a dictionary of tensors with the following fields:
        - source_id: a string scalar tensor.
        - image: a uint8 tensor of shape [None, None, 3].
        - height: an integer scalar tensor.
        - width: an integer scalar tensor.
        - groundtruth_classes: an int64 tensor of shape [None].
        - groundtruth_is_crowd: a bool tensor of shape [None].
        - groundtruth_area: a float32 tensor of shape [None].
        - groundtruth_boxes: a float32 tensor of shape [None, 4].
        - groundtruth_instance_masks: a float32 tensor of shape
            [None, None, None].
        - groundtruth_instance_masks_png: a string tensor of shape [None].
    """
    parsed_tensors = tf.io.parse_single_example(
        serialized=serialized_example, features=self._keys_to_features)
    for k in parsed_tensors:
      if isinstance(parsed_tensors[k], tf.SparseTensor):
        if parsed_tensors[k].dtype == tf.string:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value='')
        else:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value=0)

    if self._regenerate_source_id:
      source_id = _generate_source_id(parsed_tensors['image/encoded'])
    else:
      source_id = tf.cond(
          tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0),
          lambda: parsed_tensors['image/source_id'],
          lambda: _generate_source_id(parsed_tensors['image/encoded']))
    image = self._decode_image(parsed_tensors)
    boxes = self._decode_boxes(parsed_tensors)
    classes = self._decode_classes(parsed_tensors)
    areas = self._decode_areas(parsed_tensors)
    is_crowds = tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
        lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
        lambda: tf.zeros_like(classes, dtype=tf.bool))
    if self._include_mask:
      masks = self._decode_masks(parsed_tensors)
      if self._mask_binarize_threshold is not None:
        masks = tf.cast(masks > self._mask_binarize_threshold, tf.float32)

    decoded_tensors = {
        'source_id': source_id,
        'image': image,
        'height': parsed_tensors['image/height'],
        'width': parsed_tensors['image/width'],
        'groundtruth_classes': classes,
        'groundtruth_is_crowd': is_crowds,
        'groundtruth_area': areas,
        'groundtruth_boxes': boxes,
    }
    if self._include_mask:
      decoded_tensors.update({
          'groundtruth_instance_masks': masks,
          'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
      })
    return decoded_tensors
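One detail worth calling out: when `image/source_id` is missing or empty, the decoder falls back to hashing the encoded image bytes into 2^22 - 1 buckets, which (per the comment in `_generate_source_id`) keeps every id exactly representable if a downstream pipeline casts ids to float32 with its 23 mantissa bits. A hedged round-trip sketch of the decoder, synthesizing a valid PNG with TensorFlow itself:

# Minimal sketch; only the keys required by the decoder's feature spec are set,
# so the VarLen box/class fields decode to empty tensors.
import tensorflow as tf
from official.vision.dataloaders import tf_example_decoder

png_bytes = tf.io.encode_png(tf.zeros([4, 4, 3], tf.uint8)).numpy()
example = tf.train.Example(features=tf.train.Features(feature={
    'image/encoded': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[png_bytes])),
    'image/source_id': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[b'42'])),
    'image/height': tf.train.Feature(int64_list=tf.train.Int64List(value=[4])),
    'image/width': tf.train.Feature(int64_list=tf.train.Int64List(value=[4])),
}))
decoded = tf_example_decoder.TfExampleDecoder().decode(
    tf.constant(example.SerializeToString()))
print(decoded['source_id'].numpy())          # b'42' (non-empty, so no hashing)
print(decoded['groundtruth_boxes'].shape)    # (0, 4): no boxes were provided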
official/vision/dataloaders/tf_example_decoder_test.py  (new file, mode 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for tf_example_decoder.py."""

# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.dataloaders import tf_example_decoder
from official.vision.dataloaders import tfexample_utils


class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      (100, 100, 0, True),
      (100, 100, 1, True),
      (100, 100, 2, True),
      (100, 100, 0, False),
      (100, 100, 1, False),
      (100, 100, 2, False),
  )
  def test_result_shape(self, image_height, image_width, num_instances,
                        regenerate_source_id):
    decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=True, regenerate_source_id=regenerate_source_id)

    serialized_example = tfexample_utils.create_detection_test_example(
        image_height=image_height,
        image_width=image_width,
        image_channel=3,
        num_instances=num_instances).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))

    results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)

    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    if not regenerate_source_id:
      self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
        (num_instances,), results['groundtruth_classes'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_is_crowd'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_area'].shape)
    self.assertAllEqual(
        (num_instances, 4), results['groundtruth_boxes'].shape)
    self.assertAllEqual(
        (num_instances, image_height, image_width),
        results['groundtruth_instance_masks'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_instance_masks_png'].shape)

  def test_result_content(self):
    decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)

    image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
    image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
    image_height = 4
    image_width = 4
    num_instances = 2
    xmins = [0, 0.25]
    xmaxs = [0.5, 1.0]
    ymins = [0, 0]
    ymaxs = [0.5, 1.0]
    labels = [3, 1]
    areas = [
        0.25 * image_height * image_width, 0.75 * image_height * image_width
    ]
    is_crowds = [1, 0]
    mask_content = [[[255, 255, 0, 0],
                     [255, 255, 0, 0],
                     [0, 0, 0, 0],
                     [0, 0, 0, 0]],
                    [[0, 255, 255, 255],
                     [0, 255, 255, 255],
                     [0, 255, 255, 255],
                     [0, 255, 255, 255]]]
    masks = [
        tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
        for m in list(mask_content)
    ]
    serialized_example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'image/encoded': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[image]))),
                'image/source_id': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[tfexample_utils.DUMP_SOURCE_ID]))),
                'image/height': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[image_height]))),
                'image/width': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[image_width]))),
                'image/object/bbox/xmin': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=xmins))),
                'image/object/bbox/xmax': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=xmaxs))),
                'image/object/bbox/ymin': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=ymins))),
                'image/object/bbox/ymax': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=ymaxs))),
                'image/object/class/label': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=labels))),
                'image/object/is_crowd': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=is_crowds))),
                'image/object/area': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=areas))),
                'image/object/mask': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=masks))),
            })).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))
    results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)

    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    self.assertAllEqual(image_content, results['image'])
    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
        (num_instances,), results['groundtruth_classes'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_is_crowd'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_area'].shape)
    self.assertAllEqual(
        (num_instances, 4), results['groundtruth_boxes'].shape)
    self.assertAllEqual(
        (num_instances, image_height, image_width),
        results['groundtruth_instance_masks'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_instance_masks_png'].shape)
    self.assertAllEqual([3, 1], results['groundtruth_classes'])
    self.assertAllEqual([True, False], results['groundtruth_is_crowd'])
    self.assertNDArrayNear(
        [0.25 * image_height * image_width, 0.75 * image_height * image_width],
        results['groundtruth_area'], 1e-4)
    self.assertNDArrayNear(
        [[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
        results['groundtruth_boxes'], 1e-4)
    self.assertNDArrayNear(
        mask_content, results['groundtruth_instance_masks'], 1e-4)
    self.assertAllEqual(
        masks, results['groundtruth_instance_masks_png'])

  def test_handling_missing_fields(self):
    decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)

    image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
    image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
    image_height = 4
    image_width = 4
    num_instances = 2
    xmins = [0, 0.25]
    xmaxs = [0.5, 1.0]
    ymins = [0, 0]
    ymaxs = [0.5, 1.0]
    labels = [3, 1]
    mask_content = [[[255, 255, 0, 0],
                     [255, 255, 0, 0],
                     [0, 0, 0, 0],
                     [0, 0, 0, 0]],
                    [[0, 255, 255, 255],
                     [0, 255, 255, 255],
                     [0, 255, 255, 255],
                     [0, 255, 255, 255]]]
    masks = [
        tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
        for m in list(mask_content)
    ]
    serialized_example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'image/encoded': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[image]))),
                'image/source_id': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[tfexample_utils.DUMP_SOURCE_ID]))),
                'image/height': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[image_height]))),
                'image/width': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[image_width]))),
                'image/object/bbox/xmin': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=xmins))),
                'image/object/bbox/xmax': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=xmaxs))),
                'image/object/bbox/ymin': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=ymins))),
                'image/object/bbox/ymax': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=ymaxs))),
                'image/object/class/label': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=labels))),
                'image/object/mask': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=masks))),
            })).SerializeToString()
    decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example))
    results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)

    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    self.assertAllEqual(image_content, results['image'])
    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
        (num_instances,), results['groundtruth_classes'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_is_crowd'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_area'].shape)
    self.assertAllEqual(
        (num_instances, 4), results['groundtruth_boxes'].shape)
    self.assertAllEqual(
        (num_instances, image_height, image_width),
        results['groundtruth_instance_masks'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_instance_masks_png'].shape)
    self.assertAllEqual([3, 1], results['groundtruth_classes'])
    self.assertAllEqual([False, False], results['groundtruth_is_crowd'])
    self.assertNDArrayNear(
        [0.25 * image_height * image_width, 0.75 * image_height * image_width],
        results['groundtruth_area'], 1e-4)
    self.assertNDArrayNear(
        [[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
        results['groundtruth_boxes'], 1e-4)
    self.assertNDArrayNear(
        mask_content, results['groundtruth_instance_masks'], 1e-4)
    self.assertAllEqual(
        masks, results['groundtruth_instance_masks_png'])


if __name__ == '__main__':
  tf.test.main()
official/vision/dataloaders/tf_example_label_map_decoder.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import
csv
# Import libraries
import
tensorflow
as
tf
from
official.vision.dataloaders
import
tf_example_decoder
class
TfExampleDecoderLabelMap
(
tf_example_decoder
.
TfExampleDecoder
):
"""Tensorflow Example proto decoder."""
def
__init__
(
self
,
label_map
,
include_mask
=
False
,
regenerate_source_id
=
False
,
mask_binarize_threshold
=
None
):
super
(
TfExampleDecoderLabelMap
,
self
).
__init__
(
include_mask
=
include_mask
,
regenerate_source_id
=
regenerate_source_id
,
mask_binarize_threshold
=
mask_binarize_threshold
)
self
.
_keys_to_features
.
update
({
'image/object/class/text'
:
tf
.
io
.
VarLenFeature
(
tf
.
string
),
})
name_to_id
=
self
.
_process_label_map
(
label_map
)
self
.
_name_to_id_table
=
tf
.
lookup
.
StaticHashTable
(
tf
.
lookup
.
KeyValueTensorInitializer
(
keys
=
tf
.
constant
(
list
(
name_to_id
.
keys
()),
dtype
=
tf
.
string
),
values
=
tf
.
constant
(
list
(
name_to_id
.
values
()),
dtype
=
tf
.
int64
)),
default_value
=-
1
)
def
_process_label_map
(
self
,
label_map
):
if
label_map
.
endswith
(
'.csv'
):
name_to_id
=
self
.
_process_csv
(
label_map
)
else
:
raise
ValueError
(
'The label map file is in incorrect format.'
)
return
name_to_id
def
_process_csv
(
self
,
label_map
):
name_to_id
=
{}
with
tf
.
io
.
gfile
.
GFile
(
label_map
,
'r'
)
as
f
:
reader
=
csv
.
reader
(
f
,
delimiter
=
','
)
for
row
in
reader
:
if
len
(
row
)
!=
2
:
raise
ValueError
(
'Each row of the csv label map file must be in '
'`id,name` format. length = {}'
.
format
(
len
(
row
)))
id_index
=
int
(
row
[
0
])
name
=
row
[
1
]
name_to_id
[
name
]
=
id_index
return
name_to_id
def
_decode_classes
(
self
,
parsed_tensors
):
return
self
.
_name_to_id_table
.
lookup
(
parsed_tensors
[
'image/object/class/text'
])
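
For orientation, a minimal usage sketch of `TfExampleDecoderLabelMap` (not part of this commit; the temp-file path and two-row CSV are illustrative assumptions, and the synthetic example comes from the `tfexample_utils` helpers added below):

```python
import tensorflow as tf

from official.vision.dataloaders import tf_example_label_map_decoder
from official.vision.dataloaders import tfexample_utils

# Hypothetical CSV label map with `id,name` rows.
label_map_path = '/tmp/label_map.csv'
with tf.io.gfile.GFile(label_map_path, 'w') as f:
  f.write('0,class_0\n1,class_1')

decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
    label_map_path, include_mask=True)
serialized = tfexample_utils.create_detection_test_example(
    image_height=64, image_width=64, image_channel=3,
    num_instances=2).SerializeToString()
decoded = decoder.decode(tf.convert_to_tensor(serialized))
# Text labels (b'class_1') are looked up in the CSV-backed hash table, so the
# decoded class ids come back as [1, 1]; unknown names map to the default -1.
print(decoded['groundtruth_classes'].numpy())
```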
official/vision/dataloaders/tf_example_label_map_decoder_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tf_example_label_map_decoder.py."""

import os

# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.dataloaders import tf_example_label_map_decoder
from official.vision.dataloaders import tfexample_utils

LABEL_MAP_CSV_CONTENT = '0,class_0\n1,class_1\n2,class_2'


class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      (100, 100, 0),
      (100, 100, 1),
      (100, 100, 2),
      (100, 100, 0),
      (100, 100, 1),
      (100, 100, 2),
  )
  def test_result_shape(self, image_height, image_width, num_instances):
    label_map_dir = self.get_temp_dir()
    label_map_name = 'label_map.csv'
    label_map_path = os.path.join(label_map_dir, label_map_name)
    with open(label_map_path, 'w') as f:
      f.write(LABEL_MAP_CSV_CONTENT)

    decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
        label_map_path, include_mask=True)
    serialized_example = tfexample_utils.create_detection_test_example(
        image_height=image_height,
        image_width=image_width,
        image_channel=3,
        num_instances=num_instances).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))
    results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)

    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
        (num_instances,), results['groundtruth_classes'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_is_crowd'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_area'].shape)
    self.assertAllEqual(
        (num_instances, 4), results['groundtruth_boxes'].shape)
    self.assertAllEqual(
        (num_instances, image_height, image_width),
        results['groundtruth_instance_masks'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_instance_masks_png'].shape)

  def test_result_content(self):
    label_map_dir = self.get_temp_dir()
    label_map_name = 'label_map.csv'
    label_map_path = os.path.join(label_map_dir, label_map_name)
    with open(label_map_path, 'w') as f:
      f.write(LABEL_MAP_CSV_CONTENT)

    decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
        label_map_path, include_mask=True)
    image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
    image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
    image_height = 4
    image_width = 4
    num_instances = 2
    xmins = [0, 0.25]
    xmaxs = [0.5, 1.0]
    ymins = [0, 0]
    ymaxs = [0.5, 1.0]
    labels = [b'class_2', b'class_0']
    areas = [
        0.25 * image_height * image_width, 0.75 * image_height * image_width
    ]
    is_crowds = [1, 0]
    mask_content = [[[255, 255, 0, 0], [255, 255, 0, 0], [0, 0, 0, 0],
                     [0, 0, 0, 0]],
                    [[0, 255, 255, 255], [0, 255, 255, 255],
                     [0, 255, 255, 255], [0, 255, 255, 255]]]
    masks = [
        tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
        for m in list(mask_content)
    ]
    serialized_example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'image/encoded': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[image]))),
                'image/source_id': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[tfexample_utils.DUMP_SOURCE_ID]))),
                'image/height': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[image_height]))),
                'image/width': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[image_width]))),
                'image/object/bbox/xmin': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=xmins))),
                'image/object/bbox/xmax': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=xmaxs))),
                'image/object/bbox/ymin': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=ymins))),
                'image/object/bbox/ymax': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=ymaxs))),
                'image/object/class/text': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=labels))),
                'image/object/is_crowd': (tf.train.Feature(
                    int64_list=tf.train.Int64List(value=is_crowds))),
                'image/object/area': (tf.train.Feature(
                    float_list=tf.train.FloatList(value=areas))),
                'image/object/mask': (tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=masks))),
            })).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))
    results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)

    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    self.assertAllEqual(image_content, results['image'])
    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
        (num_instances,), results['groundtruth_classes'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_is_crowd'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_area'].shape)
    self.assertAllEqual(
        (num_instances, 4), results['groundtruth_boxes'].shape)
    self.assertAllEqual(
        (num_instances, image_height, image_width),
        results['groundtruth_instance_masks'].shape)
    self.assertAllEqual(
        (num_instances,), results['groundtruth_instance_masks_png'].shape)
    self.assertAllEqual([2, 0], results['groundtruth_classes'])
    self.assertAllEqual([True, False], results['groundtruth_is_crowd'])
    self.assertNDArrayNear(
        [0.25 * image_height * image_width, 0.75 * image_height * image_width],
        results['groundtruth_area'], 1e-4)
    self.assertNDArrayNear(
        [[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
        results['groundtruth_boxes'], 1e-4)
    self.assertNDArrayNear(
        mask_content, results['groundtruth_instance_masks'], 1e-4)
    self.assertAllEqual(masks, results['groundtruth_instance_masks_png'])


if __name__ == '__main__':
  tf.test.main()
official/vision/dataloaders/tfds_classification_decoders.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS Classification decoders."""

import tensorflow as tf

from official.vision.dataloaders import decoder


class ClassificationDecorder(decoder.Decoder):
  """A tf.Example decoder for tfds classification datasets."""

  def decode(self, serialized_example):
    sample_dict = {
        'image/encoded':
            tf.io.encode_jpeg(serialized_example['image'], quality=100),
        'image/class/label':
            serialized_example['label'],
    }
    return sample_dict


TFDS_ID_TO_DECODER_MAP = {
    'cifar10': ClassificationDecorder,
    'cifar100': ClassificationDecorder,
    'imagenet2012': ClassificationDecorder,
}
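
A quick sketch of what this decoder does to one TFDS sample (the feature dict below is a hand-built stand-in for an element of `tfds.load('cifar10', ...)`, not real data):

```python
import tensorflow as tf

from official.vision.dataloaders import tfds_classification_decoders

decoder = tfds_classification_decoders.TFDS_ID_TO_DECODER_MAP['cifar10']()
fake_tfds_sample = {
    'image': tf.zeros([32, 32, 3], dtype=tf.uint8),
    'label': tf.constant(7, dtype=tf.int64),
}
sample = decoder.decode(fake_tfds_sample)
# The raw image is re-encoded to JPEG bytes and the keys are renamed to the
# tf.Example-style names the downstream classification parser expects.
print(sorted(sample.keys()))  # ['image/class/label', 'image/encoded']
```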
official/vision/dataloaders/tfds_detection_decoders.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS detection decoders."""

import tensorflow as tf

from official.vision.dataloaders import decoder


class MSCOCODecoder(decoder.Decoder):
  """A tf.Example decoder for tfds coco datasets."""

  def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
      serialized_example: a dictionary example produced by tfds.

    Returns:
      decoded_tensors: a dictionary of tensors with the following fields:
        - source_id: a string scalar tensor.
        - image: a uint8 tensor of shape [None, None, 3].
        - height: an integer scalar tensor.
        - width: an integer scalar tensor.
        - groundtruth_classes: an int64 tensor of shape [None].
        - groundtruth_is_crowd: a bool tensor of shape [None].
        - groundtruth_area: a float32 tensor of shape [None].
        - groundtruth_boxes: a float32 tensor of shape [None, 4].
    """
    decoded_tensors = {
        'source_id': tf.strings.as_string(serialized_example['image/id']),
        'image': serialized_example['image'],
        'height': tf.cast(tf.shape(serialized_example['image'])[0], tf.int64),
        'width': tf.cast(tf.shape(serialized_example['image'])[1], tf.int64),
        'groundtruth_classes': serialized_example['objects']['label'],
        'groundtruth_is_crowd': serialized_example['objects']['is_crowd'],
        'groundtruth_area': tf.cast(
            serialized_example['objects']['area'], tf.float32),
        'groundtruth_boxes': serialized_example['objects']['bbox'],
    }
    return decoded_tensors


TFDS_ID_TO_DECODER_MAP = {
    'coco/2017': MSCOCODecoder,
    'coco/2014': MSCOCODecoder,
    'coco': MSCOCODecoder
}
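
A sketch of the COCO decoder on a hand-built sample (the dict layout mirrors what this decoder assumes about the TFDS `coco` FeatureDict; the values are fake):

```python
import tensorflow as tf

from official.vision.dataloaders import tfds_detection_decoders

decoder = tfds_detection_decoders.MSCOCODecoder()
fake_tfds_sample = {
    'image': tf.zeros([480, 640, 3], dtype=tf.uint8),
    'image/id': tf.constant(42, dtype=tf.int64),
    'objects': {
        'label': tf.constant([1, 17], dtype=tf.int64),
        'is_crowd': tf.constant([False, False]),
        'area': tf.constant([1200.0, 350.5]),
        'bbox': tf.constant([[0.1, 0.1, 0.5, 0.5], [0.2, 0.3, 0.9, 0.8]]),
    },
}
decoded = decoder.decode(fake_tfds_sample)
print(decoded['source_id'].numpy())                         # b'42'
print(decoded['height'].numpy(), decoded['width'].numpy())  # 480 640
```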
official/vision/dataloaders/tfds_factory.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS factory functions."""

from official.vision.dataloaders import decoder as base_decoder
from official.vision.dataloaders import tfds_detection_decoders
from official.vision.dataloaders import tfds_segmentation_decoders
from official.vision.dataloaders import tfds_classification_decoders


def get_classification_decoder(tfds_name: str) -> base_decoder.Decoder:
  """Gets classification decoder.

  Args:
    tfds_name: `str`, name of the tfds classification decoder.

  Returns:
    `base_decoder.Decoder` instance.

  Raises:
    ValueError if the tfds_name doesn't exist in the available decoders.
  """
  if tfds_name in tfds_classification_decoders.TFDS_ID_TO_DECODER_MAP:
    decoder = tfds_classification_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]()
  else:
    raise ValueError(f'TFDS Classification {tfds_name} is not supported')
  return decoder


def get_detection_decoder(tfds_name: str) -> base_decoder.Decoder:
  """Gets detection decoder.

  Args:
    tfds_name: `str`, name of the tfds detection decoder.

  Returns:
    `base_decoder.Decoder` instance.

  Raises:
    ValueError if the tfds_name doesn't exist in the available decoders.
  """
  if tfds_name in tfds_detection_decoders.TFDS_ID_TO_DECODER_MAP:
    decoder = tfds_detection_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]()
  else:
    raise ValueError(f'TFDS Detection {tfds_name} is not supported')
  return decoder


def get_segmentation_decoder(tfds_name: str) -> base_decoder.Decoder:
  """Gets segmentation decoder.

  Args:
    tfds_name: `str`, name of the tfds segmentation decoder.

  Returns:
    `base_decoder.Decoder` instance.

  Raises:
    ValueError if the tfds_name doesn't exist in the available decoders.
  """
  if tfds_name in tfds_segmentation_decoders.TFDS_ID_TO_DECODER_MAP:
    decoder = tfds_segmentation_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]()
  else:
    raise ValueError(f'TFDS Segmentation {tfds_name} is not supported')
  return decoder
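
Each getter is a thin, guarded lookup into the per-task `TFDS_ID_TO_DECODER_MAP` dicts above; a short usage sketch (dataset names taken from those maps):

```python
from official.vision.dataloaders import tfds_factory

detection_decoder = tfds_factory.get_detection_decoder('coco/2017')
classification_decoder = tfds_factory.get_classification_decoder('cifar10')
try:
  tfds_factory.get_segmentation_decoder('imagenet')  # not in the map
except ValueError as e:
  print(e)  # TFDS Segmentation imagenet is not supported
```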
official/vision/dataloaders/tfds_factory_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tfds factory functions."""

from absl.testing import parameterized
import tensorflow as tf

from official.vision.dataloaders import decoder as base_decoder
from official.vision.dataloaders import tfds_factory


class TFDSFactoryTest(tf.test.TestCase, parameterized.TestCase):

  def _create_test_example(self):
    serialized_example = {
        'image': tf.ones(shape=(100, 100, 3), dtype=tf.uint8),
        'label': 1,
        'image/id': 0,
        'objects': {
            'label': 1,
            'is_crowd': 0,
            'area': 0.5,
            'bbox': [0.1, 0.2, 0.3, 0.4]
        },
        'segmentation_label': tf.ones((100, 100, 1), dtype=tf.uint8),
        'image_left': tf.ones(shape=(100, 100, 3), dtype=tf.uint8)
    }
    return serialized_example

  @parameterized.parameters(
      ('imagenet2012'),
      ('cifar10'),
      ('cifar100'),
  )
  def test_classification_decoder(self, tfds_name):
    decoder = tfds_factory.get_classification_decoder(tfds_name)
    self.assertIsInstance(decoder, base_decoder.Decoder)
    decoded_tensor = decoder.decode(self._create_test_example())
    self.assertLen(decoded_tensor, 2)
    self.assertIn('image/encoded', decoded_tensor)
    self.assertIn('image/class/label', decoded_tensor)

  @parameterized.parameters(
      ('flowers'),
      ('coco'),
  )
  def test_doesnt_exit_classification_decoder(self, tfds_name):
    with self.assertRaises(ValueError):
      _ = tfds_factory.get_classification_decoder(tfds_name)

  @parameterized.parameters(
      ('coco'),
      ('coco/2014'),
      ('coco/2017'),
  )
  def test_detection_decoder(self, tfds_name):
    decoder = tfds_factory.get_detection_decoder(tfds_name)
    self.assertIsInstance(decoder, base_decoder.Decoder)
    decoded_tensor = decoder.decode(self._create_test_example())
    self.assertLen(decoded_tensor, 8)
    self.assertIn('image', decoded_tensor)
    self.assertIn('source_id', decoded_tensor)
    self.assertIn('height', decoded_tensor)
    self.assertIn('width', decoded_tensor)
    self.assertIn('groundtruth_classes', decoded_tensor)
    self.assertIn('groundtruth_is_crowd', decoded_tensor)
    self.assertIn('groundtruth_area', decoded_tensor)
    self.assertIn('groundtruth_boxes', decoded_tensor)

  @parameterized.parameters(
      ('pascal'),
      ('cityscapes'),
  )
  def test_doesnt_exit_detection_decoder(self, tfds_name):
    with self.assertRaises(ValueError):
      _ = tfds_factory.get_detection_decoder(tfds_name)

  @parameterized.parameters(
      ('cityscapes'),
      ('cityscapes/semantic_segmentation'),
      ('cityscapes/semantic_segmentation_extra'),
  )
  def test_segmentation_decoder(self, tfds_name):
    decoder = tfds_factory.get_segmentation_decoder(tfds_name)
    self.assertIsInstance(decoder, base_decoder.Decoder)
    decoded_tensor = decoder.decode(self._create_test_example())
    self.assertLen(decoded_tensor, 4)
    self.assertIn('image/encoded', decoded_tensor)
    self.assertIn('image/segmentation/class/encoded', decoded_tensor)
    self.assertIn('image/height', decoded_tensor)
    self.assertIn('image/width', decoded_tensor)

  @parameterized.parameters(
      ('coco'),
      ('imagenet'),
  )
  def test_doesnt_exit_segmentation_decoder(self, tfds_name):
    with self.assertRaises(ValueError):
      _ = tfds_factory.get_segmentation_decoder(tfds_name)


if __name__ == '__main__':
  tf.test.main()
official/vision/dataloaders/tfds_segmentation_decoders.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFDS Semantic Segmentation decoders."""

import tensorflow as tf

from official.vision.dataloaders import decoder


class CityScapesDecorder(decoder.Decoder):
  """A tf.Example decoder for tfds cityscapes datasets."""

  def __init__(self):
    # Original labels to trainable labels map, 255 is the ignore class.
    self._label_map = {
        -1: 255,
        0: 255,
        1: 255,
        2: 255,
        3: 255,
        4: 255,
        5: 255,
        6: 255,
        7: 0,
        8: 1,
        9: 255,
        10: 255,
        11: 2,
        12: 3,
        13: 4,
        14: 255,
        15: 255,
        16: 255,
        17: 5,
        18: 255,
        19: 6,
        20: 7,
        21: 8,
        22: 9,
        23: 10,
        24: 11,
        25: 12,
        26: 13,
        27: 14,
        28: 15,
        29: 255,
        30: 255,
        31: 16,
        32: 17,
        33: 18,
    }

  def decode(self, serialized_example):
    # Convert labels according to self._label_map.
    label = serialized_example['segmentation_label']
    for original_label in self._label_map:
      label = tf.where(label == original_label,
                       self._label_map[original_label] * tf.ones_like(label),
                       label)
    sample_dict = {
        'image/encoded':
            tf.io.encode_jpeg(serialized_example['image_left'], quality=100),
        'image/height': serialized_example['image_left'].shape[0],
        'image/width': serialized_example['image_left'].shape[1],
        'image/segmentation/class/encoded': tf.io.encode_png(label),
    }
    return sample_dict


TFDS_ID_TO_DECODER_MAP = {
    'cityscapes': CityScapesDecorder,
    'cityscapes/semantic_segmentation': CityScapesDecorder,
    'cityscapes/semantic_segmentation_extra': CityScapesDecorder,
}
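
The `tf.where` remapping loop above runs one pass per map entry, in dict insertion order; because all ids that map to 255 are rewritten before the ids that map into the train-id range, later passes never clobber earlier outputs. The same pattern in isolation, with a tiny hand-made map for illustration:

```python
import tensorflow as tf

# A tiny stand-in for self._label_map; 255 is the ignore id.
label_map = {0: 255, 7: 0, 8: 1}
label = tf.constant([[0, 7], [8, 7]], dtype=tf.int32)
for original_label in label_map:
  label = tf.where(label == original_label,
                   label_map[original_label] * tf.ones_like(label),
                   label)
print(label.numpy())  # [[255   0]
                      #  [  1   0]]
```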
official/vision/dataloaders/tfexample_utils.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utility functions to create tf.Example and tf.SequnceExample for test.
Example:video classification end-to-end test
i.e. from reading input file to train and eval.
```python
class FooTrainTest(tf.test.TestCase):
def setUp(self):
super(TrainTest, self).setUp()
# Write the fake tf.train.SequenceExample to file for test.
data_dir = os.path.join(self.get_temp_dir(), 'data')
tf.io.gfile.makedirs(data_dir)
self._data_path = os.path.join(data_dir, 'data.tfrecord')
examples = [
tfexample_utils.make_video_test_example(
image_shape=(36, 36, 3),
audio_shape=(20, 128),
label=random.randint(0, 100)) for _ in range(2)
]
tfexample_utils.dump_to_tfrecord(self._data_path, tf_examples=examples)
def test_foo(self):
dataset = tf.data.TFRecordDataset(self._data_path)
...
```
"""
import
io
from
typing
import
Sequence
,
Union
import
numpy
as
np
from
PIL
import
Image
import
tensorflow
as
tf
IMAGE_KEY
=
'image/encoded'
CLASSIFICATION_LABEL_KEY
=
'image/class/label'
DISTILATION_LABEL_KEY
=
'image/class/soft_labels'
LABEL_KEY
=
'clip/label/index'
AUDIO_KEY
=
'features/audio'
DUMP_SOURCE_ID
=
b
'123'
def
encode_image
(
image_array
:
np
.
array
,
fmt
:
str
)
->
bytes
:
image
=
Image
.
fromarray
(
image_array
)
with
io
.
BytesIO
()
as
output
:
image
.
save
(
output
,
format
=
fmt
)
return
output
.
getvalue
()
def
make_image_bytes
(
shape
:
Sequence
[
int
],
fmt
:
str
=
'JPEG'
)
->
bytes
:
"""Generates image and return bytes in specified format."""
random_image
=
np
.
random
.
randint
(
0
,
256
,
size
=
shape
,
dtype
=
np
.
uint8
)
return
encode_image
(
random_image
,
fmt
=
fmt
)
def
put_int64_to_context
(
seq_example
:
tf
.
train
.
SequenceExample
,
label
:
int
=
0
,
key
:
str
=
LABEL_KEY
):
"""Puts int64 to SequenceExample context with key."""
seq_example
.
context
.
feature
[
key
].
int64_list
.
value
[:]
=
[
label
]
def
put_bytes_list_to_feature
(
seq_example
:
tf
.
train
.
SequenceExample
,
raw_image_bytes
:
bytes
,
key
:
str
=
IMAGE_KEY
,
repeat_num
:
int
=
2
):
"""Puts bytes list to SequenceExample context with key."""
for
_
in
range
(
repeat_num
):
seq_example
.
feature_lists
.
feature_list
.
get_or_create
(
key
).
feature
.
add
().
bytes_list
.
value
[:]
=
[
raw_image_bytes
]
def
put_float_list_to_feature
(
seq_example
:
tf
.
train
.
SequenceExample
,
value
:
Sequence
[
Sequence
[
float
]],
key
:
str
):
"""Puts float list to SequenceExample context with key."""
for
s
in
value
:
seq_example
.
feature_lists
.
feature_list
.
get_or_create
(
key
).
feature
.
add
().
float_list
.
value
[:]
=
s
def
make_video_test_example
(
image_shape
:
Sequence
[
int
]
=
(
263
,
320
,
3
),
audio_shape
:
Sequence
[
int
]
=
(
10
,
256
),
label
:
int
=
42
):
"""Generates data for testing video models (inc. RGB, audio, & label)."""
raw_image_bytes
=
make_image_bytes
(
shape
=
image_shape
)
random_audio
=
np
.
random
.
normal
(
size
=
audio_shape
).
tolist
()
seq_example
=
tf
.
train
.
SequenceExample
()
put_int64_to_context
(
seq_example
,
label
=
label
,
key
=
LABEL_KEY
)
put_bytes_list_to_feature
(
seq_example
,
raw_image_bytes
,
key
=
IMAGE_KEY
,
repeat_num
=
4
)
put_float_list_to_feature
(
seq_example
,
value
=
random_audio
,
key
=
AUDIO_KEY
)
return
seq_example
def
dump_to_tfrecord
(
record_file
:
str
,
tf_examples
:
Sequence
[
Union
[
tf
.
train
.
Example
,
tf
.
train
.
SequenceExample
]]):
"""Writes serialized Example to TFRecord file with path."""
with
tf
.
io
.
TFRecordWriter
(
record_file
)
as
writer
:
for
tf_example
in
tf_examples
:
writer
.
write
(
tf_example
.
SerializeToString
())
def
_encode_image
(
image_array
:
np
.
ndarray
,
fmt
:
str
)
->
bytes
:
"""Util function to encode an image."""
image
=
Image
.
fromarray
(
image_array
)
with
io
.
BytesIO
()
as
output
:
image
.
save
(
output
,
format
=
fmt
)
return
output
.
getvalue
()
def
create_classification_example
(
image_height
:
int
,
image_width
:
int
,
image_format
:
str
=
'JPEG'
,
is_multilabel
:
bool
=
False
)
->
tf
.
train
.
Example
:
"""Creates image and labels for image classification input pipeline."""
image
=
_encode_image
(
np
.
uint8
(
np
.
random
.
rand
(
image_height
,
image_width
,
3
)
*
255
),
fmt
=
image_format
)
labels
=
[
0
,
1
]
if
is_multilabel
else
[
0
]
serialized_example
=
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
{
IMAGE_KEY
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
image
]))),
CLASSIFICATION_LABEL_KEY
:
(
tf
.
train
.
Feature
(
int64_list
=
tf
.
train
.
Int64List
(
value
=
labels
))),
})).
SerializeToString
()
return
serialized_example
def
create_distillation_example
(
image_height
:
int
,
image_width
:
int
,
num_labels
:
int
,
image_format
:
str
=
'JPEG'
)
->
tf
.
train
.
Example
:
"""Creates image and labels for image classification with distillation."""
image
=
_encode_image
(
np
.
uint8
(
np
.
random
.
rand
(
image_height
,
image_width
,
3
)
*
255
),
fmt
=
image_format
)
soft_labels
=
[
0.6
]
*
num_labels
serialized_example
=
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
{
IMAGE_KEY
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
image
]))),
DISTILATION_LABEL_KEY
:
(
tf
.
train
.
Feature
(
float_list
=
tf
.
train
.
FloatList
(
value
=
soft_labels
))),
})).
SerializeToString
()
return
serialized_example
def
create_3d_image_test_example
(
image_height
:
int
,
image_width
:
int
,
image_volume
:
int
,
image_channel
:
int
)
->
tf
.
train
.
Example
:
"""Creates 3D image and label."""
images
=
np
.
random
.
rand
(
image_height
,
image_width
,
image_volume
,
image_channel
)
images
=
images
.
astype
(
np
.
float32
)
labels
=
np
.
random
.
randint
(
low
=
2
,
size
=
(
image_height
,
image_width
,
image_volume
,
image_channel
))
labels
=
labels
.
astype
(
np
.
float32
)
feature
=
{
IMAGE_KEY
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
images
.
tobytes
()]))),
CLASSIFICATION_LABEL_KEY
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
labels
.
tobytes
()])))
}
return
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
feature
))
def
create_detection_test_example
(
image_height
:
int
,
image_width
:
int
,
image_channel
:
int
,
num_instances
:
int
)
->
tf
.
train
.
Example
:
"""Creates and returns a test example containing box and mask annotations.
Args:
image_height: The height of test image.
image_width: The width of test image.
image_channel: The channel of test image.
num_instances: The number of object instances per image.
Returns:
A tf.train.Example for testing.
"""
image
=
make_image_bytes
([
image_height
,
image_width
,
image_channel
])
if
num_instances
==
0
:
xmins
=
[]
xmaxs
=
[]
ymins
=
[]
ymaxs
=
[]
labels
=
[]
areas
=
[]
is_crowds
=
[]
masks
=
[]
labels_text
=
[]
else
:
xmins
=
list
(
np
.
random
.
rand
(
num_instances
))
xmaxs
=
list
(
np
.
random
.
rand
(
num_instances
))
ymins
=
list
(
np
.
random
.
rand
(
num_instances
))
ymaxs
=
list
(
np
.
random
.
rand
(
num_instances
))
labels_text
=
[
b
'class_1'
]
*
num_instances
labels
=
list
(
np
.
random
.
randint
(
100
,
size
=
num_instances
))
areas
=
[(
xmax
-
xmin
)
*
(
ymax
-
ymin
)
*
image_height
*
image_width
for
xmin
,
xmax
,
ymin
,
ymax
in
zip
(
xmins
,
xmaxs
,
ymins
,
ymaxs
)]
is_crowds
=
[
0
]
*
num_instances
masks
=
[]
for
_
in
range
(
num_instances
):
mask
=
make_image_bytes
([
image_height
,
image_width
],
fmt
=
'PNG'
)
masks
.
append
(
mask
)
return
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
{
'image/encoded'
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
image
]))),
'image/source_id'
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
DUMP_SOURCE_ID
]))),
'image/height'
:
(
tf
.
train
.
Feature
(
int64_list
=
tf
.
train
.
Int64List
(
value
=
[
image_height
]))),
'image/width'
:
(
tf
.
train
.
Feature
(
int64_list
=
tf
.
train
.
Int64List
(
value
=
[
image_width
]))),
'image/object/bbox/xmin'
:
(
tf
.
train
.
Feature
(
float_list
=
tf
.
train
.
FloatList
(
value
=
xmins
))),
'image/object/bbox/xmax'
:
(
tf
.
train
.
Feature
(
float_list
=
tf
.
train
.
FloatList
(
value
=
xmaxs
))),
'image/object/bbox/ymin'
:
(
tf
.
train
.
Feature
(
float_list
=
tf
.
train
.
FloatList
(
value
=
ymins
))),
'image/object/bbox/ymax'
:
(
tf
.
train
.
Feature
(
float_list
=
tf
.
train
.
FloatList
(
value
=
ymaxs
))),
'image/object/class/label'
:
(
tf
.
train
.
Feature
(
int64_list
=
tf
.
train
.
Int64List
(
value
=
labels
))),
'image/object/class/text'
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
labels_text
))),
'image/object/is_crowd'
:
(
tf
.
train
.
Feature
(
int64_list
=
tf
.
train
.
Int64List
(
value
=
is_crowds
))),
'image/object/area'
:
(
tf
.
train
.
Feature
(
float_list
=
tf
.
train
.
FloatList
(
value
=
areas
))),
'image/object/mask'
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
masks
))),
}))
def
create_segmentation_test_example
(
image_height
:
int
,
image_width
:
int
,
image_channel
:
int
)
->
tf
.
train
.
Example
:
"""Creates and returns a test example containing mask annotations.
Args:
image_height: The height of test image.
image_width: The width of test image.
image_channel: The channel of test image.
Returns:
A tf.train.Example for testing.
"""
image
=
make_image_bytes
([
image_height
,
image_width
,
image_channel
])
mask
=
make_image_bytes
([
image_height
,
image_width
],
fmt
=
'PNG'
)
return
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
{
'image/encoded'
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
image
]))),
'image/segmentation/class/encoded'
:
(
tf
.
train
.
Feature
(
bytes_list
=
tf
.
train
.
BytesList
(
value
=
[
mask
]))),
'image/height'
:
(
tf
.
train
.
Feature
(
int64_list
=
tf
.
train
.
Int64List
(
value
=
[
image_height
]))),
'image/width'
:
(
tf
.
train
.
Feature
(
int64_list
=
tf
.
train
.
Int64List
(
value
=
[
image_width
])))
}))
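
End-to-end, these helpers pair with `dump_to_tfrecord` the same way the dataloader tests seed their inputs; a minimal sketch (the TFRecord path is hypothetical):

```python
import tensorflow as tf

from official.vision.dataloaders import tfexample_utils

examples = [
    tfexample_utils.create_detection_test_example(
        image_height=64, image_width=64, image_channel=3, num_instances=3)
    for _ in range(2)
]
tfexample_utils.dump_to_tfrecord('/tmp/test.tfrecord', tf_examples=examples)

dataset = tf.data.TFRecordDataset('/tmp/test.tfrecord')
for record in dataset.take(1):
  parsed = tf.train.Example.FromString(record.numpy())
  print(parsed.features.feature['image/height'].int64_list.value)  # [64]
```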
official/vision/dataloaders/utils.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data loader utils."""

from typing import Dict
# Import libraries
import tensorflow as tf

from official.vision.ops import preprocess_ops


def process_source_id(source_id: tf.Tensor) -> tf.Tensor:
  """Processes source_id to the right format.

  Args:
    source_id: A `tf.Tensor` that contains the source ID. It can be empty.

  Returns:
    A formatted source ID.
  """
  if source_id.dtype == tf.string:
    source_id = tf.strings.to_number(source_id, tf.int64)
  with tf.control_dependencies([source_id]):
    source_id = tf.cond(
        pred=tf.equal(tf.size(input=source_id), 0),
        true_fn=lambda: tf.cast(tf.constant(-1), tf.int64),
        false_fn=lambda: tf.identity(source_id))
  return source_id


def pad_groundtruths_to_fixed_size(groundtruths: Dict[str, tf.Tensor],
                                   size: int) -> Dict[str, tf.Tensor]:
  """Pads the first dimension of groundtruths labels to the fixed size.

  Args:
    groundtruths: A dictionary of {`str`: `tf.Tensor`} that contains
      groundtruth annotations of `boxes`, `is_crowds`, `areas` and `classes`.
    size: An `int` that specifies the expected size of the first dimension of
      padded tensors.

  Returns:
    A dictionary of the same keys as input and padded tensors as values.
  """
  groundtruths['boxes'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['boxes'], size, -1)
  groundtruths['is_crowds'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['is_crowds'], size, 0)
  groundtruths['areas'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['areas'], size, -1)
  groundtruths['classes'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['classes'], size, -1)
  if 'attributes' in groundtruths:
    for k, v in groundtruths['attributes'].items():
      groundtruths['attributes'][k] = preprocess_ops.clip_or_pad_to_fixed_size(
          v, size, -1)
  return groundtruths
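
A short sketch of the padding contract (the values are made up): tensors are clipped or padded along the first dimension to `size`, with `-1` fill for boxes, areas, and classes and `0` for `is_crowds`, so downstream batching sees static shapes.

```python
import tensorflow as tf

from official.vision.dataloaders import utils

groundtruths = {
    'boxes': tf.constant([[0.1, 0.1, 0.5, 0.5]]),
    'is_crowds': tf.constant([[0]]),
    'areas': tf.constant([[0.16]]),
    'classes': tf.constant([[3]]),
}
padded = utils.pad_groundtruths_to_fixed_size(groundtruths, size=4)
print(padded['boxes'].shape)    # (4, 4)
print(padded['classes'].shape)  # (4, 1)
```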
official/vision/dataloaders/utils_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for dataloader utils functions."""

# Import libraries
from absl.testing import parameterized
import tensorflow as tf

from official.vision.dataloaders import utils


class UtilsTest(tf.test.TestCase, parameterized.TestCase):

  def test_process_empty_source_id(self):
    source_id = tf.constant([], dtype=tf.int64)
    source_id = tf.strings.as_string(source_id)
    self.assertEqual(-1, utils.process_source_id(source_id=source_id))

  @parameterized.parameters(
      ([128, 256], [128, 256]),
      ([128, 32, 16], [128, 32, 16]),
  )
  def test_process_source_id(self, source_id, expected_result):
    source_id = tf.constant(source_id, dtype=tf.int64)
    source_id = tf.strings.as_string(source_id)
    self.assertSequenceAlmostEqual(
        expected_result, utils.process_source_id(source_id=source_id))

  @parameterized.parameters(
      ([[10, 20, 30, 40]], [[100]], [[0]], 10, None),
      ([[0.1, 0.2, 0.5, 0.6]], [[0.5]], [[1]], 2, [[1.0, 2.0]]),
  )
  def test_pad_groundtruths_to_fixed_size(self, boxes, area, classes, size,
                                          attributes):
    groundtruths = {}
    groundtruths['boxes'] = tf.constant(boxes)
    groundtruths['is_crowds'] = tf.constant([[0]])
    groundtruths['areas'] = tf.constant(area)
    groundtruths['classes'] = tf.constant(classes)
    if attributes:
      groundtruths['attributes'] = {'depth': tf.constant(attributes)}

    actual_result = utils.pad_groundtruths_to_fixed_size(
        groundtruths=groundtruths, size=size)

    # Check that the first dimension is padded to the expected size.
    for key in actual_result:
      if key == 'attributes':
        for _, v in actual_result[key].items():
          pad_shape = v.shape[0]
          self.assertEqual(size, pad_shape)
      else:
        pad_shape = actual_result[key].shape[0]
        self.assertEqual(size, pad_shape)


if __name__ == '__main__':
  tf.test.main()
official/vision/dataloaders/video_input.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Parser for video and label datasets."""

from typing import Dict, Optional, Tuple, Union

from absl import logging
import tensorflow as tf

from official.vision.configs import video_classification as exp_cfg
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import augment
from official.vision.ops import preprocess_ops_3d

IMAGE_KEY = 'image/encoded'
LABEL_KEY = 'clip/label/index'


def process_image(image: tf.Tensor,
                  is_training: bool = True,
                  num_frames: int = 32,
                  stride: int = 1,
                  random_stride_range: int = 0,
                  num_test_clips: int = 1,
                  min_resize: int = 256,
                  crop_size: int = 224,
                  num_crops: int = 1,
                  zero_centering_image: bool = False,
                  min_aspect_ratio: float = 0.5,
                  max_aspect_ratio: float = 2,
                  min_area_ratio: float = 0.49,
                  max_area_ratio: float = 1.0,
                  augmenter: Optional[augment.ImageAugment] = None,
                  seed: Optional[int] = None) -> tf.Tensor:
  """Processes a serialized image tensor.

  Args:
    image: Input Tensor of shape [timesteps] and type tf.string of serialized
      frames.
    is_training: Whether or not in training mode. If True, random sample, crop
      and left right flip is used.
    num_frames: Number of frames per subclip.
    stride: Temporal stride to sample frames.
    random_stride_range: An int indicating the min and max bounds to uniformly
      sample different strides from the video. E.g., a value of 1 with stride=2
      will uniformly sample a stride in {1, 2, 3} for each video in a batch.
      Only used during training, for the purposes of frame-rate augmentation.
      Defaults to 0, which disables random sampling.
    num_test_clips: Number of test clips (1 by default). If more than 1, this
      will sample multiple linearly spaced clips within each video at test
      time. If 1, then a single clip in the middle of the video is sampled.
      The clips are aggregated in the batch dimension.
    min_resize: Frames are resized so that min(height, width) is min_resize.
    crop_size: Final size of the frame after cropping the resized frames. Both
      height and width are the same.
    num_crops: Number of crops to perform on the resized frames.
    zero_centering_image: If True, frames are normalized to values in [-1, 1].
      If False, values in [0, 1].
    min_aspect_ratio: The minimum aspect range for cropping.
    max_aspect_ratio: The maximum aspect range for cropping.
    min_area_ratio: The minimum area range for cropping.
    max_area_ratio: The maximum area range for cropping.
    augmenter: Image augmenter to distort each image.
    seed: A deterministic seed to use when sampling.

  Returns:
    Processed frames. Tensor of shape
      [num_frames * num_test_clips, crop_size, crop_size, 3].
  """
  # Validate parameters.
  if is_training and num_test_clips != 1:
    logging.warning(
        '`num_test_clips` %d is ignored since `is_training` is `True`.',
        num_test_clips)
  if random_stride_range < 0:
    raise ValueError('Random stride range should be >= 0, got {}'.format(
        random_stride_range))

  # Temporal sampler.
  if is_training:
    if random_stride_range > 0:
      # Uniformly sample different frame-rates.
      stride = tf.random.uniform(
          [],
          tf.maximum(stride - random_stride_range, 1),
          stride + random_stride_range,
          dtype=tf.int32)

    # Sample random clip.
    image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride,
                                              seed)
  elif num_test_clips > 1:
    # Sample linspace clips.
    image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips,
                                                       num_frames, stride)
  else:
    # Sample middle clip.
    image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride)

  # Decode JPEG string to tf.uint8.
  if image.dtype == tf.string:
    image = preprocess_ops_3d.decode_jpeg(image, 3)

  if is_training:
    # Standard image data augmentation: random resized crop and random flip.
    image = preprocess_ops_3d.random_crop_resize(
        image, crop_size, crop_size, num_frames, 3,
        (min_aspect_ratio, max_aspect_ratio), (min_area_ratio, max_area_ratio))
    image = preprocess_ops_3d.random_flip_left_right(image, seed)

    if augmenter is not None:
      image = augmenter.distort(image)
  else:
    # Resize images (resize happens only if necessary to save compute).
    image = preprocess_ops_3d.resize_smallest(image, min_resize)
    # Crop of the frames.
    image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False,
                                         num_crops)

  # Cast the frames to float32, normalizing according to zero_centering_image.
  return preprocess_ops_3d.normalize_image(image, zero_centering_image)


def postprocess_image(image: tf.Tensor,
                      is_training: bool = True,
                      num_frames: int = 32,
                      num_test_clips: int = 1,
                      num_test_crops: int = 1) -> tf.Tensor:
  """Processes a batched Tensor of frames.

  The same parameters used in process should be used here.

  Args:
    image: Input Tensor of shape [batch, timesteps, height, width, 3].
    is_training: Whether or not in training mode. If True, random sample, crop
      and left right flip is used.
    num_frames: Number of frames per subclip.
    num_test_clips: Number of test clips (1 by default). If more than 1, this
      will sample multiple linearly spaced clips within each video at test
      time. If 1, then a single clip in the middle of the video is sampled.
      The clips are aggregated in the batch dimension.
    num_test_crops: Number of test crops (1 by default). If more than 1, there
      are multiple crops for each clip at test time. If 1, there is a single
      central crop. The crops are aggregated in the batch dimension.

  Returns:
    Processed frames. Tensor of shape
      [batch * num_test_clips * num_test_crops, num_frames, height, width, 3].
  """
  num_views = num_test_clips * num_test_crops
  if num_views > 1 and not is_training:
    # In this case, multiple views are merged together in the batch dimension,
    # which will be batch * num_views.
    image = tf.reshape(image, [-1, num_frames] + image.shape[2:].as_list())

  return image


def process_label(label: tf.Tensor,
                  one_hot_label: bool = True,
                  num_classes: Optional[int] = None) -> tf.Tensor:
  """Processes label Tensor."""
  # Validate parameters.
  if one_hot_label and not num_classes:
    raise ValueError(
        '`num_classes` should be given when requesting one hot label.')

  # Cast to tf.int32.
  label = tf.cast(label, dtype=tf.int32)

  if one_hot_label:
    # Replace label index by one hot representation.
    label = tf.one_hot(label, num_classes)
    if len(label.shape.as_list()) > 1:
      label = tf.reduce_sum(label, axis=0)
    if num_classes == 1:
      # The trick for single label.
      label = 1 - label

  return label


class Decoder(decoder.Decoder):
  """A tf.Example decoder for classification task."""

  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
    self._context_description = {
        # One integer stored in context.
        label_key: tf.io.VarLenFeature(tf.int64),
    }
    self._sequence_description = {
        # Each image is a string encoding JPEG.
        image_key: tf.io.FixedLenSequenceFeature((), tf.string),
    }

  def add_feature(self, feature_name: str,
                  feature_type: Union[tf.io.VarLenFeature,
                                      tf.io.FixedLenFeature,
                                      tf.io.FixedLenSequenceFeature]):
    self._sequence_description[feature_name] = feature_type

  def add_context(self, feature_name: str,
                  feature_type: Union[tf.io.VarLenFeature,
                                      tf.io.FixedLenFeature,
                                      tf.io.FixedLenSequenceFeature]):
    self._context_description[feature_name] = feature_type

  def decode(self, serialized_example):
    """Parses a single tf.Example into image and label tensors."""
    result = {}
    context, sequences = tf.io.parse_single_sequence_example(
        serialized_example, self._context_description,
        self._sequence_description)
    result.update(context)
    result.update(sequences)
    for key, value in result.items():
      if isinstance(value, tf.SparseTensor):
        result[key] = tf.sparse.to_dense(value)
    return result


class VideoTfdsDecoder(decoder.Decoder):
  """A tf.SequenceExample decoder for tfds video classification datasets."""

  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
    self._image_key = image_key
    self._label_key = label_key

  def decode(self, features):
    """Decode the TFDS FeatureDict.

    Args:
      features: features from TFDS video dataset.
        See https://www.tensorflow.org/datasets/catalog/ucf101 for example.

    Returns:
      Dict of tensors.
    """
    sample_dict = {
        self._image_key: features['video'],
        self._label_key: features['label'],
    }
    return sample_dict


class Parser(parser.Parser):
  """Parses a video and label dataset."""

  def __init__(self,
               input_params: exp_cfg.DataConfig,
               image_key: str = IMAGE_KEY,
               label_key: str = LABEL_KEY):
    self._num_frames = input_params.feature_shape[0]
    self._stride = input_params.temporal_stride
    self._random_stride_range = input_params.random_stride_range
    self._num_test_clips = input_params.num_test_clips
    self._min_resize = input_params.min_image_size
    self._crop_size = input_params.feature_shape[1]
    self._num_crops = input_params.num_test_crops
    self._one_hot_label = input_params.one_hot
    self._num_classes = input_params.num_classes
    self._image_key = image_key
    self._label_key = label_key
    self._dtype = tf.dtypes.as_dtype(input_params.dtype)
    self._output_audio = input_params.output_audio
    self._min_aspect_ratio = input_params.aug_min_aspect_ratio
    self._max_aspect_ratio = input_params.aug_max_aspect_ratio
    self._min_area_ratio = input_params.aug_min_area_ratio
    self._max_area_ratio = input_params.aug_max_area_ratio
    if self._output_audio:
      self._audio_feature = input_params.audio_feature
      self._audio_shape = input_params.audio_feature_shape

    self._augmenter = None
    if input_params.aug_type is not None:
      aug_type = input_params.aug_type
      if aug_type == 'autoaug':
        logging.info('Using AutoAugment.')
        self._augmenter = augment.AutoAugment()
      elif aug_type == 'randaug':
        logging.info('Using RandAugment.')
        self._augmenter = augment.RandAugment()
      else:
        raise ValueError('Augmentation policy {} is not supported.'.format(
            aug_type))

  def _parse_train_data(
      self, decoded_tensors: Dict[str, tf.Tensor]
  ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Parses data for training."""
    # Process image and label.
    image = decoded_tensors[self._image_key]
    image = process_image(
        image=image,
        is_training=True,
        num_frames=self._num_frames,
        stride=self._stride,
        random_stride_range=self._random_stride_range,
        num_test_clips=self._num_test_clips,
        min_resize=self._min_resize,
        crop_size=self._crop_size,
        min_aspect_ratio=self._min_aspect_ratio,
        max_aspect_ratio=self._max_aspect_ratio,
        min_area_ratio=self._min_area_ratio,
        max_area_ratio=self._max_area_ratio,
        augmenter=self._augmenter)
    image = tf.cast(image, dtype=self._dtype)
    features = {'image': image}

    label = decoded_tensors[self._label_key]
    label = process_label(label, self._one_hot_label, self._num_classes)

    if self._output_audio:
      audio = decoded_tensors[self._audio_feature]
      audio = tf.cast(audio, dtype=self._dtype)
      # TODO(yeqing): synchronize audio/video sampling. Especially randomness.
      audio = preprocess_ops_3d.sample_sequence(
          audio, self._audio_shape[0], random=False, stride=1)
      audio = tf.ensure_shape(audio, self._audio_shape)
      features['audio'] = audio

    return features, label

  def _parse_eval_data(
      self, decoded_tensors: Dict[str, tf.Tensor]
  ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Parses data for evaluation."""
    image = decoded_tensors[self._image_key]
    image = process_image(
        image=image,
        is_training=False,
        num_frames=self._num_frames,
        stride=self._stride,
        num_test_clips=self._num_test_clips,
        min_resize=self._min_resize,
        crop_size=self._crop_size,
        num_crops=self._num_crops)
    image = tf.cast(image, dtype=self._dtype)
    features = {'image': image}

    label = decoded_tensors[self._label_key]
    label = process_label(label, self._one_hot_label, self._num_classes)

    if self._output_audio:
      audio = decoded_tensors[self._audio_feature]
      audio = tf.cast(audio, dtype=self._dtype)
      audio = preprocess_ops_3d.sample_sequence(
          audio, self._audio_shape[0], random=False, stride=1)
      audio = tf.ensure_shape(audio, self._audio_shape)
      features['audio'] = audio

    return features, label


class PostBatchProcessor(object):
  """Processes a video and label dataset which is batched."""

  def __init__(self, input_params: exp_cfg.DataConfig):
    self._is_training = input_params.is_training
    self._num_frames = input_params.feature_shape[0]
    self._num_test_clips = input_params.num_test_clips
    self._num_test_crops = input_params.num_test_crops

  def __call__(self, features: Dict[str, tf.Tensor],
               label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Parses a single tf.Example into image and label tensors."""
    for key in ['image']:
      if key in features:
        features[key] = postprocess_image(
            image=features[key],
            is_training=self._is_training,
            num_frames=self._num_frames,
            num_test_clips=self._num_test_clips,
            num_test_crops=self._num_test_crops)

    return features, label
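
`process_label` is the easiest piece above to poke at in isolation; a sketch with made-up values showing how a rank-1 label vector becomes a single multi-hot vector:

```python
import tensorflow as tf

from official.vision.dataloaders import video_input

# One-hot of [2, 5] has shape (2, 8); the rank > 1 branch sums the rows into
# a single multi-hot vector of shape (8,).
multi = tf.constant([2, 5], dtype=tf.int64)
label = video_input.process_label(multi, one_hot_label=True, num_classes=8)
print(label.numpy())  # [0. 0. 1. 0. 0. 1. 0. 0.]
```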
official/vision/dataloaders/video_input_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
import
io
# Import libraries
import
numpy
as
np
from
PIL
import
Image
import
tensorflow
as
tf
import
tensorflow_datasets
as
tfds
from
official.vision.configs
import
video_classification
as
exp_cfg
from
official.vision.dataloaders
import
video_input
AUDIO_KEY
=
'features/audio'
def
fake_seq_example
():
# Create fake data.
random_image
=
np
.
random
.
randint
(
0
,
256
,
size
=
(
263
,
320
,
3
),
dtype
=
np
.
uint8
)
random_image
=
Image
.
fromarray
(
random_image
)
label
=
42
with
io
.
BytesIO
()
as
buffer
:
random_image
.
save
(
buffer
,
format
=
'JPEG'
)
raw_image_bytes
=
buffer
.
getvalue
()
seq_example
=
tf
.
train
.
SequenceExample
()
seq_example
.
feature_lists
.
feature_list
.
get_or_create
(
video_input
.
IMAGE_KEY
).
feature
.
add
().
bytes_list
.
value
[:]
=
[
raw_image_bytes
]
seq_example
.
feature_lists
.
feature_list
.
get_or_create
(
video_input
.
IMAGE_KEY
).
feature
.
add
().
bytes_list
.
value
[:]
=
[
raw_image_bytes
]
seq_example
.
context
.
feature
[
video_input
.
LABEL_KEY
].
int64_list
.
value
[:]
=
[
label
]
random_audio
=
np
.
random
.
normal
(
size
=
(
10
,
256
)).
tolist
()
for
s
in
random_audio
:
seq_example
.
feature_lists
.
feature_list
.
get_or_create
(
AUDIO_KEY
).
feature
.
add
().
float_list
.
value
[:]
=
s
return
seq_example
,
label
class
DecoderTest
(
tf
.
test
.
TestCase
):
"""A tf.SequenceExample decoder for the video classification task."""
def
test_decoder
(
self
):
decoder
=
video_input
.
Decoder
()
seq_example
,
label
=
fake_seq_example
()
serialized_example
=
seq_example
.
SerializeToString
()
decoded_tensors
=
decoder
.
decode
(
tf
.
convert_to_tensor
(
serialized_example
))
results
=
tf
.
nest
.
map_structure
(
lambda
x
:
x
.
numpy
(),
decoded_tensors
)
self
.
assertCountEqual
([
video_input
.
IMAGE_KEY
,
video_input
.
LABEL_KEY
],
results
.
keys
())
self
.
assertEqual
(
label
,
results
[
video_input
.
LABEL_KEY
])
def
test_decode_audio
(
self
):
decoder
=
video_input
.
Decoder
()
decoder
.
add_feature
(
AUDIO_KEY
,
tf
.
io
.
VarLenFeature
(
dtype
=
tf
.
float32
))
seq_example
,
label
=
fake_seq_example
()
serialized_example
=
seq_example
.
SerializeToString
()
decoded_tensors
=
decoder
.
decode
(
tf
.
convert_to_tensor
(
serialized_example
))
results
=
tf
.
nest
.
map_structure
(
lambda
x
:
x
.
numpy
(),
decoded_tensors
)
self
.
assertCountEqual
(
[
video_input
.
IMAGE_KEY
,
video_input
.
LABEL_KEY
,
AUDIO_KEY
],
results
.
keys
())
self
.
assertEqual
(
label
,
results
[
video_input
.
LABEL_KEY
])
self
.
assertEqual
(
results
[
AUDIO_KEY
].
shape
,
(
10
,
256
))
def
test_tfds_decode
(
self
):
with
tfds
.
testing
.
mock_data
(
num_examples
=
1
):
dataset
=
tfds
.
load
(
'ucf101'
,
split
=
'train'
).
take
(
1
)
data
=
next
(
iter
(
dataset
))
decoder
=
video_input
.
VideoTfdsDecoder
()
decoded_tensors
=
decoder
.
decode
(
data
)
self
.
assertContainsSubset
([
video_input
.
LABEL_KEY
,
video_input
.
IMAGE_KEY
],
decoded_tensors
.
keys
())
class VideoAndLabelParserTest(tf.test.TestCase):

  def test_video_input(self):
    params = exp_cfg.kinetics600(is_training=True)
    params.feature_shape = (2, 224, 224, 3)
    params.min_image_size = 224

    decoder = video_input.Decoder()
    parser = video_input.Parser(params).parse_fn(params.is_training)

    seq_example, label = fake_seq_example()
    input_tensor = tf.constant(seq_example.SerializeToString())
    decoded_tensors = decoder.decode(input_tensor)
    output_tensor = parser(decoded_tensors)
    image_features, label = output_tensor
    image = image_features['image']

    self.assertAllEqual(image.shape, (2, 224, 224, 3))
    self.assertAllEqual(label.shape, (600,))

  def test_video_audio_input(self):
    params = exp_cfg.kinetics600(is_training=True)
    params.feature_shape = (2, 224, 224, 3)
    params.min_image_size = 224
    params.output_audio = True
    params.audio_feature = AUDIO_KEY
    params.audio_feature_shape = (15, 256)

    decoder = video_input.Decoder()
    decoder.add_feature(params.audio_feature,
                        tf.io.VarLenFeature(dtype=tf.float32))
    parser = video_input.Parser(params).parse_fn(params.is_training)

    seq_example, label = fake_seq_example()
    input_tensor = tf.constant(seq_example.SerializeToString())
    decoded_tensors = decoder.decode(input_tensor)
    output_tensor = parser(decoded_tensors)
    features, label = output_tensor
    image = features['image']
    audio = features['audio']

    self.assertAllEqual(image.shape, (2, 224, 224, 3))
    self.assertAllEqual(label.shape, (600,))
    self.assertEqual(audio.shape, (15, 256))

  def test_video_input_random_stride(self):
    params = exp_cfg.kinetics600(is_training=True)
    params.feature_shape = (2, 224, 224, 3)
    params.min_image_size = 224
    params.temporal_stride = 2
    params.random_stride_range = 1

    decoder = video_input.Decoder()
    parser = video_input.Parser(params).parse_fn(params.is_training)

    seq_example, label = fake_seq_example()
    input_tensor = tf.constant(seq_example.SerializeToString())
    decoded_tensors = decoder.decode(input_tensor)
    output_tensor = parser(decoded_tensors)
    image_features, label = output_tensor
    image = image_features['image']

    self.assertAllEqual(image.shape, (2, 224, 224, 3))
    self.assertAllEqual(label.shape, (600,))

  def test_video_input_augmentation_returns_shape(self):
    params = exp_cfg.kinetics600(is_training=True)
    params.feature_shape = (2, 224, 224, 3)
    params.min_image_size = 224
    params.temporal_stride = 2
    params.aug_type = 'autoaug'

    decoder = video_input.Decoder()
    parser = video_input.Parser(params).parse_fn(params.is_training)

    seq_example, label = fake_seq_example()
    input_tensor = tf.constant(seq_example.SerializeToString())
    decoded_tensors = decoder.decode(input_tensor)
    output_tensor = parser(decoded_tensors)
    image_features, label = output_tensor
    image = image_features['image']

    self.assertAllEqual(image.shape, (2, 224, 224, 3))
    self.assertAllEqual(label.shape, (600,))


if __name__ == '__main__':
  tf.test.main()
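
For context, the Decoder and Parser exercised by these tests are typically composed into a tf.data pipeline roughly as sketched below. This assumes TFRecord files of serialized tf.train.SequenceExample protos; 'train.tfrecord' is a placeholder path, and real pipelines in this repository are assembled through the input reader factory rather than wired by hand.

params = exp_cfg.kinetics600(is_training=True)
decoder = video_input.Decoder()
parser = video_input.Parser(params).parse_fn(params.is_training)
dataset = (
    tf.data.TFRecordDataset('train.tfrecord')
    .map(decoder.decode, num_parallel_calls=tf.data.AUTOTUNE)
    .map(parser, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(2))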
official/vision/evaluation/__init__.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.