Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
472e2f80
Commit
472e2f80
authored
Mar 16, 2024
by
zhanggzh
Browse files
Merge remote-tracking branch 'tf_model/main'
parents
d91296eb
f3a14f85
Changes
215
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4427 additions
and
0 deletions
+4427
-0
models-2.13.1/official/legacy/detection/configs/maskrcnn_config.py
...13.1/official/legacy/detection/configs/maskrcnn_config.py
+115
-0
models-2.13.1/official/legacy/detection/configs/olnmask_config.py
....13.1/official/legacy/detection/configs/olnmask_config.py
+143
-0
models-2.13.1/official/legacy/detection/configs/retinanet_config.py
...3.1/official/legacy/detection/configs/retinanet_config.py
+58
-0
models-2.13.1/official/legacy/detection/configs/shapemask_config.py
...3.1/official/legacy/detection/configs/shapemask_config.py
+97
-0
models-2.13.1/official/legacy/detection/dataloader/__init__.py
...s-2.13.1/official/legacy/detection/dataloader/__init__.py
+14
-0
models-2.13.1/official/legacy/detection/dataloader/anchor.py
models-2.13.1/official/legacy/detection/dataloader/anchor.py
+458
-0
models-2.13.1/official/legacy/detection/dataloader/factory.py
...ls-2.13.1/official/legacy/detection/dataloader/factory.py
+136
-0
models-2.13.1/official/legacy/detection/dataloader/input_reader.py
...13.1/official/legacy/detection/dataloader/input_reader.py
+105
-0
models-2.13.1/official/legacy/detection/dataloader/maskrcnn_parser.py
...1/official/legacy/detection/dataloader/maskrcnn_parser.py
+381
-0
models-2.13.1/official/legacy/detection/dataloader/mode_keys.py
...-2.13.1/official/legacy/detection/dataloader/mode_keys.py
+33
-0
models-2.13.1/official/legacy/detection/dataloader/olnmask_parser.py
....1/official/legacy/detection/dataloader/olnmask_parser.py
+327
-0
models-2.13.1/official/legacy/detection/dataloader/retinanet_parser.py
.../official/legacy/detection/dataloader/retinanet_parser.py
+425
-0
models-2.13.1/official/legacy/detection/dataloader/shapemask_parser.py
.../official/legacy/detection/dataloader/shapemask_parser.py
+521
-0
models-2.13.1/official/legacy/detection/dataloader/tf_example_decoder.py
...fficial/legacy/detection/dataloader/tf_example_decoder.py
+156
-0
models-2.13.1/official/legacy/detection/evaluation/__init__.py
...s-2.13.1/official/legacy/detection/evaluation/__init__.py
+14
-0
models-2.13.1/official/legacy/detection/evaluation/coco_evaluator.py
....1/official/legacy/detection/evaluation/coco_evaluator.py
+847
-0
models-2.13.1/official/legacy/detection/evaluation/coco_utils.py
...2.13.1/official/legacy/detection/evaluation/coco_utils.py
+372
-0
models-2.13.1/official/legacy/detection/evaluation/factory.py
...ls-2.13.1/official/legacy/detection/evaluation/factory.py
+52
-0
models-2.13.1/official/legacy/detection/executor/__init__.py
models-2.13.1/official/legacy/detection/executor/__init__.py
+14
-0
models-2.13.1/official/legacy/detection/executor/detection_executor.py
.../official/legacy/detection/executor/detection_executor.py
+159
-0
No files found.
Too many changes to show.
To preserve performance only
215 of 215+
files are displayed.
Plain diff
Email patch
models-2.13.1/official/legacy/detection/configs/maskrcnn_config.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Config template to train Mask R-CNN."""
from
official.legacy.detection.configs
import
base_config
from
official.modeling.hyperparams
import
params_dict
# pylint: disable=line-too-long
MASKRCNN_CFG
=
params_dict
.
ParamsDict
(
base_config
.
BASE_CFG
)
MASKRCNN_CFG
.
override
({
'type'
:
'mask_rcnn'
,
'eval'
:
{
'type'
:
'box_and_mask'
,
'num_images_to_visualize'
:
0
,
},
'architecture'
:
{
'parser'
:
'maskrcnn_parser'
,
'min_level'
:
2
,
'max_level'
:
6
,
'include_mask'
:
True
,
'mask_target_size'
:
28
,
},
'maskrcnn_parser'
:
{
'output_size'
:
[
1024
,
1024
],
'num_channels'
:
3
,
'rpn_match_threshold'
:
0.7
,
'rpn_unmatched_threshold'
:
0.3
,
'rpn_batch_size_per_im'
:
256
,
'rpn_fg_fraction'
:
0.5
,
'aug_rand_hflip'
:
True
,
'aug_scale_min'
:
1.0
,
'aug_scale_max'
:
1.0
,
'skip_crowd_during_training'
:
True
,
'max_num_instances'
:
100
,
'mask_crop_size'
:
112
,
},
'anchor'
:
{
'num_scales'
:
1
,
'anchor_size'
:
8
,
},
'rpn_head'
:
{
'num_convs'
:
2
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
'use_batch_norm'
:
False
,
},
'frcnn_head'
:
{
'num_convs'
:
0
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
'num_fcs'
:
2
,
'fc_dims'
:
1024
,
'use_batch_norm'
:
False
,
},
'mrcnn_head'
:
{
'num_convs'
:
4
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
'use_batch_norm'
:
False
,
},
'rpn_score_loss'
:
{
'rpn_batch_size_per_im'
:
256
,
},
'rpn_box_loss'
:
{
'huber_loss_delta'
:
1.0
/
9.0
,
},
'frcnn_box_loss'
:
{
'huber_loss_delta'
:
1.0
,
},
'roi_proposal'
:
{
'rpn_pre_nms_top_k'
:
2000
,
'rpn_post_nms_top_k'
:
1000
,
'rpn_nms_threshold'
:
0.7
,
'rpn_score_threshold'
:
0.0
,
'rpn_min_size_threshold'
:
0.0
,
'test_rpn_pre_nms_top_k'
:
1000
,
'test_rpn_post_nms_top_k'
:
1000
,
'test_rpn_nms_threshold'
:
0.7
,
'test_rpn_score_threshold'
:
0.0
,
'test_rpn_min_size_threshold'
:
0.0
,
'use_batched_nms'
:
False
,
},
'roi_sampling'
:
{
'num_samples_per_image'
:
512
,
'fg_fraction'
:
0.25
,
'fg_iou_thresh'
:
0.5
,
'bg_iou_thresh_hi'
:
0.5
,
'bg_iou_thresh_lo'
:
0.0
,
'mix_gt_boxes'
:
True
,
},
'mask_sampling'
:
{
'num_mask_samples_per_image'
:
128
,
# Typically = `num_samples_per_image` * `fg_fraction`.
},
'postprocess'
:
{
'pre_nms_num_boxes'
:
1000
,
},
},
is_strict
=
False
)
MASKRCNN_RESTRICTIONS
=
[
]
# pylint: enable=line-too-long
models-2.13.1/official/legacy/detection/configs/olnmask_config.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Config template to train Object Localization Network (OLN)."""
from
official.legacy.detection.configs
import
base_config
from
official.modeling.hyperparams
import
params_dict
# pylint: disable=line-too-long
OLNMASK_CFG
=
params_dict
.
ParamsDict
(
base_config
.
BASE_CFG
)
OLNMASK_CFG
.
override
({
'type'
:
'olnmask'
,
'eval'
:
{
'type'
:
'oln_xclass_box'
,
'use_category'
:
False
,
'seen_class'
:
'voc'
,
'num_images_to_visualize'
:
0
,
},
'architecture'
:
{
'parser'
:
'olnmask_parser'
,
'min_level'
:
2
,
'max_level'
:
6
,
'include_rpn_class'
:
False
,
'include_frcnn_class'
:
False
,
'include_frcnn_box'
:
True
,
'include_mask'
:
False
,
'mask_target_size'
:
28
,
'num_classes'
:
2
,
},
'olnmask_parser'
:
{
'output_size'
:
[
640
,
640
],
'num_channels'
:
3
,
'rpn_match_threshold'
:
0.7
,
'rpn_unmatched_threshold'
:
0.3
,
'rpn_batch_size_per_im'
:
256
,
'rpn_fg_fraction'
:
0.5
,
'aug_rand_hflip'
:
True
,
'aug_scale_min'
:
0.5
,
'aug_scale_max'
:
2.0
,
'skip_crowd_during_training'
:
True
,
'max_num_instances'
:
100
,
'mask_crop_size'
:
112
,
# centerness targets.
'has_centerness'
:
True
,
'rpn_center_match_iou_threshold'
:
0.3
,
'rpn_center_unmatched_iou_threshold'
:
0.1
,
'rpn_num_center_samples_per_im'
:
256
,
# class manipulation.
'class_agnostic'
:
True
,
'train_class'
:
'voc'
,
},
'anchor'
:
{
'num_scales'
:
1
,
'aspect_ratios'
:
[
1.0
],
'anchor_size'
:
8
,
},
'rpn_head'
:
{
'num_convs'
:
2
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
'use_batch_norm'
:
False
,
# RPN-Centerness learning {
'has_centerness'
:
True
,
# }
},
'frcnn_head'
:
{
'num_convs'
:
0
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
'num_fcs'
:
2
,
'fc_dims'
:
1024
,
'use_batch_norm'
:
False
,
'has_scoring'
:
True
,
},
'mrcnn_head'
:
{
'num_convs'
:
4
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
'use_batch_norm'
:
False
,
'has_scoring'
:
False
,
},
'rpn_score_loss'
:
{
'rpn_batch_size_per_im'
:
256
,
},
'rpn_box_loss'
:
{
'huber_loss_delta'
:
1.0
/
9.0
,
},
'frcnn_box_loss'
:
{
'huber_loss_delta'
:
1.0
,
},
'frcnn_box_score_loss'
:
{
'ignore_threshold'
:
0.3
,
},
'roi_proposal'
:
{
'rpn_pre_nms_top_k'
:
2000
,
'rpn_post_nms_top_k'
:
2000
,
'rpn_nms_threshold'
:
0.7
,
'rpn_score_threshold'
:
0.0
,
'rpn_min_size_threshold'
:
0.0
,
'test_rpn_pre_nms_top_k'
:
2000
,
'test_rpn_post_nms_top_k'
:
2000
,
'test_rpn_nms_threshold'
:
0.7
,
'test_rpn_score_threshold'
:
0.0
,
'test_rpn_min_size_threshold'
:
0.0
,
'use_batched_nms'
:
False
,
},
'roi_sampling'
:
{
'num_samples_per_image'
:
512
,
'fg_fraction'
:
0.25
,
'fg_iou_thresh'
:
0.5
,
'bg_iou_thresh_hi'
:
0.5
,
'bg_iou_thresh_lo'
:
0.0
,
'mix_gt_boxes'
:
True
,
},
'mask_sampling'
:
{
'num_mask_samples_per_image'
:
128
,
# Typically = `num_samples_per_image` * `fg_fraction`.
},
'postprocess'
:
{
'use_batched_nms'
:
False
,
'max_total_size'
:
100
,
'nms_iou_threshold'
:
0.5
,
'score_threshold'
:
0.00
,
'pre_nms_num_boxes'
:
2000
,
},
},
is_strict
=
False
)
OLNMASK_RESTRICTIONS
=
[
# 'anchor.aspect_ratios == [1.0]',
# 'anchor.scales == 1',
]
# pylint: enable=line-too-long
models-2.13.1/official/legacy/detection/configs/retinanet_config.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Config template to train Retinanet."""
from
official.legacy.detection.configs
import
base_config
from
official.modeling.hyperparams
import
params_dict
# pylint: disable=line-too-long
RETINANET_CFG
=
params_dict
.
ParamsDict
(
base_config
.
BASE_CFG
)
RETINANET_CFG
.
override
({
'type'
:
'retinanet'
,
'architecture'
:
{
'parser'
:
'retinanet_parser'
,
},
'retinanet_parser'
:
{
'output_size'
:
[
640
,
640
],
'num_channels'
:
3
,
'match_threshold'
:
0.5
,
'unmatched_threshold'
:
0.5
,
'aug_rand_hflip'
:
True
,
'aug_scale_min'
:
1.0
,
'aug_scale_max'
:
1.0
,
'use_autoaugment'
:
False
,
'autoaugment_policy_name'
:
'v0'
,
'skip_crowd_during_training'
:
True
,
'max_num_instances'
:
100
,
},
'retinanet_head'
:
{
'num_convs'
:
4
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
},
'retinanet_loss'
:
{
'focal_loss_alpha'
:
0.25
,
'focal_loss_gamma'
:
1.5
,
'huber_loss_delta'
:
0.1
,
'box_loss_weight'
:
50
,
},
'enable_summary'
:
True
,
},
is_strict
=
False
)
RETINANET_RESTRICTIONS
=
[
]
# pylint: enable=line-too-long
models-2.13.1/official/legacy/detection/configs/shapemask_config.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Config to train shapemask on COCO."""
from
official.legacy.detection.configs
import
base_config
from
official.modeling.hyperparams
import
params_dict
SHAPEMASK_RESNET_FROZEN_VAR_PREFIX
=
r
'(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'
SHAPEMASK_CFG
=
params_dict
.
ParamsDict
(
base_config
.
BASE_CFG
)
SHAPEMASK_CFG
.
override
({
'type'
:
'shapemask'
,
'architecture'
:
{
'parser'
:
'shapemask_parser'
,
'backbone'
:
'resnet'
,
'multilevel_features'
:
'fpn'
,
'outer_box_scale'
:
1.25
,
},
'train'
:
{
'total_steps'
:
45000
,
'learning_rate'
:
{
'learning_rate_steps'
:
[
30000
,
40000
],
},
'frozen_variable_prefix'
:
SHAPEMASK_RESNET_FROZEN_VAR_PREFIX
,
'regularization_variable_regex'
:
None
,
},
'eval'
:
{
'type'
:
'shapemask_box_and_mask'
,
'mask_eval_class'
:
'all'
,
# 'all', 'voc', or 'nonvoc'.
},
'shapemask_parser'
:
{
'output_size'
:
[
640
,
640
],
'num_channels'
:
3
,
'match_threshold'
:
0.5
,
'unmatched_threshold'
:
0.5
,
'aug_rand_hflip'
:
True
,
'aug_scale_min'
:
0.8
,
'aug_scale_max'
:
1.2
,
'skip_crowd_during_training'
:
True
,
'max_num_instances'
:
100
,
# Shapemask specific parameters
'mask_train_class'
:
'all'
,
# 'all', 'voc', or 'nonvoc'.
'use_category'
:
True
,
'outer_box_scale'
:
1.25
,
'num_sampled_masks'
:
8
,
'mask_crop_size'
:
32
,
'mask_min_level'
:
3
,
'mask_max_level'
:
5
,
'box_jitter_scale'
:
0.025
,
'upsample_factor'
:
4
,
},
'retinanet_head'
:
{
'num_convs'
:
4
,
'num_filters'
:
256
,
'use_separable_conv'
:
False
,
'use_batch_norm'
:
True
,
},
'shapemask_head'
:
{
'num_downsample_channels'
:
128
,
'mask_crop_size'
:
32
,
'use_category_for_mask'
:
True
,
'num_convs'
:
4
,
'upsample_factor'
:
4
,
'shape_prior_path'
:
''
,
},
'retinanet_loss'
:
{
'focal_loss_alpha'
:
0.4
,
'focal_loss_gamma'
:
1.5
,
'huber_loss_delta'
:
0.15
,
'box_loss_weight'
:
50
,
},
'shapemask_loss'
:
{
'shape_prior_loss_weight'
:
0.1
,
'coarse_mask_loss_weight'
:
1.0
,
'fine_mask_loss_weight'
:
1.0
,
},
},
is_strict
=
False
)
SHAPEMASK_RESTRICTIONS
=
[
'shapemask_head.mask_crop_size == shapemask_parser.mask_crop_size'
,
'shapemask_head.upsample_factor == shapemask_parser.upsample_factor'
,
'shapemask_parser.outer_box_scale == architecture.outer_box_scale'
,
]
# pylint: enable=line-too-long
models-2.13.1/official/legacy/detection/dataloader/__init__.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models-2.13.1/official/legacy/detection/dataloader/anchor.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Anchor box and labeler definition."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
import
tensorflow
as
tf
from
official.legacy.detection.utils
import
box_utils
from
official.vision.ops
import
iou_similarity
from
official.vision.utils.object_detection
import
argmax_matcher
from
official.vision.utils.object_detection
import
balanced_positive_negative_sampler
from
official.vision.utils.object_detection
import
box_list
from
official.vision.utils.object_detection
import
faster_rcnn_box_coder
from
official.vision.utils.object_detection
import
target_assigner
class
Anchor
(
object
):
"""Anchor class for anchor-based object detectors."""
def
__init__
(
self
,
min_level
,
max_level
,
num_scales
,
aspect_ratios
,
anchor_size
,
image_size
):
"""Constructs multiscale anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added on each
level. For instances, num_scales=2 adds one additional intermediate
anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: list of float numbers representing the aspect ratio anchors
added on each level. The number indicates the ratio of width to height.
For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
scale level.
anchor_size: float number representing the scale of size of the base
anchor to the feature stride 2^level.
image_size: a list of integer numbers or Tensors representing [height,
width] of the input image size.The image_size should be divisible by the
largest feature stride 2^max_level.
"""
self
.
min_level
=
min_level
self
.
max_level
=
max_level
self
.
num_scales
=
num_scales
self
.
aspect_ratios
=
aspect_ratios
self
.
anchor_size
=
anchor_size
self
.
image_size
=
image_size
self
.
boxes
=
self
.
_generate_boxes
()
def
_generate_boxes
(
self
):
"""Generates multiscale anchor boxes.
Returns:
a Tensor of shape [N, 4], represneting anchor boxes of all levels
concatenated together.
"""
boxes_all
=
[]
for
level
in
range
(
self
.
min_level
,
self
.
max_level
+
1
):
boxes_l
=
[]
for
scale
in
range
(
self
.
num_scales
):
for
aspect_ratio
in
self
.
aspect_ratios
:
stride
=
2
**
level
intermediate_scale
=
2
**
(
scale
/
float
(
self
.
num_scales
))
base_anchor_size
=
self
.
anchor_size
*
stride
*
intermediate_scale
aspect_x
=
aspect_ratio
**
0.5
aspect_y
=
aspect_ratio
**-
0.5
half_anchor_size_x
=
base_anchor_size
*
aspect_x
/
2.0
half_anchor_size_y
=
base_anchor_size
*
aspect_y
/
2.0
x
=
tf
.
range
(
stride
/
2
,
self
.
image_size
[
1
],
stride
)
y
=
tf
.
range
(
stride
/
2
,
self
.
image_size
[
0
],
stride
)
xv
,
yv
=
tf
.
meshgrid
(
x
,
y
)
xv
=
tf
.
cast
(
tf
.
reshape
(
xv
,
[
-
1
]),
dtype
=
tf
.
float32
)
yv
=
tf
.
cast
(
tf
.
reshape
(
yv
,
[
-
1
]),
dtype
=
tf
.
float32
)
# Tensor shape Nx4.
boxes
=
tf
.
stack
([
yv
-
half_anchor_size_y
,
xv
-
half_anchor_size_x
,
yv
+
half_anchor_size_y
,
xv
+
half_anchor_size_x
],
axis
=
1
)
boxes_l
.
append
(
boxes
)
# Concat anchors on the same level to tensor shape NxAx4.
boxes_l
=
tf
.
stack
(
boxes_l
,
axis
=
1
)
boxes_l
=
tf
.
reshape
(
boxes_l
,
[
-
1
,
4
])
boxes_all
.
append
(
boxes_l
)
return
tf
.
concat
(
boxes_all
,
axis
=
0
)
def
unpack_labels
(
self
,
labels
):
"""Unpacks an array of labels into multiscales labels."""
unpacked_labels
=
collections
.
OrderedDict
()
count
=
0
for
level
in
range
(
self
.
min_level
,
self
.
max_level
+
1
):
feat_size_y
=
tf
.
cast
(
self
.
image_size
[
0
]
/
2
**
level
,
tf
.
int32
)
feat_size_x
=
tf
.
cast
(
self
.
image_size
[
1
]
/
2
**
level
,
tf
.
int32
)
steps
=
feat_size_y
*
feat_size_x
*
self
.
anchors_per_location
unpacked_labels
[
level
]
=
tf
.
reshape
(
labels
[
count
:
count
+
steps
],
[
feat_size_y
,
feat_size_x
,
-
1
])
count
+=
steps
return
unpacked_labels
@
property
def
anchors_per_location
(
self
):
return
self
.
num_scales
*
len
(
self
.
aspect_ratios
)
@
property
def
multilevel_boxes
(
self
):
return
self
.
unpack_labels
(
self
.
boxes
)
class
AnchorLabeler
(
object
):
"""Labeler for dense object detector."""
def
__init__
(
self
,
anchor
,
match_threshold
=
0.5
,
unmatched_threshold
=
0.5
):
"""Constructs anchor labeler to assign labels to anchors.
Args:
anchor: an instance of class Anchors.
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
"""
similarity_calc
=
iou_similarity
.
IouSimilarity
()
matcher
=
argmax_matcher
.
ArgMaxMatcher
(
match_threshold
,
unmatched_threshold
=
unmatched_threshold
,
negatives_lower_than_unmatched
=
True
,
force_match_for_each_row
=
True
)
box_coder
=
faster_rcnn_box_coder
.
FasterRcnnBoxCoder
()
self
.
_target_assigner
=
target_assigner
.
TargetAssigner
(
similarity_calc
,
matcher
,
box_coder
)
self
.
_anchor
=
anchor
self
.
_match_threshold
=
match_threshold
self
.
_unmatched_threshold
=
unmatched_threshold
def
label_anchors
(
self
,
gt_boxes
,
gt_labels
):
"""Labels anchors with ground truth inputs.
Args:
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: A integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
cls_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
and width_l represent the dimension of bounding box regression output at
l-th level.
num_positives: scalar tensor storing number of positives in an image.
"""
gt_box_list
=
box_list
.
BoxList
(
gt_boxes
)
anchor_box_list
=
box_list
.
BoxList
(
self
.
_anchor
.
boxes
)
# The cls_weights, box_weights are not used.
cls_targets
,
_
,
box_targets
,
_
,
matches
=
self
.
_target_assigner
.
assign
(
anchor_box_list
,
gt_box_list
,
gt_labels
)
# Labels definition in matches.match_results:
# (1) match_results[i]>=0, meaning that column i is matched with row
# match_results[i].
# (2) match_results[i]=-1, meaning that column i is not matched.
# (3) match_results[i]=-2, meaning that column i is ignored.
match_results
=
tf
.
expand_dims
(
matches
.
match_results
,
axis
=
1
)
cls_targets
=
tf
.
cast
(
cls_targets
,
tf
.
int32
)
cls_targets
=
tf
.
where
(
tf
.
equal
(
match_results
,
-
1
),
-
tf
.
ones_like
(
cls_targets
),
cls_targets
)
cls_targets
=
tf
.
where
(
tf
.
equal
(
match_results
,
-
2
),
-
2
*
tf
.
ones_like
(
cls_targets
),
cls_targets
)
# Unpacks labels into multi-level representations.
cls_targets_dict
=
self
.
_anchor
.
unpack_labels
(
cls_targets
)
box_targets_dict
=
self
.
_anchor
.
unpack_labels
(
box_targets
)
num_positives
=
tf
.
reduce_sum
(
input_tensor
=
tf
.
cast
(
tf
.
greater
(
matches
.
match_results
,
-
1
),
tf
.
float32
))
return
cls_targets_dict
,
box_targets_dict
,
num_positives
class
RpnAnchorLabeler
(
AnchorLabeler
):
"""Labeler for Region Proposal Network."""
def
__init__
(
self
,
anchor
,
match_threshold
=
0.7
,
unmatched_threshold
=
0.3
,
rpn_batch_size_per_im
=
256
,
rpn_fg_fraction
=
0.5
):
AnchorLabeler
.
__init__
(
self
,
anchor
,
match_threshold
=
0.7
,
unmatched_threshold
=
0.3
)
self
.
_rpn_batch_size_per_im
=
rpn_batch_size_per_im
self
.
_rpn_fg_fraction
=
rpn_fg_fraction
def
_get_rpn_samples
(
self
,
match_results
):
"""Computes anchor labels.
This function performs subsampling for foreground (fg) and background (bg)
anchors.
Args:
match_results: A integer tensor with shape [N] representing the matching
results of anchors. (1) match_results[i]>=0, meaning that column i is
matched with row match_results[i]. (2) match_results[i]=-1, meaning that
column i is not matched. (3) match_results[i]=-2, meaning that column i
is ignored.
Returns:
score_targets: a integer tensor with the a shape of [N].
(1) score_targets[i]=1, the anchor is a positive sample.
(2) score_targets[i]=0, negative. (3) score_targets[i]=-1, the anchor is
don't care (ignore).
"""
sampler
=
(
balanced_positive_negative_sampler
.
BalancedPositiveNegativeSampler
(
positive_fraction
=
self
.
_rpn_fg_fraction
,
is_static
=
False
))
# indicator includes both positive and negative labels.
# labels includes only positives labels.
# positives = indicator & labels.
# negatives = indicator & !labels.
# ignore = !indicator.
indicator
=
tf
.
greater
(
match_results
,
-
2
)
labels
=
tf
.
greater
(
match_results
,
-
1
)
samples
=
sampler
.
subsample
(
indicator
,
self
.
_rpn_batch_size_per_im
,
labels
)
positive_labels
=
tf
.
where
(
tf
.
logical_and
(
samples
,
labels
),
tf
.
constant
(
2
,
dtype
=
tf
.
int32
,
shape
=
match_results
.
shape
),
tf
.
constant
(
0
,
dtype
=
tf
.
int32
,
shape
=
match_results
.
shape
))
negative_labels
=
tf
.
where
(
tf
.
logical_and
(
samples
,
tf
.
logical_not
(
labels
)),
tf
.
constant
(
1
,
dtype
=
tf
.
int32
,
shape
=
match_results
.
shape
),
tf
.
constant
(
0
,
dtype
=
tf
.
int32
,
shape
=
match_results
.
shape
))
ignore_labels
=
tf
.
fill
(
match_results
.
shape
,
-
1
)
return
(
ignore_labels
+
positive_labels
+
negative_labels
,
positive_labels
,
negative_labels
)
def
label_anchors
(
self
,
gt_boxes
,
gt_labels
):
"""Labels anchors with ground truth inputs.
Args:
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: A integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
"""
gt_box_list
=
box_list
.
BoxList
(
gt_boxes
)
anchor_box_list
=
box_list
.
BoxList
(
self
.
_anchor
.
boxes
)
# cls_targets, cls_weights, box_weights are not used.
_
,
_
,
box_targets
,
_
,
matches
=
self
.
_target_assigner
.
assign
(
anchor_box_list
,
gt_box_list
,
gt_labels
)
# score_targets contains the subsampled positive and negative anchors.
score_targets
,
_
,
_
=
self
.
_get_rpn_samples
(
matches
.
match_results
)
# Unpacks labels.
score_targets_dict
=
self
.
_anchor
.
unpack_labels
(
score_targets
)
box_targets_dict
=
self
.
_anchor
.
unpack_labels
(
box_targets
)
return
score_targets_dict
,
box_targets_dict
class
OlnAnchorLabeler
(
RpnAnchorLabeler
):
"""Labeler for Region Proposal Network."""
def
__init__
(
self
,
anchor
,
match_threshold
=
0.7
,
unmatched_threshold
=
0.3
,
rpn_batch_size_per_im
=
256
,
rpn_fg_fraction
=
0.5
,
has_centerness
=
False
,
center_match_iou_threshold
=
0.3
,
center_unmatched_iou_threshold
=
0.1
,
num_center_samples_per_im
=
256
):
"""Constructs rpn anchor labeler to assign labels and centerness to anchors.
Args:
anchor: an instance of class Anchors.
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
rpn_batch_size_per_im: number of anchors that are sampled per image.
rpn_fg_fraction:
has_centerness: whether to include centerness target creation. An anchor
is paired with one centerness score.
center_match_iou_threshold: a float number between 0 and 1 representing
the lower-bound threshold to sample foreground anchors for centerness
regression. An anchor with a score over the threshold is sampled as
foreground sample for centerness regression. We sample mostly from the
foreground region (255 out of 256 samples). That is, we sample 255 vs 1
(foreground vs background) anchor points to learn centerness regression.
center_unmatched_iou_threshold: a float number between 0 and 1
representing the lower-bound threshold to sample background anchors for
centerness regression. An anchor with a score over the threshold is
sampled as foreground sample for centerness regression. We sample very
sparsely from the background region (1 out of 256 samples). That is, we
sample 255 vs 1 (foreground vs background) anchor points to learn
centerness regression.
num_center_samples_per_im: number of anchor points per image that are
sampled as centerness targets.
"""
super
(
OlnAnchorLabeler
,
self
).
__init__
(
anchor
,
match_threshold
=
match_threshold
,
unmatched_threshold
=
unmatched_threshold
,
rpn_batch_size_per_im
=
rpn_batch_size_per_im
,
rpn_fg_fraction
=
rpn_fg_fraction
)
similarity_calc
=
iou_similarity
.
IouSimilarity
()
matcher
=
argmax_matcher
.
ArgMaxMatcher
(
match_threshold
,
unmatched_threshold
=
unmatched_threshold
,
negatives_lower_than_unmatched
=
True
,
force_match_for_each_row
=
True
)
box_coder
=
faster_rcnn_box_coder
.
FasterRcnnBoxCoder
()
if
has_centerness
:
center_matcher
=
argmax_matcher
.
ArgMaxMatcher
(
center_match_iou_threshold
,
unmatched_threshold
=
center_match_iou_threshold
,
negatives_lower_than_unmatched
=
True
,
force_match_for_each_row
=
True
,)
else
:
center_matcher
=
None
self
.
_target_assigner
=
target_assigner
.
OlnTargetAssigner
(
similarity_calc
,
matcher
,
box_coder
,
center_matcher
=
center_matcher
)
self
.
_num_center_samples_per_im
=
num_center_samples_per_im
self
.
_center_unmatched_iou_threshold
=
center_unmatched_iou_threshold
self
.
_rpn_batch_size_per_im
=
rpn_batch_size_per_im
self
.
_rpn_fg_fraction
=
rpn_fg_fraction
def
label_anchors_lrtb
(
self
,
gt_boxes
,
gt_labels
):
"""Labels anchors with ground truth inputs.
Args:
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: A integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
lrtb_targets_dict: Same strucure to box_target_dict, except the regression
targets are converted from xyhw to lrtb format. Ordered dictionary with
keys [min_level, min_level+1, ..., max_level]. The values are tensor
with shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
center_targets_dict: Same structure to score_tragets_dict, except the
scores are centerness values ranging from 0 to 1. Ordered dictionary
with keys [min_level, min_level+1, ..., max_level]. The values are
tensor with shape [height_l, width_l, num_anchors]. The height_l and
width_l represent the dimension of class logits at l-th level.
"""
gt_box_list
=
box_list
.
BoxList
(
gt_boxes
)
anchor_box_list
=
box_list
.
BoxList
(
self
.
_anchor
.
boxes
)
# cls_targets, cls_weights, box_weights are not used.
(
_
,
_
,
box_targets
,
_
,
matches
,
matched_gt_box_list
,
matched_anchors_mask
,
center_matched_gt_box_list
,
center_matched_anchors_mask
,
matched_ious
)
=
self
.
_target_assigner
.
assign
(
anchor_box_list
,
gt_box_list
,
gt_labels
)
# Box lrtb_targets.
lrtb_targets
,
_
=
box_utils
.
encode_boxes_lrtb
(
matched_gt_box_list
.
data
[
'boxes'
],
anchor_box_list
.
data
[
'boxes'
],
weights
=
[
1.0
,
1.0
,
1.0
,
1.0
])
lrtb_sanity
=
tf
.
logical_and
(
tf
.
greater
(
tf
.
reduce_min
(
lrtb_targets
,
-
1
),
0.
),
matched_anchors_mask
)
# To broadcast lrtb_sanity to the same shape as lrtb_targets.
lrtb_sanity
=
tf
.
tile
(
tf
.
expand_dims
(
lrtb_sanity
,
1
),
[
1
,
tf
.
shape
(
lrtb_targets
)[
1
]])
lrtb_targets
=
tf
.
where
(
lrtb_sanity
,
lrtb_targets
,
tf
.
zeros_like
(
lrtb_targets
))
# RPN anchor-gtbox iou values.
iou_targets
=
tf
.
where
(
tf
.
greater
(
matched_ious
,
0.0
),
matched_ious
,
tf
.
zeros_like
(
matched_ious
))
# Centerness_targets.
_
,
center_targets
=
box_utils
.
encode_boxes_lrtb
(
center_matched_gt_box_list
.
data
[
'boxes'
],
anchor_box_list
.
data
[
'boxes'
],
weights
=
[
1.0
,
1.0
,
1.0
,
1.0
])
# Positive-negative centerness sampler.
num_center_samples_per_im
=
self
.
_num_center_samples_per_im
center_pos_neg_sampler
=
(
balanced_positive_negative_sampler
.
BalancedPositiveNegativeSampler
(
positive_fraction
=
(
1.
-
1.
/
num_center_samples_per_im
),
is_static
=
False
))
center_pos_neg_indicator
=
tf
.
logical_or
(
center_matched_anchors_mask
,
tf
.
less
(
iou_targets
,
self
.
_center_unmatched_iou_threshold
))
center_pos_labels
=
center_matched_anchors_mask
center_samples
=
center_pos_neg_sampler
.
subsample
(
center_pos_neg_indicator
,
num_center_samples_per_im
,
center_pos_labels
)
is_valid
=
center_samples
center_targets
=
tf
.
where
(
is_valid
,
center_targets
,
(
-
1
)
*
tf
.
ones_like
(
center_targets
))
# score_targets contains the subsampled positive and negative anchors.
score_targets
,
_
,
_
=
self
.
_get_rpn_samples
(
matches
.
match_results
)
# Unpacks labels.
score_targets_dict
=
self
.
_anchor
.
unpack_labels
(
score_targets
)
box_targets_dict
=
self
.
_anchor
.
unpack_labels
(
box_targets
)
lrtb_targets_dict
=
self
.
_anchor
.
unpack_labels
(
lrtb_targets
)
center_targets_dict
=
self
.
_anchor
.
unpack_labels
(
center_targets
)
return
(
score_targets_dict
,
box_targets_dict
,
lrtb_targets_dict
,
center_targets_dict
)
models-2.13.1/official/legacy/detection/dataloader/factory.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model architecture factory."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
official.legacy.detection.dataloader
import
maskrcnn_parser
from
official.legacy.detection.dataloader
import
olnmask_parser
from
official.legacy.detection.dataloader
import
retinanet_parser
from
official.legacy.detection.dataloader
import
shapemask_parser
def
parser_generator
(
params
,
mode
):
"""Generator function for various dataset parser."""
if
params
.
architecture
.
parser
==
'retinanet_parser'
:
anchor_params
=
params
.
anchor
parser_params
=
params
.
retinanet_parser
parser_fn
=
retinanet_parser
.
Parser
(
output_size
=
parser_params
.
output_size
,
min_level
=
params
.
architecture
.
min_level
,
max_level
=
params
.
architecture
.
max_level
,
num_scales
=
anchor_params
.
num_scales
,
aspect_ratios
=
anchor_params
.
aspect_ratios
,
anchor_size
=
anchor_params
.
anchor_size
,
match_threshold
=
parser_params
.
match_threshold
,
unmatched_threshold
=
parser_params
.
unmatched_threshold
,
aug_rand_hflip
=
parser_params
.
aug_rand_hflip
,
aug_scale_min
=
parser_params
.
aug_scale_min
,
aug_scale_max
=
parser_params
.
aug_scale_max
,
use_autoaugment
=
parser_params
.
use_autoaugment
,
autoaugment_policy_name
=
parser_params
.
autoaugment_policy_name
,
skip_crowd_during_training
=
parser_params
.
skip_crowd_during_training
,
max_num_instances
=
parser_params
.
max_num_instances
,
use_bfloat16
=
params
.
architecture
.
use_bfloat16
,
mode
=
mode
)
elif
params
.
architecture
.
parser
==
'maskrcnn_parser'
:
anchor_params
=
params
.
anchor
parser_params
=
params
.
maskrcnn_parser
parser_fn
=
maskrcnn_parser
.
Parser
(
output_size
=
parser_params
.
output_size
,
min_level
=
params
.
architecture
.
min_level
,
max_level
=
params
.
architecture
.
max_level
,
num_scales
=
anchor_params
.
num_scales
,
aspect_ratios
=
anchor_params
.
aspect_ratios
,
anchor_size
=
anchor_params
.
anchor_size
,
rpn_match_threshold
=
parser_params
.
rpn_match_threshold
,
rpn_unmatched_threshold
=
parser_params
.
rpn_unmatched_threshold
,
rpn_batch_size_per_im
=
parser_params
.
rpn_batch_size_per_im
,
rpn_fg_fraction
=
parser_params
.
rpn_fg_fraction
,
aug_rand_hflip
=
parser_params
.
aug_rand_hflip
,
aug_scale_min
=
parser_params
.
aug_scale_min
,
aug_scale_max
=
parser_params
.
aug_scale_max
,
skip_crowd_during_training
=
parser_params
.
skip_crowd_during_training
,
max_num_instances
=
parser_params
.
max_num_instances
,
include_mask
=
params
.
architecture
.
include_mask
,
mask_crop_size
=
parser_params
.
mask_crop_size
,
use_bfloat16
=
params
.
architecture
.
use_bfloat16
,
mode
=
mode
)
elif
params
.
architecture
.
parser
==
'olnmask_parser'
:
anchor_params
=
params
.
anchor
parser_params
=
params
.
olnmask_parser
parser_fn
=
olnmask_parser
.
Parser
(
output_size
=
parser_params
.
output_size
,
min_level
=
params
.
architecture
.
min_level
,
max_level
=
params
.
architecture
.
max_level
,
num_scales
=
anchor_params
.
num_scales
,
aspect_ratios
=
anchor_params
.
aspect_ratios
,
anchor_size
=
anchor_params
.
anchor_size
,
rpn_match_threshold
=
parser_params
.
rpn_match_threshold
,
rpn_unmatched_threshold
=
parser_params
.
rpn_unmatched_threshold
,
rpn_batch_size_per_im
=
parser_params
.
rpn_batch_size_per_im
,
rpn_fg_fraction
=
parser_params
.
rpn_fg_fraction
,
aug_rand_hflip
=
parser_params
.
aug_rand_hflip
,
aug_scale_min
=
parser_params
.
aug_scale_min
,
aug_scale_max
=
parser_params
.
aug_scale_max
,
skip_crowd_during_training
=
parser_params
.
skip_crowd_during_training
,
max_num_instances
=
parser_params
.
max_num_instances
,
include_mask
=
params
.
architecture
.
include_mask
,
mask_crop_size
=
parser_params
.
mask_crop_size
,
use_bfloat16
=
params
.
architecture
.
use_bfloat16
,
mode
=
mode
,
has_centerness
=
parser_params
.
has_centerness
,
rpn_center_match_iou_threshold
=
(
parser_params
.
rpn_center_match_iou_threshold
),
rpn_center_unmatched_iou_threshold
=
(
parser_params
.
rpn_center_unmatched_iou_threshold
),
rpn_num_center_samples_per_im
=
(
parser_params
.
rpn_num_center_samples_per_im
),
class_agnostic
=
parser_params
.
class_agnostic
,
train_class
=
parser_params
.
train_class
,)
elif
params
.
architecture
.
parser
==
'shapemask_parser'
:
anchor_params
=
params
.
anchor
parser_params
=
params
.
shapemask_parser
parser_fn
=
shapemask_parser
.
Parser
(
output_size
=
parser_params
.
output_size
,
min_level
=
params
.
architecture
.
min_level
,
max_level
=
params
.
architecture
.
max_level
,
num_scales
=
anchor_params
.
num_scales
,
aspect_ratios
=
anchor_params
.
aspect_ratios
,
anchor_size
=
anchor_params
.
anchor_size
,
use_category
=
parser_params
.
use_category
,
outer_box_scale
=
parser_params
.
outer_box_scale
,
box_jitter_scale
=
parser_params
.
box_jitter_scale
,
num_sampled_masks
=
parser_params
.
num_sampled_masks
,
mask_crop_size
=
parser_params
.
mask_crop_size
,
mask_min_level
=
parser_params
.
mask_min_level
,
mask_max_level
=
parser_params
.
mask_max_level
,
upsample_factor
=
parser_params
.
upsample_factor
,
match_threshold
=
parser_params
.
match_threshold
,
unmatched_threshold
=
parser_params
.
unmatched_threshold
,
aug_rand_hflip
=
parser_params
.
aug_rand_hflip
,
aug_scale_min
=
parser_params
.
aug_scale_min
,
aug_scale_max
=
parser_params
.
aug_scale_max
,
skip_crowd_during_training
=
parser_params
.
skip_crowd_during_training
,
max_num_instances
=
parser_params
.
max_num_instances
,
use_bfloat16
=
params
.
architecture
.
use_bfloat16
,
mask_train_class
=
parser_params
.
mask_train_class
,
mode
=
mode
)
else
:
raise
ValueError
(
'Parser %s is not supported.'
%
params
.
architecture
.
parser
)
return
parser_fn
models-2.13.1/official/legacy/detection/dataloader/input_reader.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data loader and input processing."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
typing
import
Optional
,
Text
import
tensorflow
as
tf
from
official.legacy.detection.dataloader
import
factory
from
official.legacy.detection.dataloader
import
mode_keys
as
ModeKeys
from
official.modeling.hyperparams
import
params_dict
class
InputFn
(
object
):
"""Input function that creates dataset from files."""
def
__init__
(
self
,
file_pattern
:
Text
,
params
:
params_dict
.
ParamsDict
,
mode
:
Text
,
batch_size
:
int
,
num_examples
:
Optional
[
int
]
=
-
1
):
"""Initialize.
Args:
file_pattern: the file pattern for the data example (TFRecords).
params: the parameter object for constructing example parser and model.
mode: ModeKeys.TRAIN or ModeKeys.Eval
batch_size: the data batch size.
num_examples: If positive, only takes this number of examples and raise
tf.errors.OutOfRangeError after that. If non-positive, it will be
ignored.
"""
assert
file_pattern
is
not
None
assert
mode
is
not
None
assert
batch_size
is
not
None
self
.
_file_pattern
=
file_pattern
self
.
_mode
=
mode
self
.
_is_training
=
(
mode
==
ModeKeys
.
TRAIN
)
self
.
_batch_size
=
batch_size
self
.
_num_examples
=
num_examples
self
.
_parser_fn
=
factory
.
parser_generator
(
params
,
mode
)
self
.
_dataset_fn
=
tf
.
data
.
TFRecordDataset
self
.
_input_sharding
=
(
not
self
.
_is_training
)
try
:
if
self
.
_is_training
:
self
.
_input_sharding
=
params
.
train
.
input_sharding
else
:
self
.
_input_sharding
=
params
.
eval
.
input_sharding
except
AttributeError
:
pass
def
__call__
(
self
,
ctx
=
None
,
batch_size
:
int
=
None
):
"""Provides tf.data.Dataset object.
Args:
ctx: context object.
batch_size: expected batch size input data.
Returns:
tf.data.Dataset object.
"""
if
not
batch_size
:
batch_size
=
self
.
_batch_size
assert
batch_size
is
not
None
dataset
=
tf
.
data
.
Dataset
.
list_files
(
self
.
_file_pattern
,
shuffle
=
self
.
_is_training
)
if
self
.
_input_sharding
and
ctx
and
ctx
.
num_input_pipelines
>
1
:
dataset
=
dataset
.
shard
(
ctx
.
num_input_pipelines
,
ctx
.
input_pipeline_id
)
dataset
=
dataset
.
cache
()
if
self
.
_is_training
:
dataset
=
dataset
.
repeat
()
dataset
=
dataset
.
interleave
(
map_func
=
self
.
_dataset_fn
,
cycle_length
=
32
,
num_parallel_calls
=
tf
.
data
.
experimental
.
AUTOTUNE
)
if
self
.
_is_training
:
dataset
=
dataset
.
shuffle
(
1000
)
if
self
.
_num_examples
>
0
:
dataset
=
dataset
.
take
(
self
.
_num_examples
)
# Parses the fetched records to input tensors for model function.
dataset
=
dataset
.
map
(
self
.
_parser_fn
,
num_parallel_calls
=
tf
.
data
.
experimental
.
AUTOTUNE
)
dataset
=
dataset
.
batch
(
batch_size
,
drop_remainder
=
True
)
dataset
=
dataset
.
prefetch
(
tf
.
data
.
experimental
.
AUTOTUNE
)
return
dataset
models-2.13.1/official/legacy/detection/dataloader/maskrcnn_parser.py
0 → 100644
View file @
472e2f80
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Mask R-CNN."""
import
tensorflow
as
tf
from
official.legacy.detection.dataloader
import
anchor
from
official.legacy.detection.dataloader
import
mode_keys
as
ModeKeys
from
official.legacy.detection.dataloader
import
tf_example_decoder
from
official.legacy.detection.utils
import
box_utils
from
official.legacy.detection.utils
import
dataloader_utils
from
official.legacy.detection.utils
import
input_utils
class
Parser
(
object
):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def
__init__
(
self
,
output_size
,
min_level
,
max_level
,
num_scales
,
aspect_ratios
,
anchor_size
,
rpn_match_threshold
=
0.7
,
rpn_unmatched_threshold
=
0.3
,
rpn_batch_size_per_im
=
256
,
rpn_fg_fraction
=
0.5
,
aug_rand_hflip
=
False
,
aug_scale_min
=
1.0
,
aug_scale_max
=
1.0
,
skip_crowd_during_training
=
True
,
max_num_instances
=
100
,
include_mask
=
False
,
mask_crop_size
=
112
,
use_bfloat16
=
True
,
mode
=
None
):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divided by the largest feature stride 2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added
on each level. For instances, num_scales=2 adds one additional
intermediate anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect raito
anchors added on each level. The number indicates the ratio of width to
height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: `float` number representing the scale of size of the base
anchor to the feature stride 2^level.
rpn_match_threshold:
rpn_unmatched_threshold:
rpn_batch_size_per_im:
rpn_fg_fraction:
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
skip_crowd_during_training: `bool`, if True, skip annotations labeled with
`is_crowd` equals to 1.
max_num_instances: `int` number of maximum number of instances in an
image. The groundtruth data will be padded to `max_num_instances`.
include_mask: a bool to indicate whether parse mask groundtruth.
mask_crop_size: the size which groundtruth mask is cropped to.
use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction
or prediction with groundtruths in the outputs.
"""
self
.
_mode
=
mode
self
.
_max_num_instances
=
max_num_instances
self
.
_skip_crowd_during_training
=
skip_crowd_during_training
self
.
_is_training
=
(
mode
==
ModeKeys
.
TRAIN
)
self
.
_example_decoder
=
tf_example_decoder
.
TfExampleDecoder
(
include_mask
=
include_mask
)
# Anchor.
self
.
_output_size
=
output_size
self
.
_min_level
=
min_level
self
.
_max_level
=
max_level
self
.
_num_scales
=
num_scales
self
.
_aspect_ratios
=
aspect_ratios
self
.
_anchor_size
=
anchor_size
# Target assigning.
self
.
_rpn_match_threshold
=
rpn_match_threshold
self
.
_rpn_unmatched_threshold
=
rpn_unmatched_threshold
self
.
_rpn_batch_size_per_im
=
rpn_batch_size_per_im
self
.
_rpn_fg_fraction
=
rpn_fg_fraction
# Data augmentation.
self
.
_aug_rand_hflip
=
aug_rand_hflip
self
.
_aug_scale_min
=
aug_scale_min
self
.
_aug_scale_max
=
aug_scale_max
# Mask.
self
.
_include_mask
=
include_mask
self
.
_mask_crop_size
=
mask_crop_size
# Device.
self
.
_use_bfloat16
=
use_bfloat16
# Data is parsed depending on the model Modekey.
if
mode
==
ModeKeys
.
TRAIN
:
self
.
_parse_fn
=
self
.
_parse_train_data
elif
mode
==
ModeKeys
.
EVAL
:
self
.
_parse_fn
=
self
.
_parse_eval_data
elif
mode
==
ModeKeys
.
PREDICT
or
mode
==
ModeKeys
.
PREDICT_WITH_GT
:
self
.
_parse_fn
=
self
.
_parse_predict_data
else
:
raise
ValueError
(
'mode is not defined.'
)
def
__call__
(
self
,
value
):
"""Parses data to an image and associated training labels.
Args:
value: a string tensor holding a serialized tf.Example proto.
Returns:
image, labels: if mode == ModeKeys.TRAIN. see _parse_train_data.
{'images': image, 'labels': labels}: if mode == ModeKeys.PREDICT
or ModeKeys.PREDICT_WITH_GT.
"""
with
tf
.
name_scope
(
'parser'
):
data
=
self
.
_example_decoder
.
decode
(
value
)
return
self
.
_parse_fn
(
data
)
def
_parse_train_data
(
self
,
data
):
"""Parses data for training.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
image: image tensor that is preproessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for training. The following describes
{key: value} pairs in the dictionary.
image_info: a 2D `Tensor` that encodes the information of the image and
the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width],
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each level.
rpn_score_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
rpn_box_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
gt_boxes: Groundtruth bounding box annotations. The box is represented
in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
image that is fed to the network. The tennsor is padded with -1 to
the fixed dimension [self._max_num_instances, 4].
gt_classes: Groundtruth classes annotations. The tennsor is padded
with -1 to the fixed dimension [self._max_num_instances].
gt_masks: groundtrugh masks cropped by the bounding box and
resized to a fixed size determined by mask_crop_size.
"""
classes
=
data
[
'groundtruth_classes'
]
boxes
=
data
[
'groundtruth_boxes'
]
if
self
.
_include_mask
:
masks
=
data
[
'groundtruth_instance_masks'
]
is_crowds
=
data
[
'groundtruth_is_crowd'
]
# Skips annotations with `is_crowd` = True.
if
self
.
_skip_crowd_during_training
and
self
.
_is_training
:
num_groundtruths
=
tf
.
shape
(
classes
)[
0
]
with
tf
.
control_dependencies
([
num_groundtruths
,
is_crowds
]):
indices
=
tf
.
cond
(
tf
.
greater
(
tf
.
size
(
is_crowds
),
0
),
lambda
:
tf
.
where
(
tf
.
logical_not
(
is_crowds
))[:,
0
],
lambda
:
tf
.
cast
(
tf
.
range
(
num_groundtruths
),
tf
.
int64
))
classes
=
tf
.
gather
(
classes
,
indices
)
boxes
=
tf
.
gather
(
boxes
,
indices
)
if
self
.
_include_mask
:
masks
=
tf
.
gather
(
masks
,
indices
)
# Gets original image and its size.
image
=
data
[
'image'
]
image_shape
=
tf
.
shape
(
image
)[
0
:
2
]
# Normalizes image with mean and std pixel values.
image
=
input_utils
.
normalize_image
(
image
)
# Flips image randomly during training.
if
self
.
_aug_rand_hflip
:
if
self
.
_include_mask
:
image
,
boxes
,
masks
=
input_utils
.
random_horizontal_flip
(
image
,
boxes
,
masks
)
else
:
image
,
boxes
=
input_utils
.
random_horizontal_flip
(
image
,
boxes
)
# Converts boxes from normalized coordinates to pixel coordinates.
# Now the coordinates of boxes are w.r.t. the original image.
boxes
=
box_utils
.
denormalize_boxes
(
boxes
,
image_shape
)
# Resizes and crops image.
image
,
image_info
=
input_utils
.
resize_and_crop_image
(
image
,
self
.
_output_size
,
padded_size
=
input_utils
.
compute_padded_size
(
self
.
_output_size
,
2
**
self
.
_max_level
),
aug_scale_min
=
self
.
_aug_scale_min
,
aug_scale_max
=
self
.
_aug_scale_max
)
image_height
,
image_width
,
_
=
image
.
get_shape
().
as_list
()
# Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Transfer boxes to the original image space and do normalization.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_utils.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size,
                                 (image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(input_anchor,
                                             self._rpn_match_threshold,
                                             self._rpn_unmatched_threshold,
                                             self._rpn_batch_size_per_im,
                                             self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    inputs = {
        'image': image,
        'image_info': image_info,
    }
    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
    }
    inputs['gt_boxes'] = input_utils.pad_to_fixed_size(
        boxes, self._max_num_instances, -1)
    inputs['gt_classes'] = input_utils.pad_to_fixed_size(
        classes, self._max_num_instances, -1)
    if self._include_mask:
      inputs['gt_masks'] = input_utils.pad_to_fixed_size(
          masks, self._max_num_instances, -1)

    return inputs, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    raise NotImplementedError('Not implemented!')

  def _parse_predict_data(self, data):
    """Parses data for prediction.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        image: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for training. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height, scaled_width],
            [y_scale, x_scale], [y_offset, x_offset]].
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor
            with shape [height_l, width_l, 4] representing anchor boxes at
            each level.
    """
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(self._output_size,
                                                    2**self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    # Compute Anchor boxes.
    _ = anchor.Anchor(self._min_level, self._max_level, self._num_scales,
                      self._aspect_ratios, self._anchor_size,
                      (image_height, image_width))

    labels = {
        'image_info': image_info,
    }

    if self._mode == ModeKeys.PREDICT_WITH_GT:
      # Converts boxes from normalized coordinates to pixel coordinates.
      boxes = box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                          image_shape)
      groundtruths = {
          'source_id': data['source_id'],
          'height': data['height'],
          'width': data['width'],
          'num_detections': tf.shape(data['groundtruth_classes']),
          'boxes': boxes,
          'classes': data['groundtruth_classes'],
          'areas': data['groundtruth_area'],
          'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
      }
      groundtruths['source_id'] = dataloader_utils.process_source_id(
          groundtruths['source_id'])
      groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
          groundtruths, self._max_num_instances)
      # TODO(yeqing): Remove the `groundtruths` layer key (no longer needed).
      labels['groundtruths'] = groundtruths

    inputs = {
        'image': image,
        'image_info': image_info,
    }

    return inputs, labels
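# Usage sketch (illustrative, not part of the original file): one way a Parser
# like the one above is typically wired into a tf.data input pipeline. The
# TFRecord glob, output size and anchor settings below are assumptions made
# for this example only, not values taken from this repository.
import tensorflow as tf

from official.legacy.detection.dataloader import maskrcnn_parser
from official.legacy.detection.dataloader import mode_keys

train_parser = maskrcnn_parser.Parser(
    output_size=[1024, 1024], min_level=2, max_level=6, num_scales=1,
    aspect_ratios=[1.0, 2.0, 0.5], anchor_size=8.0, include_mask=True,
    use_bfloat16=False, mode=mode_keys.TRAIN)
# Each mapped element becomes the (inputs, labels) tuple built by
# _parse_train_data above.
train_dataset = (
    tf.data.TFRecordDataset(tf.io.gfile.glob('/tmp/coco/train-*'))  # assumed path
    .map(train_parser, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(2, drop_remainder=True))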
models-2.13.1/official/legacy/detection/dataloader/mode_keys.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Standard names for input dataloader modes.
The following standard keys are defined:
* `TRAIN`: training mode.
* `EVAL`: evaluation mode.
* `PREDICT`: prediction mode.
* `PREDICT_WITH_GT`: prediction mode with groundtruths in returned variables.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

TRAIN = 'train'
EVAL = 'eval'
PREDICT = 'predict'
PREDICT_WITH_GT = 'predict_with_gt'
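# Illustrative check (not part of the original file): the parsers in this
# directory dispatch on these string constants, e.g.
# `self._is_training = (mode == TRAIN)` in their constructors.
assert TRAIN == 'train' and EVAL == 'eval' and PREDICT_WITH_GT == 'predict_with_gt'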
models-2.13.1/official/legacy/detection/dataloader/olnmask_parser.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Mask R-CNN."""
import tensorflow as tf

from official.legacy.detection.dataloader import anchor
from official.legacy.detection.dataloader.maskrcnn_parser import Parser as MaskrcnnParser
from official.legacy.detection.utils import box_utils
from official.legacy.detection.utils import class_utils
from official.legacy.detection.utils import input_utils


class Parser(MaskrcnnParser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               include_mask=False,
               mask_crop_size=112,
               use_bfloat16=True,
               mode=None,
               # for centerness learning.
               has_centerness=False,
               rpn_center_match_iou_threshold=0.3,
               rpn_center_unmatched_iou_threshold=0.1,
               rpn_num_center_samples_per_im=256,
               # for class manipulation.
               class_agnostic=False,
               train_class='all',
               ):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float` IoU threshold above which an anchor is
        assigned a positive RPN label.
      rpn_unmatched_threshold: `float` IoU threshold below which an anchor is
        assigned a negative RPN label.
      rpn_batch_size_per_im: `int` number of RPN anchors sampled per image.
      rpn_fg_fraction: `float` fraction of sampled RPN anchors that should be
        foreground.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equal to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      include_mask: a bool to indicate whether parse mask groundtruth.
      mask_crop_size: the size which groundtruth mask is cropped to.
      use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction
        or prediction with groundtruths in the outputs.
      has_centerness: whether to create centerness targets.
      rpn_center_match_iou_threshold: IoU threshold for valid centerness
        samples, set to 0.3 by default.
      rpn_center_unmatched_iou_threshold: IoU threshold for invalid centerness
        samples, set to 0.1 by default.
      rpn_num_center_samples_per_im: number of centerness samples per image,
        256 by default.
      class_agnostic: whether to merge class ids into one foreground(=1)
        class, False by default.
      train_class: 'all' or 'voc' or 'nonvoc', 'all' by default.
    """
    super(Parser, self).__init__(
        output_size=output_size,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size,
        rpn_match_threshold=rpn_match_threshold,
        rpn_unmatched_threshold=rpn_unmatched_threshold,
        rpn_batch_size_per_im=rpn_batch_size_per_im,
        rpn_fg_fraction=rpn_fg_fraction,
        aug_rand_hflip=aug_rand_hflip,
        aug_scale_min=aug_scale_min,
        aug_scale_max=aug_scale_max,
        skip_crowd_during_training=skip_crowd_during_training,
        max_num_instances=max_num_instances,
        include_mask=include_mask,
        mask_crop_size=mask_crop_size,
        use_bfloat16=use_bfloat16,
        mode=mode,)

    # Centerness target assigning.
    self._has_centerness = has_centerness
    self._rpn_center_match_iou_threshold = rpn_center_match_iou_threshold
    self._rpn_center_unmatched_iou_threshold = (
        rpn_center_unmatched_iou_threshold)
    self._rpn_num_center_samples_per_im = rpn_num_center_samples_per_im

    # Class manipulation.
    self._class_agnostic = class_agnostic
    self._train_class = train_class

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l
          and width_l represent the dimension of bounding box regression
          output at l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
          in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
          image that is fed to the network. The tensor is padded with -1 to
          the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = input_utils.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes = input_utils.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(self._output_size,
                                                    2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Transfer boxes to the original image space and do normalization.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_utils.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Class manipulation.
    # Filter out novel split classes from training.
    if self._train_class != 'all':
      valid_classes = tf.cast(
          class_utils.coco_split_class_ids(self._train_class),
          dtype=classes.dtype)
      match = tf.reduce_any(
          tf.equal(tf.expand_dims(valid_classes, 1),
                   tf.expand_dims(classes, 0)), 0)
      # kill novel split classes and boxes.
      boxes = tf.gather(boxes, tf.where(match)[:, 0])
      classes = tf.gather(classes, tf.where(match)[:, 0])
      if self._include_mask:
        masks = tf.gather(masks, tf.where(match)[:, 0])

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size,
                                 (image_height, image_width))
    anchor_labeler = anchor.OlnAnchorLabeler(
        input_anchor,
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction,
        # for centerness target.
        self._has_centerness,
        self._rpn_center_match_iou_threshold,
        self._rpn_center_unmatched_iou_threshold,
        self._rpn_num_center_samples_per_im,)

    if self._has_centerness:
      rpn_score_targets, _, rpn_lrtb_targets, rpn_center_targets = (
          anchor_labeler.label_anchors_lrtb(
              gt_boxes=boxes,
              gt_labels=tf.cast(
                  tf.expand_dims(classes, axis=-1), dtype=tf.float32)))
    else:
      rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
          boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))
      # For base rpn, dummy placeholder for centerness target.
      rpn_center_targets = rpn_score_targets.copy()

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    inputs = {
        'image': image,
        'image_info': image_info,
    }
    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': (rpn_lrtb_targets if self._has_centerness
                            else rpn_box_targets),
        'rpn_center_targets': rpn_center_targets,
    }
    # If class_agnostic, convert to binary classes.
    if self._class_agnostic:
      classes = tf.where(tf.greater(classes, 0),
                         tf.ones_like(classes),
                         tf.zeros_like(classes))

    inputs['gt_boxes'] = input_utils.pad_to_fixed_size(
        boxes, self._max_num_instances, -1)
    inputs['gt_classes'] = input_utils.pad_to_fixed_size(
        classes, self._max_num_instances, -1)
    if self._include_mask:
      inputs['gt_masks'] = input_utils.pad_to_fixed_size(
          masks, self._max_num_instances, -1)

    return inputs, labels
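# Usage sketch (illustrative, not part of the original file): constructing the
# OLN-Mask parser with centerness targets enabled. With has_centerness=True,
# the packed 'rpn_box_targets' hold the lrtb targets from label_anchors_lrtb
# rather than the standard box regression targets. All argument values below
# are assumptions for the example only.
oln_parser = Parser(
    output_size=[640, 640], min_level=2, max_level=6, num_scales=1,
    aspect_ratios=[1.0, 2.0, 0.5], anchor_size=8.0, use_bfloat16=False,
    has_centerness=True, class_agnostic=True, train_class='voc',
    mode='train')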
models-2.13.1/official/legacy/detection/dataloader/retinanet_parser.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
T.-Y. Lin, P. Goyal, R. Girshick, K. He, and P. Dollar
Focal Loss for Dense Object Detection. arXiv:1708.02002
"""
import tensorflow as tf

from official.legacy.detection.dataloader import anchor
from official.legacy.detection.dataloader import mode_keys as ModeKeys
from official.legacy.detection.dataloader import tf_example_decoder
from official.legacy.detection.utils import box_utils
from official.legacy.detection.utils import input_utils


def process_source_id(source_id):
  """Processes source_id to the right format."""
  if source_id.dtype == tf.string:
    source_id = tf.cast(tf.strings.to_number(source_id), tf.int32)
  with tf.control_dependencies([source_id]):
    source_id = tf.cond(
        pred=tf.equal(tf.size(input=source_id), 0),
        true_fn=lambda: tf.cast(tf.constant(-1), tf.int32),
        false_fn=lambda: tf.identity(source_id))
  return source_id


def pad_groundtruths_to_fixed_size(gt, n):
  """Pads the first dimension of groundtruth labels to the fixed size."""
  gt['boxes'] = input_utils.pad_to_fixed_size(gt['boxes'], n, -1)
  gt['is_crowds'] = input_utils.pad_to_fixed_size(gt['is_crowds'], n, 0)
  gt['areas'] = input_utils.pad_to_fixed_size(gt['areas'], n, -1)
  gt['classes'] = input_utils.pad_to_fixed_size(gt['classes'], n, -1)
  return gt


class Parser(object):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               use_autoaugment=False,
               autoaugment_policy_name='v0',
               skip_crowd_during_training=True,
               max_num_instances=100,
               use_bfloat16=True,
               mode=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      match_threshold: `float` number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      use_autoaugment: `bool`, if True, use the AutoAugment augmentation
        policy during training.
      autoaugment_policy_name: `string` that specifies the name of the
        AutoAugment policy that will be used during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equal to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction
        or prediction with groundtruths in the outputs.
    """
    self._mode = mode
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    self._is_training = (mode == ModeKeys.TRAIN)

    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=False)

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Data Augmentation with AutoAugment.
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name

    # Device.
    self._use_bfloat16 = use_bfloat16

    # Data is parsed depending on the model Modekey.
    if mode == ModeKeys.TRAIN:
      self._parse_fn = self._parse_train_data
    elif mode == ModeKeys.EVAL:
      self._parse_fn = self._parse_eval_data
    elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT:
      self._parse_fn = self._parse_predict_data
    else:
      raise ValueError('mode is not defined.')

  def __call__(self, value):
    """Parses data to an image and associated training labels.

    Args:
      value: a string tensor holding a serialized tf.Example proto.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels:
        cls_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l
          and width_l represent the dimension of bounding box regression
          output at l-th level.
        num_positives: number of positive anchors in the image.
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        groundtruths:
          source_id: source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          boxes: groundtruth bounding box annotations. The box is represented
            in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
            fixed dimension [self._max_num_instances, 4].
          classes: groundtruth classes annotations. The tensor is padded with
            -1 to the fixed dimension [self._max_num_instances].
          areas: groundtruth areas annotations. The tensor is padded with -1
            to the fixed dimension [self._max_num_instances].
          is_crowds: groundtruth annotations to indicate if an annotation
            represents a group of instances by value {0, 1}. The tensor is
            padded with 0 to the fixed dimension [self._max_num_instances].
    """
    with tf.name_scope('parser'):
      data = self._example_decoder.decode(value)
      return self._parse_fn(data)

  def _parse_train_data(self, data):
    """Parses data for training and evaluation."""
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
      num_groundtrtuhs = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes = input_utils.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(self._output_size,
                                                    2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)

    # Assigns anchors.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size,
                                 (image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(input_anchor, self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(
        boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': input_anchor.multilevel_boxes,
        'num_positives': num_positives,
        'image_info': image_info,
    }
    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for training and evaluation."""
    groundtruths = {}
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(self._output_size,
                                                    2**self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)
    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)

    # Assigns anchors.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size,
                                 (image_height, image_width))
    anchor_labeler = anchor.AnchorLabeler(input_anchor, self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(
        boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    # Sets up groundtruth data for evaluation.
    groundtruths = {
        'source_id': data['source_id'],
        'num_groundtrtuhs': tf.shape(data['groundtruth_classes']),
        'image_info': image_info,
        'boxes': box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                             image_shape),
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    groundtruths['source_id'] = process_source_id(groundtruths['source_id'])
    groundtruths = pad_groundtruths_to_fixed_size(groundtruths,
                                                  self._max_num_instances)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': input_anchor.multilevel_boxes,
        'num_positives': num_positives,
        'image_info': image_info,
        'groundtruths': groundtruths,
    }
    return image, labels

  def _parse_predict_data(self, data):
    """Parses data for prediction."""
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(input=image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(self._output_size,
                                                    2**self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    # Compute Anchor boxes.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size,
                                 (image_height, image_width))

    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
    }
    # If mode is PREDICT_WITH_GT, returns groundtruths and training targets
    # in labels.
    if self._mode == ModeKeys.PREDICT_WITH_GT:
      # Converts boxes from normalized coordinates to pixel coordinates.
      boxes = box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                          image_shape)
      groundtruths = {
          'source_id': data['source_id'],
          'num_detections': tf.shape(data['groundtruth_classes']),
          'boxes': boxes,
          'classes': data['groundtruth_classes'],
          'areas': data['groundtruth_area'],
          'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
      }
      groundtruths['source_id'] = process_source_id(groundtruths['source_id'])
      groundtruths = pad_groundtruths_to_fixed_size(groundtruths,
                                                    self._max_num_instances)
      labels['groundtruths'] = groundtruths

      # Computes training objective for evaluation loss.
      classes = data['groundtruth_classes']

      image_scale = image_info[2, :]
      offset = image_info[3, :]
      boxes = input_utils.resize_and_crop_boxes(
          boxes, image_scale, image_info[1, :], offset)
      # Filters out ground truth boxes that are all zeros.
      indices = box_utils.get_non_empty_box_indices(boxes)
      boxes = tf.gather(boxes, indices)

      # Assigns anchors.
      anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                            self._match_threshold,
                                            self._unmatched_threshold)
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(
           boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
      labels['cls_targets'] = cls_targets
      labels['box_targets'] = box_targets
      labels['num_positives'] = num_positives
    return image, labels
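# Illustrative eager-mode check (not part of the original file) of the helper
# defined above: string source ids are parsed to int32, and an empty id tensor
# falls back to the default value -1.
print(process_source_id(tf.constant('12345')).numpy())             # 12345
print(process_source_id(tf.constant([], dtype=tf.int32)).numpy())  # -1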
models-2.13.1/official/legacy/detection/dataloader/shapemask_parser.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for ShapeMask.
Weicheng Kuo, Anelia Angelova, Jitendra Malik, Tsung-Yi Lin
ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors.
arXiv:1904.03239.
"""
import tensorflow as tf

from official.legacy.detection.dataloader import anchor
from official.legacy.detection.dataloader import mode_keys as ModeKeys
from official.legacy.detection.dataloader import tf_example_decoder
from official.legacy.detection.utils import box_utils
from official.legacy.detection.utils import class_utils
from official.legacy.detection.utils import dataloader_utils
from official.legacy.detection.utils import input_utils


def pad_to_size(input_tensor, size):
  """Pads data with zeros to a given length at the first dimension if needed.

  Args:
    input_tensor: `Tensor` with any dimension.
    size: `int` number for the first dimension of output Tensor.

  Returns:
    `Tensor` with the first dimension padded to `size` if the first dimension
    is less than `size`, otherwise no padding.
  """
  input_shape = tf.shape(input_tensor)
  padding_shape = []

  # Computes the padding length on the first dimension.
  padding_length = tf.maximum(0, size - tf.shape(input_tensor)[0])
  assert_length = tf.Assert(
      tf.greater_equal(padding_length, 0), [padding_length])
  with tf.control_dependencies([assert_length]):
    padding_shape.append(padding_length)

  # Copies shapes of the rest of input shape dimensions.
  for i in range(1, len(input_shape)):
    padding_shape.append(tf.shape(input=input_tensor)[i])

  # Pads input tensor to the fixed first dimension.
  paddings = tf.cast(tf.zeros(padding_shape), input_tensor.dtype)
  padded_tensor = tf.concat([input_tensor, paddings], axis=0)
  return padded_tensor


class Parser(object):
  """ShapeMask Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               use_category=True,
               outer_box_scale=1.0,
               box_jitter_scale=0.025,
               num_sampled_masks=8,
               mask_crop_size=32,
               mask_min_level=3,
               mask_max_level=5,
               upsample_factor=4,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               use_bfloat16=True,
               mask_train_class='all',
               mode=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      use_category: if `False`, treat all objects in all classes as one
        foreground category.
      outer_box_scale: `float` number in a range of [1.0, inf) representing
        the scale from object box to outer box. The mask branch predicts
        instance mask enclosed in outer box.
      box_jitter_scale: `float` number representing the noise magnitude to
        jitter the training groundtruth boxes for mask branch.
      num_sampled_masks: `int` number of sampled masks for training.
      mask_crop_size: `list` for [height, width] of output training masks.
      mask_min_level: `int` number indicating the minimum feature level to
        obtain instance features.
      mask_max_level: `int` number indicating the maximum feature level to
        obtain instance features.
      upsample_factor: `int` factor of upsampling the fine mask predictions.
      match_threshold: `float` number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equal to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
      mask_train_class: a string of experiment mode: `all`, `voc` or `nonvoc`.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction
        or prediction with groundtruths in the outputs.
    """
    self._mode = mode
    self._mask_train_class = mask_train_class
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    self._is_training = (mode == ModeKeys.TRAIN)

    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=True)

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Device.
    self._use_bfloat16 = use_bfloat16

    # ShapeMask specific.
    # Control of which category to use.
    self._use_category = use_category
    self._num_sampled_masks = num_sampled_masks
    self._mask_crop_size = mask_crop_size
    self._mask_min_level = mask_min_level
    self._mask_max_level = mask_max_level
    self._outer_box_scale = outer_box_scale
    self._box_jitter_scale = box_jitter_scale
    self._up_sample_factor = upsample_factor

    # Data is parsed depending on the model Modekey.
    if mode == ModeKeys.TRAIN:
      self._parse_fn = self._parse_train_data
    elif mode == ModeKeys.EVAL:
      self._parse_fn = self._parse_eval_data
    elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT:
      self._parse_fn = self._parse_predict_data
    else:
      raise ValueError('mode is not defined.')

  def __call__(self, value):
    """Parses data to an image and associated training labels.

    Args:
      value: a string tensor holding a serialized tf.Example proto.

    Returns:
      inputs:
        image: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        mask_boxes: sampled boxes that tightly enclose the training masks. The
          box is represented in [y1, x1, y2, x2] format. The tensor is sampled
          to the fixed dimension [self._num_sampled_masks, 4].
        mask_outer_boxes: loose box that encloses the sampled tight box. The
          box is represented in [y1, x1, y2, x2] format. The tensor is sampled
          to the fixed dimension [self._num_sampled_masks, 4].
        mask_classes: the class ids of sampled training masks. The tensor has
          shape [self._num_sampled_masks].
      labels:
        cls_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l
          and width_l represent the dimension of bounding box regression
          output at l-th level.
        num_positives: number of positive anchors in the image.
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        image_scale: 2D float `Tensor` representing scale factors that apply
          to [height, width] of input image.
        mask_targets: training binary mask targets. The tensor has shape
          [self._num_sampled_masks, self._mask_crop_size,
          self._mask_crop_size].
        mask_is_valid: the binary tensor to indicate if the sampled masks are
          valid. The sampled masks are invalid when no mask annotations are
          included in the image. The tensor has shape [1].
      groundtruths:
        source_id: source image id. Default value -1 if the source id is empty
          in the groundtruth annotation.
        boxes: groundtruth bounding box annotations. The box is represented in
          [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
          dimension [self._max_num_instances, 4].
        classes: groundtruth classes annotations. The tensor is padded with
          -1 to the fixed dimension [self._max_num_instances].
        areas: groundtruth areas annotations. The tensor is padded with -1
          to the fixed dimension [self._max_num_instances].
        is_crowds: groundtruth annotations to indicate if an annotation
          represents a group of instances by value {0, 1}. The tensor is
          padded with 0 to the fixed dimension [self._max_num_instances].
    """
    with tf.name_scope('parser'):
      data = self._example_decoder.decode(value)
      return self._parse_fn(data)

  def _parse_train_data(self, data):
    """Parse data for ShapeMask training."""
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    masks = data['groundtruth_instance_masks']
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
      num_groundtrtuhs = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # If not using category, makes all categories with id = 0.
    if not self._use_category:
      classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes, masks = input_utils.random_horizontal_flip(
          image, boxes, masks)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        self._output_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Resizes and crops boxes and masks.
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    masks = tf.gather(masks, indices)

    # Assigns anchors.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size, self._output_size)
    anchor_labeler = anchor.AnchorLabeler(input_anchor, self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(
        boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

    # Sample groundtruth masks/boxes/classes for mask branch.
    num_masks = tf.shape(masks)[0]
    mask_shape = tf.shape(masks)[1:3]

    # Pad sampled boxes/masks/classes to a constant batch size.
    padded_boxes = pad_to_size(boxes, self._num_sampled_masks)
    padded_classes = pad_to_size(classes, self._num_sampled_masks)
    padded_masks = pad_to_size(masks, self._num_sampled_masks)

    # Randomly sample groundtruth masks for mask branch training. For the
    # image without groundtruth masks, it will sample the dummy padded
    # tensors.
    rand_indices = tf.random.shuffle(
        tf.range(tf.maximum(num_masks, self._num_sampled_masks)))
    rand_indices = tf.math.mod(rand_indices, tf.maximum(num_masks, 1))
    rand_indices = rand_indices[0:self._num_sampled_masks]
    rand_indices = tf.reshape(rand_indices, [self._num_sampled_masks])

    sampled_boxes = tf.gather(padded_boxes, rand_indices)
    sampled_classes = tf.gather(padded_classes, rand_indices)
    sampled_masks = tf.gather(padded_masks, rand_indices)
    # Jitter the sampled boxes to mimic the noisy detections.
    sampled_boxes = box_utils.jitter_boxes(
        sampled_boxes, noise_scale=self._box_jitter_scale)
    sampled_boxes = box_utils.clip_boxes(sampled_boxes, self._output_size)
    # Compute mask targets in feature crop. A feature crop fully contains a
    # sampled box.
    mask_outer_boxes = box_utils.compute_outer_boxes(
        sampled_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale)
    mask_outer_boxes = box_utils.clip_boxes(mask_outer_boxes,
                                            self._output_size)
    # Compensate the offset of mask_outer_boxes to map it back to original
    # image scale.
    mask_outer_boxes_ori = mask_outer_boxes
    mask_outer_boxes_ori += tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    mask_outer_boxes_ori /= tf.tile(
        tf.expand_dims(image_scale, axis=0), [1, 2])
    norm_mask_outer_boxes_ori = box_utils.normalize_boxes(
        mask_outer_boxes_ori, mask_shape)

    # Set sampled_masks shape to [batch_size, height, width, 1].
    sampled_masks = tf.cast(
        tf.expand_dims(sampled_masks, axis=-1), tf.float32)
    mask_targets = tf.image.crop_and_resize(
        sampled_masks,
        norm_mask_outer_boxes_ori,
        box_indices=tf.range(self._num_sampled_masks),
        crop_size=[self._mask_crop_size, self._mask_crop_size],
        method='bilinear',
        extrapolation_value=0,
        name='train_mask_targets')
    mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5),
                            tf.ones_like(mask_targets),
                            tf.zeros_like(mask_targets))
    mask_targets = tf.squeeze(mask_targets, axis=-1)
    if self._up_sample_factor > 1:
      fine_mask_targets = tf.image.crop_and_resize(
          sampled_masks,
          norm_mask_outer_boxes_ori,
          box_indices=tf.range(self._num_sampled_masks),
          crop_size=[
              self._mask_crop_size * self._up_sample_factor,
              self._mask_crop_size * self._up_sample_factor
          ],
          method='bilinear',
          extrapolation_value=0,
          name='train_mask_targets')
      fine_mask_targets = tf.where(
          tf.greater_equal(fine_mask_targets, 0.5),
          tf.ones_like(fine_mask_targets),
          tf.zeros_like(fine_mask_targets))
      fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1)
    else:
      fine_mask_targets = mask_targets

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32)
    if self._mask_train_class == 'all':
      mask_is_valid = valid_image * tf.ones_like(sampled_classes, tf.int32)
    else:
      # Get the intersection of sampled classes with training splits.
      mask_valid_classes = tf.cast(
          tf.expand_dims(
              class_utils.coco_split_class_ids(self._mask_train_class), 1),
          sampled_classes.dtype)
      match = tf.reduce_any(
          tf.equal(tf.expand_dims(sampled_classes, 0), mask_valid_classes), 0)
      mask_is_valid = valid_image * tf.cast(match, tf.int32)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': input_anchor.multilevel_boxes,
        'num_positives': num_positives,
        'image_info': image_info,
        # For ShapeMask.
        'mask_targets': mask_targets,
        'fine_mask_targets': fine_mask_targets,
        'mask_is_valid': mask_is_valid,
    }

    inputs = {
        'image': image,
        'image_info': image_info,
        'mask_boxes': sampled_boxes,
        'mask_outer_boxes': mask_outer_boxes,
        'mask_classes': sampled_classes,
    }
    return inputs, labels

  def _parse_predict_data(self, data):
    """Parse data for ShapeMask prediction."""
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    masks = data['groundtruth_instance_masks']

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # If not using category, makes all categories with id = 0.
    if not self._use_category:
      classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        self._output_size,
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Resizes and crops boxes and masks.
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)
    masks = input_utils.resize_and_crop_masks(
        tf.expand_dims(masks, axis=-1), image_scale, self._output_size,
        offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)

    # Assigns anchors.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size, self._output_size)
    anchor_labeler = anchor.AnchorLabeler(input_anchor, self._match_threshold,
                                          self._unmatched_threshold)

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
    }
    if self._mode == ModeKeys.PREDICT_WITH_GT:
      # Converts boxes from normalized coordinates to pixel coordinates.
      groundtruths = {
          'source_id': data['source_id'],
          'height': data['height'],
          'width': data['width'],
          'num_detections': tf.shape(data['groundtruth_classes']),
          'boxes': box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                               image_shape),
          'classes': data['groundtruth_classes'],
          # 'masks': tf.squeeze(masks, axis=-1),
          'areas': data['groundtruth_area'],
          'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
      }
      groundtruths['source_id'] = dataloader_utils.process_source_id(
          groundtruths['source_id'])
      groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
          groundtruths, self._max_num_instances)
      # Computes training labels.
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(
           boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
      # Packs labels for model_fn outputs.
      labels.update({
          'cls_targets': cls_targets,
          'box_targets': box_targets,
          'num_positives': num_positives,
          'groundtruths': groundtruths,
      })

    inputs = {
        'image': image,
        'image_info': image_info,
    }
    return inputs, labels
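# Illustrative eager-mode check (not part of the original file) of pad_to_size
# defined above: a [2, 4] box tensor padded to the 8 masks sampled for the
# mask branch becomes [8, 4], with zero rows appended.
demo_boxes = tf.constant([[0., 0., 10., 10.], [5., 5., 20., 20.]])
print(pad_to_size(demo_boxes, 8).shape)  # (8, 4)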
models-2.13.1/official/legacy/detection/dataloader/tf_example_decoder.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import tensorflow as tf


class TfExampleDecoder(object):
  """Tensorflow Example proto decoder."""

  def __init__(self, include_mask=False):
    self._include_mask = include_mask
    self._keys_to_features = {
        'image/encoded': tf.io.FixedLenFeature((), tf.string),
        'image/source_id': tf.io.FixedLenFeature((), tf.string),
        'image/height': tf.io.FixedLenFeature((), tf.int64),
        'image/width': tf.io.FixedLenFeature((), tf.int64),
        'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
        'image/object/class/label': tf.io.VarLenFeature(tf.int64),
        'image/object/area': tf.io.VarLenFeature(tf.float32),
        'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
    }
    if include_mask:
      self._keys_to_features.update({
          'image/object/mask': tf.io.VarLenFeature(tf.string),
      })

  def _decode_image(self, parsed_tensors):
    """Decodes the image and set its static shape."""
    image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
    image.set_shape([None, None, 3])
    return image

  def _decode_boxes(self, parsed_tensors):
    """Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    return tf.stack([ymin, xmin, ymax, xmax], axis=-1)

  def _decode_masks(self, parsed_tensors):
    """Decode a set of PNG masks to the tf.float32 tensors."""

    def _decode_png_mask(png_bytes):
      mask = tf.squeeze(
          tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
      mask = tf.cast(mask, dtype=tf.float32)
      mask.set_shape([None, None])
      return mask

    height = parsed_tensors['image/height']
    width = parsed_tensors['image/width']
    masks = parsed_tensors['image/object/mask']
    return tf.cond(
        pred=tf.greater(tf.size(input=masks), 0),
        true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
        false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))

  def _decode_areas(self, parsed_tensors):
    xmin = parsed_tensors['image/object/bbox/xmin']
    xmax = parsed_tensors['image/object/bbox/xmax']
    ymin = parsed_tensors['image/object/bbox/ymin']
    ymax = parsed_tensors['image/object/bbox/ymax']
    return tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
        lambda: parsed_tensors['image/object/area'],
        lambda: (xmax - xmin) * (ymax - ymin))

  def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
      serialized_example: a single serialized tf.Example string.

    Returns:
      decoded_tensors: a dictionary of tensors with the following fields:
        - image: a uint8 tensor of shape [None, None, 3].
        - source_id: a string scalar tensor.
        - height: an integer scalar tensor.
        - width: an integer scalar tensor.
        - groundtruth_classes: an int64 tensor of shape [None].
        - groundtruth_is_crowd: a bool tensor of shape [None].
        - groundtruth_area: a float32 tensor of shape [None].
        - groundtruth_boxes: a float32 tensor of shape [None, 4].
        - groundtruth_instance_masks: a float32 tensor of shape
            [None, None, None].
        - groundtruth_instance_masks_png: a string tensor of shape [None].
    """
    parsed_tensors = tf.io.parse_single_example(
        serialized=serialized_example, features=self._keys_to_features)
    for k in parsed_tensors:
      if isinstance(parsed_tensors[k], tf.SparseTensor):
        if parsed_tensors[k].dtype == tf.string:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value='')
        else:
          parsed_tensors[k] = tf.sparse.to_dense(
              parsed_tensors[k], default_value=0)

    image = self._decode_image(parsed_tensors)
    boxes = self._decode_boxes(parsed_tensors)
    areas = self._decode_areas(parsed_tensors)
    is_crowds = tf.cond(
        tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
        lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
        lambda: tf.zeros_like(parsed_tensors['image/object/class/label'],
                              dtype=tf.bool))  # pylint: disable=line-too-long
    if self._include_mask:
      masks = self._decode_masks(parsed_tensors)

    decoded_tensors = {
        'image': image,
        'source_id': parsed_tensors['image/source_id'],
        'height': parsed_tensors['image/height'],
        'width': parsed_tensors['image/width'],
        'groundtruth_classes': parsed_tensors['image/object/class/label'],
        'groundtruth_is_crowd': is_crowds,
        'groundtruth_area': areas,
        'groundtruth_boxes': boxes,
    }
    if self._include_mask:
      decoded_tensors.update({
          'groundtruth_instance_masks': masks,
          'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
      })
    return decoded_tensors
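
A minimal usage sketch of the decoder above, wiring it into a tf.data pipeline. The TFRecord path is a placeholder, not part of this module, and the snippet assumes TfExampleDecoder is importable from this file:

import tensorflow as tf

decoder = TfExampleDecoder(include_mask=False)

# 'eval-00000-of-00001.tfrecord' is a hypothetical file of serialized
# tf.Example protos in the format the decoder expects.
dataset = tf.data.TFRecordDataset('eval-00000-of-00001.tfrecord')
dataset = dataset.map(decoder.decode, num_parallel_calls=tf.data.AUTOTUNE)

for example in dataset.take(1):
  # Each element is a dict of dense tensors, e.g. the decoded image and the
  # groundtruth boxes in [ymin, xmin, ymax, xmax] order.
  print(example['image'].shape, example['groundtruth_boxes'].shape)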
models-2.13.1/official/legacy/detection/evaluation/__init__.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models-2.13.1/official/legacy/detection/evaluation/coco_evaluator.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The COCO-style evaluator.
The following snippet demonstrates the use of interfaces:
evaluator = COCOEvaluator(...)
for _ in range(num_evals):
for _ in range(num_batches_per_eval):
predictions, groundtruth = predictor.predict(...) # pop a batch.
evaluator.update(predictions, groundtruths) # aggregate internal stats.
evaluator.evaluate() # finish one full eval.
See also: https://github.com/cocodataset/cocoapi/
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
atexit
import
copy
import
tempfile
from
absl
import
logging
import
numpy
as
np
from
pycocotools
import
cocoeval
import
six
import
tensorflow
as
tf
from
official.legacy.detection.evaluation
import
coco_utils
from
official.legacy.detection.utils
import
class_utils
class OlnCOCOevalWrapper(cocoeval.COCOeval):
  """COCOeval wrapper class.

  Rewritten based on cocoapi: (pycocotools/cocoeval.py)

  This class wraps the COCOEVAL API object, which provides the following
  additional functionalities:
    1. summarize 'all', 'seen', and 'novel' split output print-out, e.g., AR at
       different K proposals, AR and AP results for 'seen' and 'novel' class
       splits.
  """

  def __init__(self, coco_gt, coco_dt, iou_type='box'):
    super(OlnCOCOevalWrapper, self).__init__(
        cocoGt=coco_gt, cocoDt=coco_dt, iouType=iou_type)

  def summarize(self):
    """Compute and display summary metrics for evaluation results.

    Delta to the standard cocoapi function:
      More Average Recall metrics are produced with different top-K proposals.
    Note this function can *only* be applied on the default parameter setting.

    Raises:
      Exception: Please run accumulate() first.
    """

    def _summarize(ap=1, iou_thr=None, area_rng='all', max_dets=100):
      p = self.params
      i_str = (' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = '
               '{:0.3f}')
      title_str = 'Average Precision' if ap == 1 else 'Average Recall'
      type_str = '(AP)' if ap == 1 else '(AR)'
      iou_str = '{:0.2f}:{:0.2f}'.format(
          p.iouThrs[0],
          p.iouThrs[-1]) if iou_thr is None else '{:0.2f}'.format(iou_thr)
      aind = [i for i, a_rng in enumerate(p.areaRngLbl) if a_rng == area_rng]
      mind = [i for i, m_det in enumerate(p.maxDets) if m_det == max_dets]

      if ap == 1:
        # dimension of precision: [TxRxKxAxM]
        s = self.eval['precision']
        # IoU
        if iou_thr is not None:
          t = np.where(iou_thr == p.iouThrs)[0]
          s = s[t]
        s = s[:, :, :, aind, mind]
      else:
        # dimension of recall: [TxKxAxM]
        s = self.eval['recall']
        if iou_thr is not None:
          t = np.where(iou_thr == p.iouThrs)[0]
          s = s[t]
        s = s[:, :, aind, mind]
      if not (s[s > -1]).any():
        mean_s = -1
      else:
        mean_s = np.mean(s[s > -1])
      print(i_str.format(title_str, type_str, iou_str, area_rng, max_dets,
                         mean_s))
      return mean_s

    def _summarize_dets():
      stats = np.zeros((14,))
      stats[0] = _summarize(1)
      stats[1] = _summarize(1, iou_thr=.5,)
      stats[2] = _summarize(1, iou_thr=.75,)
      stats[3] = _summarize(1, area_rng='small',)
      stats[4] = _summarize(1, area_rng='medium',)
      stats[5] = _summarize(1, area_rng='large',)
      stats[6] = _summarize(0, max_dets=self.params.maxDets[0])  # 10
      stats[7] = _summarize(0, max_dets=self.params.maxDets[1])  # 20
      stats[8] = _summarize(0, max_dets=self.params.maxDets[2])  # 50
      stats[9] = _summarize(0, max_dets=self.params.maxDets[3])  # 100
      stats[10] = _summarize(0, max_dets=self.params.maxDets[4])  # 200
      stats[11] = _summarize(0, area_rng='small', max_dets=10)
      stats[12] = _summarize(0, area_rng='medium', max_dets=10)
      stats[13] = _summarize(0, area_rng='large', max_dets=10)
      return stats

    if not self.eval:
      raise Exception('Please run accumulate() first')
    summarize = _summarize_dets
    self.stats = summarize()
class OlnCOCOevalXclassWrapper(OlnCOCOevalWrapper):
  """COCOeval wrapper class.

  Rewritten based on cocoapi: (pycocotools/cocoeval.py)
  Delta to the standard cocoapi:
    Detections that hit the 'seen' class objects are ignored in top-K proposals.

  This class wraps the COCOEVAL API object, which provides the following
  additional functionalities:
    1. Include ignore-class split (e.g., 'voc' or 'nonvoc').
    2. Do not count (or ignore) box proposals hitting ignore-class when
       evaluating Average Recall at top-K proposals.
  """

  def __init__(self, coco_gt, coco_dt, iou_type='box'):
    super(OlnCOCOevalXclassWrapper, self).__init__(
        coco_gt=coco_gt, coco_dt=coco_dt, iou_type=iou_type)

  def evaluateImg(self, img_id, cat_id, a_rng, max_det):
    p = self.params
    if p.useCats:
      gt = self._gts[img_id, cat_id]
      dt = self._dts[img_id, cat_id]
    else:
      gt, dt = [], []
      for c_id in p.catIds:
        gt.extend(self._gts[img_id, c_id])
        dt.extend(self._dts[img_id, c_id])

    if not gt and not dt:
      return None

    for g in gt:
      if g['ignore'] or (g['area'] < a_rng[0] or g['area'] > a_rng[1]):
        g['_ignore'] = 1
      else:
        g['_ignore'] = 0
      # Class manipulation: ignore the 'ignored_split'.
      if 'ignored_split' in g and g['ignored_split'] == 1:
        g['_ignore'] = 1

    # sort dt highest score first, sort gt ignore last
    gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
    gt = [gt[i] for i in gtind]
    dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
    dt = [dt[i] for i in dtind[0:max_det]]
    iscrowd = [int(o['iscrowd']) for o in gt]
    # load computed ious
    # ious = self.ious[img_id, cat_id][:, gtind] if len(
    #     self.ious[img_id, cat_id]) > 0 else self.ious[img_id, cat_id]
    if self.ious[img_id, cat_id].any():
      ious = self.ious[img_id, cat_id][:, gtind]
    else:
      ious = self.ious[img_id, cat_id]

    tt = len(p.iouThrs)
    gg = len(gt)
    dd = len(dt)
    gtm = np.zeros((tt, gg))
    dtm = np.zeros((tt, dd))
    gt_ig = np.array([g['_ignore'] for g in gt])
    dt_ig = np.zeros((tt, dd))
    # indicator of whether the gt object class is of ignored_split or not.
    gt_ig_split = np.array([g['ignored_split'] for g in gt])
    dt_ig_split = np.zeros((dd))

    if ious.any():
      for tind, t in enumerate(p.iouThrs):
        for dind, d in enumerate(dt):
          # information about best match so far (m=-1 -> unmatched)
          iou = min([t, 1 - 1e-10])
          m = -1
          for gind, g in enumerate(gt):
            # if this gt already matched, and not a crowd, continue
            if gtm[tind, gind] > 0 and not iscrowd[gind]:
              continue
            # if dt matched to reg gt, and on ignore gt, stop
            if m > -1 and gt_ig[m] == 0 and gt_ig[gind] == 1:
              break
            # continue to next gt unless better match made
            if ious[dind, gind] < iou:
              continue
            # if match successful and best so far, store appropriately
            iou = ious[dind, gind]
            m = gind
          # if match made store id of match for both dt and gt
          if m == -1:
            continue
          dt_ig[tind, dind] = gt_ig[m]
          dtm[tind, dind] = gt[m]['id']
          gtm[tind, m] = d['id']

          # Activate to ignore the seen-class detections.
          if tind == 0:
            # Register just only once: tind > 0 is also fine.
            dt_ig_split[dind] = gt_ig_split[m]

    # set unmatched detections outside of area range to ignore
    a = np.array([d['area'] < a_rng[0] or d['area'] > a_rng[1] for d in dt
                 ]).reshape((1, len(dt)))
    dt_ig = np.logical_or(dt_ig, np.logical_and(dtm == 0, np.repeat(a, tt, 0)))

    # Activate to ignore the seen-class detections.
    # Take only eval_split (eg, nonvoc) and ignore seen_split (eg, voc).
    if dt_ig_split.sum() > 0:
      dtm = dtm[:, dt_ig_split == 0]
      dt_ig = dt_ig[:, dt_ig_split == 0]
      len_dt = min(max_det, len(dt))
      dt = [dt[i] for i in range(len_dt) if dt_ig_split[i] == 0]

    # store results for given image and category
    return {
        'image_id': img_id,
        'category_id': cat_id,
        'aRng': a_rng,
        'maxDet': max_det,
        'dtIds': [d['id'] for d in dt],
        'gtIds': [g['id'] for g in gt],
        'dtMatches': dtm,
        'gtMatches': gtm,
        'dtScores': [d['score'] for d in dt],
        'gtIgnore': gt_ig,
        'dtIgnore': dt_ig,
    }
class MetricWrapper(object):
  """Metric Wrapper of the COCO evaluator."""
  # This is only a wrapper for the COCO metric and works on numpy arrays, so it
  # doesn't inherit from tf.keras.layers.Layer or tf.keras.metrics.Metric.

  def __init__(self, evaluator):
    self._evaluator = evaluator

  def update_state(self, y_true, y_pred):
    """Update internal states."""
    labels = tf.nest.map_structure(lambda x: x.numpy(), y_true)
    outputs = tf.nest.map_structure(lambda x: x.numpy(), y_pred)
    groundtruths = {}
    predictions = {}
    for key, val in outputs.items():
      if isinstance(val, tuple):
        val = np.concatenate(val)
      predictions[key] = val
    for key, val in labels.items():
      if isinstance(val, tuple):
        val = np.concatenate(val)
      groundtruths[key] = val
    self._evaluator.update(predictions, groundtruths)

  def result(self):
    return self._evaluator.evaluate()

  def reset_states(self):
    return self._evaluator.reset()
class COCOEvaluator(object):
  """COCO evaluation metric class."""

  def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      annotation_file: a JSON file that stores annotations of the eval dataset.
        If `annotation_file` is None, groundtruth annotations will be loaded
        from the dataloader.
      include_mask: a boolean to indicate whether or not to include the mask
        eval.
      need_rescale_bboxes: If true bboxes in `predictions` will be rescaled
        back to absolute values (`image_info` is needed in this case).
    """
    if annotation_file:
      if annotation_file.startswith('gs://'):
        _, local_val_json = tempfile.mkstemp(suffix='.json')
        tf.io.gfile.remove(local_val_json)

        tf.io.gfile.copy(annotation_file, local_val_json)
        atexit.register(tf.io.gfile.remove, local_val_json)
      else:
        local_val_json = annotation_file
      self._coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if include_mask else 'box'),
          annotation_file=local_val_json)
    self._annotation_file = annotation_file
    self._include_mask = include_mask
    self._metric_names = [
        'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10',
        'ARmax100', 'ARs', 'ARm', 'ARl'
    ]
    self._required_prediction_fields = [
        'source_id', 'num_detections', 'detection_classes', 'detection_scores',
        'detection_boxes'
    ]
    self._need_rescale_bboxes = need_rescale_bboxes
    if self._need_rescale_bboxes:
      self._required_prediction_fields.append('image_info')
    self._required_groundtruth_fields = [
        'source_id', 'height', 'width', 'classes', 'boxes'
    ]
    if self._include_mask:
      mask_metric_names = ['mask_' + x for x in self._metric_names]
      self._metric_names.extend(mask_metric_names)
      self._required_prediction_fields.extend(['detection_masks'])
      self._required_groundtruth_fields.extend(['masks'])

    self.reset()

  def reset(self):
    """Resets internal states for a fresh run."""
    self._predictions = {}
    if not self._annotation_file:
      self._groundtruths = {}

  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      logging.info('There is no annotation_file in COCOEvaluator.')
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      logging.info('Using annotation file: %s', self._annotation_file)
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]

    coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
      mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      mask_coco_metrics = mcoco_eval.stats

    if self._include_mask:
      metrics = np.hstack((coco_metrics, mask_coco_metrics))
    else:
      metrics = coco_metrics

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict

  def _process_predictions(self, predictions):
    image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
    predictions['detection_boxes'] = (
        predictions['detection_boxes'].astype(np.float32))
    predictions['detection_boxes'] /= image_scale
    if 'detection_outer_boxes' in predictions:
      predictions['detection_outer_boxes'] = (
          predictions['detection_outer_boxes'].astype(np.float32))
      predictions['detection_outer_boxes'] /= image_scale

  def update(self, predictions, groundtruths=None):
    """Update and aggregate detection results and groundtruth data.

    Args:
      predictions: a dictionary of numpy arrays including the fields below. See
        different parsers under `../dataloader` for more details.
        Required fields:
          - source_id: a numpy array of int or string of shape [batch_size].
          - image_info [if `need_rescale_bboxes` is True]: a numpy array of
            float of shape [batch_size, 4, 2].
          - num_detections: a numpy array of int of shape [batch_size].
          - detection_boxes: a numpy array of float of shape
            [batch_size, K, 4].
          - detection_classes: a numpy array of int of shape [batch_size, K].
          - detection_scores: a numpy array of float of shape [batch_size, K].
        Optional fields:
          - detection_masks: a numpy array of float of shape [batch_size, K,
            mask_height, mask_width].
      groundtruths: a dictionary of numpy arrays including the fields below.
        See also different parsers under `../dataloader` for more details.
        Required fields:
          - source_id: a numpy array of int or string of shape [batch_size].
          - height: a numpy array of int of shape [batch_size].
          - width: a numpy array of int of shape [batch_size].
          - num_detections: a numpy array of int of shape [batch_size].
          - boxes: a numpy array of float of shape [batch_size, K, 4].
          - classes: a numpy array of int of shape [batch_size, K].
        Optional fields:
          - is_crowds: a numpy array of int of shape [batch_size, K]. If the
            field is absent, it is assumed that this instance is not crowd.
          - areas: a numpy array of float of shape [batch_size, K]. If the
            field is absent, the area is calculated using either boxes or
            masks depending on which one is available.
          - masks: a numpy array of float of shape [batch_size, K, mask_height,
            mask_width],

    Raises:
      ValueError: if the required prediction or groundtruth fields are not
        present in the incoming `predictions` or `groundtruths`.
    """
    for k in self._required_prediction_fields:
      if k not in predictions:
        raise ValueError(
            'Missing the required key `{}` in predictions!'.format(k))
    if self._need_rescale_bboxes:
      self._process_predictions(predictions)
    for k, v in six.iteritems(predictions):
      if k not in self._predictions:
        self._predictions[k] = [v]
      else:
        self._predictions[k].append(v)

    if not self._annotation_file:
      assert groundtruths
      for k in self._required_groundtruth_fields:
        if k not in groundtruths:
          raise ValueError(
              'Missing the required key `{}` in groundtruths!'.format(k))
      for k, v in six.iteritems(groundtruths):
        if k not in self._groundtruths:
          self._groundtruths[k] = [v]
        else:
          self._groundtruths[k].append(v)
class OlnXclassEvaluator(COCOEvaluator):
  """COCO evaluation metric class."""

  def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True,
               use_category=True, seen_class='all'):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      annotation_file: a JSON file that stores annotations of the eval dataset.
        If `annotation_file` is None, groundtruth annotations will be loaded
        from the dataloader.
      include_mask: a boolean to indicate whether or not to include the mask
        eval.
      need_rescale_bboxes: If true bboxes in `predictions` will be rescaled
        back to absolute values (`image_info` is needed in this case).
      use_category: if `False`, treat all objects in all classes as one
        foreground category.
      seen_class: 'all' or 'voc' or 'nonvoc'
    """
    super(OlnXclassEvaluator, self).__init__(
        annotation_file=annotation_file,
        include_mask=include_mask,
        need_rescale_bboxes=need_rescale_bboxes)
    self._use_category = use_category
    self._seen_class = seen_class
    self._seen_class_ids = class_utils.coco_split_class_ids(seen_class)
    self._metric_names = [
        'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax10', 'ARmax20',
        'ARmax50', 'ARmax100', 'ARmax200', 'ARmax10s', 'ARmax10m', 'ARmax10l'
    ]
    if self._seen_class != 'all':
      self._metric_names.extend([
          'AP_seen', 'AP50_seen', 'AP75_seen', 'APs_seen', 'APm_seen',
          'APl_seen', 'ARmax10_seen', 'ARmax20_seen', 'ARmax50_seen',
          'ARmax100_seen', 'ARmax200_seen', 'ARmax10s_seen', 'ARmax10m_seen',
          'ARmax10l_seen',
          'AP_novel', 'AP50_novel', 'AP75_novel', 'APs_novel', 'APm_novel',
          'APl_novel', 'ARmax10_novel', 'ARmax20_novel', 'ARmax50_novel',
          'ARmax100_novel', 'ARmax200_novel', 'ARmax10s_novel',
          'ARmax10m_novel', 'ARmax10l_novel',
      ])
    if self._include_mask:
      mask_metric_names = ['mask_' + x for x in self._metric_names]
      self._metric_names.extend(mask_metric_names)
      self._required_prediction_fields.extend(['detection_masks'])
      self._required_groundtruth_fields.extend(['masks'])

    self.reset()

  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      logging.info('There is no annotation_file in COCOEvaluator.')
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      logging.info('Using annotation file: %s', self._annotation_file)
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]
    # Class manipulation: 'all' split samples -> ignored_split = 0.
    for idx, ann in enumerate(coco_gt.dataset['annotations']):
      coco_gt.dataset['annotations'][idx]['ignored_split'] = 0
    coco_eval = cocoeval.OlnCOCOevalXclassWrapper(
        coco_gt, coco_dt, iou_type='bbox')

    coco_eval.params.maxDets = [10, 20, 50, 100, 200]
    coco_eval.params.imgIds = image_ids
    coco_eval.params.useCats = 0 if not self._use_category else 1
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
      mcoco_eval = cocoeval.OlnCOCOevalXclassWrapper(
          coco_gt, coco_dt, iou_type='segm')
      mcoco_eval.params.maxDets = [10, 20, 50, 100, 200]
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.params.useCats = 0 if not self._use_category else 1
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      mask_coco_metrics = mcoco_eval.stats
    if self._include_mask:
      metrics = np.hstack((coco_metrics, mask_coco_metrics))
    else:
      metrics = coco_metrics

    if self._seen_class != 'all':
      # for the seen-class eval, samples of novel_class are ignored.
      coco_gt_seen = copy.deepcopy(coco_gt)
      for idx, ann in enumerate(coco_gt.dataset['annotations']):
        if ann['category_id'] in self._seen_class_ids:
          coco_gt_seen.dataset['annotations'][idx]['ignored_split'] = 0
        else:
          coco_gt_seen.dataset['annotations'][idx]['ignored_split'] = 1
      coco_eval_seen = cocoeval.OlnCOCOevalXclassWrapper(
          coco_gt_seen, coco_dt, iou_type='bbox')
      coco_eval_seen.params.maxDets = [10, 20, 50, 100, 200]
      coco_eval_seen.params.imgIds = image_ids
      coco_eval_seen.params.useCats = 0 if not self._use_category else 1
      coco_eval_seen.evaluate()
      coco_eval_seen.accumulate()
      coco_eval_seen.summarize()
      coco_metrics_seen = coco_eval_seen.stats
      if self._include_mask:
        mcoco_eval_seen = cocoeval.OlnCOCOevalXclassWrapper(
            coco_gt_seen, coco_dt, iou_type='segm')
        mcoco_eval_seen.params.maxDets = [10, 20, 50, 100, 200]
        mcoco_eval_seen.params.imgIds = image_ids
        mcoco_eval_seen.params.useCats = 0 if not self._use_category else 1
        mcoco_eval_seen.evaluate()
        mcoco_eval_seen.accumulate()
        mcoco_eval_seen.summarize()
        mask_coco_metrics_seen = mcoco_eval_seen.stats

      # for the novel-class eval, samples of seen_class are ignored.
      coco_gt_novel = copy.deepcopy(coco_gt)
      for idx, ann in enumerate(coco_gt.dataset['annotations']):
        if ann['category_id'] in self._seen_class_ids:
          coco_gt_novel.dataset['annotations'][idx]['ignored_split'] = 1
        else:
          coco_gt_novel.dataset['annotations'][idx]['ignored_split'] = 0
      coco_eval_novel = cocoeval.OlnCOCOevalXclassWrapper(
          coco_gt_novel, coco_dt, iou_type='bbox')
      coco_eval_novel.params.maxDets = [10, 20, 50, 100, 200]
      coco_eval_novel.params.imgIds = image_ids
      coco_eval_novel.params.useCats = 0 if not self._use_category else 1
      coco_eval_novel.evaluate()
      coco_eval_novel.accumulate()
      coco_eval_novel.summarize()
      coco_metrics_novel = coco_eval_novel.stats
      if self._include_mask:
        mcoco_eval_novel = cocoeval.OlnCOCOevalXclassWrapper(
            coco_gt_novel, coco_dt, iou_type='segm')
        mcoco_eval_novel.params.maxDets = [10, 20, 50, 100, 200]
        mcoco_eval_novel.params.imgIds = image_ids
        mcoco_eval_novel.params.useCats = 0 if not self._use_category else 1
        mcoco_eval_novel.evaluate()
        mcoco_eval_novel.accumulate()
        mcoco_eval_novel.summarize()
        mask_coco_metrics_novel = mcoco_eval_novel.stats

      # Combine all splits.
      if self._include_mask:
        metrics = np.hstack((coco_metrics, coco_metrics_seen,
                             coco_metrics_novel, mask_coco_metrics,
                             mask_coco_metrics_seen, mask_coco_metrics_novel))
      else:
        metrics = np.hstack((coco_metrics, coco_metrics_seen,
                             coco_metrics_novel))

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict
class OlnXdataEvaluator(OlnXclassEvaluator):
  """COCO evaluation metric class."""

  def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True,
               use_category=True, seen_class='all'):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      annotation_file: a JSON file that stores annotations of the eval dataset.
        If `annotation_file` is None, groundtruth annotations will be loaded
        from the dataloader.
      include_mask: a boolean to indicate whether or not to include the mask
        eval.
      need_rescale_bboxes: If true bboxes in `predictions` will be rescaled
        back to absolute values (`image_info` is needed in this case).
      use_category: if `False`, treat all objects in all classes as one
        foreground category.
      seen_class: 'all' or 'voc' or 'nonvoc'
    """
    super(OlnXdataEvaluator, self).__init__(
        annotation_file=annotation_file,
        include_mask=include_mask,
        need_rescale_bboxes=need_rescale_bboxes,
        use_category=False,
        seen_class='all')

  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      logging.info('There is no annotation_file in COCOEvaluator.')
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      logging.info('Using annotation file: %s', self._annotation_file)
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]
    # Class manipulation: 'all' split samples -> ignored_split = 0.
    for idx, _ in enumerate(coco_gt.dataset['annotations']):
      coco_gt.dataset['annotations'][idx]['ignored_split'] = 0
    coco_eval = cocoeval.OlnCOCOevalWrapper(coco_gt, coco_dt, iou_type='bbox')
    coco_eval.params.maxDets = [10, 20, 50, 100, 200]
    coco_eval.params.imgIds = image_ids
    coco_eval.params.useCats = 0 if not self._use_category else 1
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats
    if self._include_mask:
      mcoco_eval = cocoeval.OlnCOCOevalWrapper(coco_gt, coco_dt,
                                               iou_type='segm')
      mcoco_eval.params.maxDets = [10, 20, 50, 100, 200]
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.params.useCats = 0 if not self._use_category else 1
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      mask_coco_metrics = mcoco_eval.stats
    if self._include_mask:
      metrics = np.hstack((coco_metrics, mask_coco_metrics))
    else:
      metrics = coco_metrics
    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()
    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict
class ShapeMaskCOCOEvaluator(COCOEvaluator):
  """COCO evaluation metric class for ShapeMask."""

  def __init__(self, mask_eval_class, **kwargs):
    """Constructs COCO evaluation class.

    The class provides the interface to metrics_fn in TPUEstimator. The
    _update_op() takes detections from each image and pushes them to
    self.detections. The _evaluate() loads a JSON file in COCO annotation
    format as the groundtruths and runs COCO evaluation.

    Args:
      mask_eval_class: the set of classes for mask evaluation.
      **kwargs: other keyword arguments passed to the parent class initializer.
    """
    super(ShapeMaskCOCOEvaluator, self).__init__(**kwargs)
    self._mask_eval_class = mask_eval_class
    self._eval_categories = class_utils.coco_split_class_ids(mask_eval_class)
    if mask_eval_class != 'all':
      self._metric_names = [
          x.replace('mask', 'novel_mask') for x in self._metric_names
      ]

  def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
      coco_metric: float numpy array with shape [24] representing the
        coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
      gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
          self._groundtruths)
      coco_gt = coco_utils.COCOWrapper(
          eval_type=('mask' if self._include_mask else 'box'),
          gt_dataset=gt_dataset)
    else:
      coco_gt = self._coco_gt
    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.loadRes(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]

    coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
      mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
      mcoco_eval.params.imgIds = image_ids
      mcoco_eval.evaluate()
      mcoco_eval.accumulate()
      mcoco_eval.summarize()
      if self._mask_eval_class == 'all':
        metrics = np.hstack((coco_metrics, mcoco_eval.stats))
      else:
        mask_coco_metrics = mcoco_eval.category_stats
        val_catg_idx = np.isin(mcoco_eval.params.catIds, self._eval_categories)
        # Gather the valid evaluation of the eval categories.
        if np.any(val_catg_idx):
          mean_val_metrics = []
          for mid in range(len(self._metric_names) // 2):
            mean_val_metrics.append(
                np.nanmean(mask_coco_metrics[mid][val_catg_idx]))

          mean_val_metrics = np.array(mean_val_metrics)
        else:
          mean_val_metrics = np.zeros(len(self._metric_names) // 2)
        metrics = np.hstack((coco_metrics, mean_val_metrics))
    else:
      metrics = coco_metrics

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
      metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict
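
For reference, a minimal sketch of driving COCOEvaluator directly with numpy inputs, following the interface snippet in the module docstring above. All numbers are made-up placeholders for a single fake image with one box (batch_size=1, K=1), not real model output:

import numpy as np

evaluator = COCOEvaluator(annotation_file=None, include_mask=False,
                          need_rescale_bboxes=False)

# Groundtruth boxes are absolute [ymin, xmin, ymax, xmax] coordinates;
# 'source_id' ties the prediction to the groundtruth image.
groundtruths = {
    'source_id': np.array([1]),
    'height': np.array([480]),
    'width': np.array([640]),
    'num_detections': np.array([1]),
    'boxes': np.array([[[10., 10., 110., 210.]]]),
    'classes': np.array([[1]]),
}
predictions = {
    'source_id': np.array([1]),
    'num_detections': np.array([1]),
    'detection_boxes': np.array([[[10., 10., 110., 210.]]]),
    'detection_classes': np.array([[1]]),
    'detection_scores': np.array([[0.9]]),
}
evaluator.update(predictions, groundtruths)
metrics = evaluator.evaluate()  # dict of AP/AR values keyed by metric name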
models-2.13.1/official/legacy/detection/evaluation/coco_utils.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Util functions related to pycocotools and COCO eval."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import json

from absl import logging
import numpy as np
from PIL import Image
from pycocotools import coco
from pycocotools import mask as mask_api
import six
import tensorflow as tf

from official.legacy.detection.dataloader import tf_example_decoder
from official.legacy.detection.utils import box_utils
from official.legacy.detection.utils import mask_utils
class COCOWrapper(coco.COCO):
  """COCO wrapper class.

  This class wraps the COCO API object, which provides the following additional
  functionalities:
    1. Support string type image id.
    2. Support loading the groundtruth dataset using the external annotation
       dictionary.
    3. Support loading the prediction results using the external annotation
       dictionary.
  """

  def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None):
    """Instantiates a COCO-style API object.

    Args:
      eval_type: either 'box' or 'mask'.
      annotation_file: a JSON file that stores annotations of the eval dataset.
        This is required if `gt_dataset` is not provided.
      gt_dataset: the groundtruth eval dataset in COCO API format.
    """
    if ((annotation_file and gt_dataset) or
        ((not annotation_file) and (not gt_dataset))):
      raise ValueError('One and only one of `annotation_file` and `gt_dataset` '
                       'needs to be specified.')

    if eval_type not in ['box', 'mask']:
      raise ValueError('The `eval_type` can only be either `box` or `mask`.')

    coco.COCO.__init__(self, annotation_file=annotation_file)
    self._eval_type = eval_type
    if gt_dataset:
      self.dataset = gt_dataset
      self.createIndex()

  def loadRes(self, predictions):
    """Loads result file and return a result api object.

    Args:
      predictions: a list of dictionary each representing an annotation in COCO
        format. The required fields are `image_id`, `category_id`, `score`,
        `bbox`, `segmentation`.

    Returns:
      res: result COCO api object.

    Raises:
      ValueError: if the set of image id from predictions is not the subset of
        the set of image id of the groundtruth dataset.
    """
    res = coco.COCO()
    res.dataset['images'] = copy.deepcopy(self.dataset['images'])
    res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])

    image_ids = [ann['image_id'] for ann in predictions]
    if set(image_ids) != (set(image_ids) & set(self.getImgIds())):
      raise ValueError('Results do not correspond to the current dataset!')
    for ann in predictions:
      x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2],
                        ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]]
      if self._eval_type == 'box':
        ann['area'] = ann['bbox'][2] * ann['bbox'][3]
        ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
      elif self._eval_type == 'mask':
        ann['area'] = mask_api.area(ann['segmentation'])

    res.dataset['annotations'] = copy.deepcopy(predictions)
    res.createIndex()
    return res
def convert_predictions_to_coco_annotations(predictions):
  """Converts a batch of predictions to annotations in COCO format.

  Args:
    predictions: a dictionary of lists of numpy arrays including the following
      fields. K below denotes the maximum number of instances per image.
      Required fields:
        - source_id: a list of numpy arrays of int or string of shape
          [batch_size].
        - num_detections: a list of numpy arrays of int of shape [batch_size].
        - detection_boxes: a list of numpy arrays of float of shape
          [batch_size, K, 4], where coordinates are in the original image
          space (not the scaled image space).
        - detection_classes: a list of numpy arrays of int of shape
          [batch_size, K].
        - detection_scores: a list of numpy arrays of float of shape
          [batch_size, K].
      Optional fields:
        - detection_masks: a list of numpy arrays of float of shape
          [batch_size, K, mask_height, mask_width].

  Returns:
    coco_predictions: prediction in COCO annotation format.
  """
  coco_predictions = []
  num_batches = len(predictions['source_id'])
  batch_size = predictions['source_id'][0].shape[0]
  max_num_detections = predictions['detection_classes'][0].shape[1]
  use_outer_box = 'detection_outer_boxes' in predictions
  for i in range(num_batches):
    predictions['detection_boxes'][i] = box_utils.yxyx_to_xywh(
        predictions['detection_boxes'][i])
    if use_outer_box:
      predictions['detection_outer_boxes'][i] = box_utils.yxyx_to_xywh(
          predictions['detection_outer_boxes'][i])
      mask_boxes = predictions['detection_outer_boxes']
    else:
      mask_boxes = predictions['detection_boxes']

    for j in range(batch_size):
      if 'detection_masks' in predictions:
        image_masks = mask_utils.paste_instance_masks(
            predictions['detection_masks'][i][j],
            mask_boxes[i][j],
            int(predictions['image_info'][i][j, 0, 0]),
            int(predictions['image_info'][i][j, 0, 1]))
        binary_masks = (image_masks > 0.0).astype(np.uint8)
        encoded_masks = [
            mask_api.encode(np.asfortranarray(binary_mask))
            for binary_mask in list(binary_masks)
        ]
      for k in range(max_num_detections):
        ann = {}
        ann['image_id'] = predictions['source_id'][i][j]
        ann['category_id'] = predictions['detection_classes'][i][j, k]
        ann['bbox'] = predictions['detection_boxes'][i][j, k]
        ann['score'] = predictions['detection_scores'][i][j, k]
        if 'detection_masks' in predictions:
          ann['segmentation'] = encoded_masks[k]
        coco_predictions.append(ann)

  for i, ann in enumerate(coco_predictions):
    ann['id'] = i + 1

  return coco_predictions
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
  """Converts groundtruths to the dataset in COCO format.

  Args:
    groundtruths: a dictionary of numpy arrays including the fields below.
      Note that each element in the list represents the numbers for a single
      example without batch dimension. K below denotes the actual number of
      instances for each image.
      Required fields:
        - source_id: a list of numpy arrays of int or string of shape
          [batch_size].
        - height: a list of numpy arrays of int of shape [batch_size].
        - width: a list of numpy arrays of int of shape [batch_size].
        - num_detections: a list of numpy arrays of int of shape [batch_size].
        - boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
          where coordinates are in the original image space (not the
          normalized coordinates).
        - classes: a list of numpy arrays of int of shape [batch_size, K].
      Optional fields:
        - is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If
          the field is absent, it is assumed that this instance is not crowd.
        - areas: a list of numpy arrays of float of shape [batch_size, K]. If
          the field is absent, the area is calculated using either boxes or
          masks depending on which one is available.
        - masks: a list of numpy arrays of string of shape [batch_size, K],
    label_map: (optional) a dictionary that defines items from the category id
      to the category name. If `None`, collect the category mapping from the
      `groundtruths`.

  Returns:
    coco_groundtruths: the groundtruth dataset in COCO format.
  """
  source_ids = np.concatenate(groundtruths['source_id'], axis=0)
  heights = np.concatenate(groundtruths['height'], axis=0)
  widths = np.concatenate(groundtruths['width'], axis=0)
  gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)}
               for i, h, w in zip(source_ids, heights, widths)]

  gt_annotations = []
  num_batches = len(groundtruths['source_id'])
  batch_size = groundtruths['source_id'][0].shape[0]
  for i in range(num_batches):
    for j in range(batch_size):
      num_instances = groundtruths['num_detections'][i][j]
      for k in range(num_instances):
        ann = {}
        ann['image_id'] = int(groundtruths['source_id'][i][j])
        if 'is_crowds' in groundtruths:
          ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
        else:
          ann['iscrowd'] = 0
        ann['category_id'] = int(groundtruths['classes'][i][j, k])
        boxes = groundtruths['boxes'][i]
        ann['bbox'] = [
            float(boxes[j, k, 1]),
            float(boxes[j, k, 0]),
            float(boxes[j, k, 3] - boxes[j, k, 1]),
            float(boxes[j, k, 2] - boxes[j, k, 0])
        ]
        if 'areas' in groundtruths:
          ann['area'] = float(groundtruths['areas'][i][j, k])
        else:
          ann['area'] = float((boxes[j, k, 3] - boxes[j, k, 1]) *
                              (boxes[j, k, 2] - boxes[j, k, 0]))
        if 'masks' in groundtruths:
          mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
          np_mask = np.array(mask, dtype=np.uint8)
          np_mask[np_mask > 0] = 255
          encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
          ann['segmentation'] = encoded_mask
          if 'areas' not in groundtruths:
            ann['area'] = mask_api.area(encoded_mask)
        gt_annotations.append(ann)

  for i, ann in enumerate(gt_annotations):
    ann['id'] = i + 1

  if label_map:
    gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
  else:
    category_ids = [gt['category_id'] for gt in gt_annotations]
    gt_categories = [{'id': i} for i in set(category_ids)]

  gt_dataset = {
      'images': gt_images,
      'categories': gt_categories,
      'annotations': copy.deepcopy(gt_annotations),
  }
  return gt_dataset
class COCOGroundtruthGenerator(object):
  """Generates the groundtruth annotations from a single example."""

  def __init__(self, file_pattern, num_examples, include_mask):
    self._file_pattern = file_pattern
    self._num_examples = num_examples
    self._include_mask = include_mask
    self._dataset_fn = tf.data.TFRecordDataset

  def _parse_single_example(self, example):
    """Parses a single serialized tf.Example proto.

    Args:
      example: a serialized tf.Example proto string.

    Returns:
      A dictionary of groundtruth with the following fields:
        source_id: a scalar tensor of int64 representing the image source_id.
        height: a scalar tensor of int64 representing the image height.
        width: a scalar tensor of int64 representing the image width.
        boxes: a float tensor of shape [K, 4], representing the groundtruth
          boxes in absolute coordinates with respect to the original image
          size.
        classes: an int64 tensor of shape [K], representing the class labels of
          each instance.
        is_crowds: a bool tensor of shape [K], indicating whether the instance
          is crowd.
        areas: a float tensor of shape [K], indicating the area of each
          instance.
        masks: a string tensor of shape [K], containing the bytes of the png
          mask of each instance.
    """
    decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=self._include_mask)
    decoded_tensors = decoder.decode(example)

    image = decoded_tensors['image']
    image_size = tf.shape(image)[0:2]
    boxes = box_utils.denormalize_boxes(
        decoded_tensors['groundtruth_boxes'], image_size)
    groundtruths = {
        'source_id': tf.string_to_number(
            decoded_tensors['source_id'], out_type=tf.int64),
        'height': decoded_tensors['height'],
        'width': decoded_tensors['width'],
        'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': decoded_tensors['groundtruth_classes'],
        'is_crowds': decoded_tensors['groundtruth_is_crowd'],
        'areas': decoded_tensors['groundtruth_area'],
    }
    if self._include_mask:
      groundtruths.update({
          'masks': decoded_tensors['groundtruth_instance_masks_png'],
      })
    return groundtruths

  def _build_pipeline(self):
    """Builds data pipeline to generate groundtruth annotations."""
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            lambda filename: self._dataset_fn(filename).prefetch(1),
            cycle_length=32,
            sloppy=False))
    dataset = dataset.map(self._parse_single_example, num_parallel_calls=64)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(1, drop_remainder=False)
    return dataset

  def __call__(self):
    with tf.Graph().as_default():
      dataset = self._build_pipeline()
      groundtruth = dataset.make_one_shot_iterator().get_next()

      with tf.Session() as sess:
        for _ in range(self._num_examples):
          groundtruth_result = sess.run(groundtruth)
          yield groundtruth_result
def scan_and_generator_annotation_file(file_pattern, num_samples, include_mask,
                                       annotation_file):
  """Scans and generate the COCO-style annotation JSON file given a dataset."""
  groundtruth_generator = COCOGroundtruthGenerator(
      file_pattern, num_samples, include_mask)
  generate_annotation_file(groundtruth_generator, annotation_file)


def generate_annotation_file(groundtruth_generator, annotation_file):
  """Generates COCO-style annotation JSON file given a groundtruth generator."""
  groundtruths = {}
  logging.info('Loading groundtruth annotations from dataset to memory...')
  for groundtruth in groundtruth_generator():
    for k, v in six.iteritems(groundtruth):
      if k not in groundtruths:
        groundtruths[k] = [v]
      else:
        groundtruths[k].append(v)
  gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths)

  logging.info('Saving groundtruth annotations to the JSON file...')
  with tf.io.gfile.GFile(annotation_file, 'w') as f:
    f.write(json.dumps(gt_dataset))
  logging.info('Done saving the JSON file...')
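
A small sketch of generate_annotation_file driven by a hand-rolled generator instead of COCOGroundtruthGenerator. The single fake example and the output path are illustrative placeholders, not values used by this module:

import numpy as np

def _toy_groundtruth_generator():
  # Yields one batch (batch_size=1) with a single absolute-coordinate box in
  # [ymin, xmin, ymax, xmax] order, matching what the converter above expects.
  yield {
      'source_id': np.array([1]),
      'height': np.array([480]),
      'width': np.array([640]),
      'num_detections': np.array([1]),
      'boxes': np.array([[[10., 10., 110., 210.]]]),
      'classes': np.array([[1]]),
      'is_crowds': np.array([[0]]),
      'areas': np.array([[20000.]]),
  }

# '/tmp/toy_annotations.json' is a hypothetical destination path.
generate_annotation_file(_toy_groundtruth_generator, '/tmp/toy_annotations.json')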
models-2.13.1/official/legacy/detection/evaluation/factory.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluator factory."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from official.legacy.detection.evaluation import coco_evaluator


def evaluator_generator(params):
  """Generator function for various evaluators."""
  if params.type == 'box':
    evaluator = coco_evaluator.COCOEvaluator(
        annotation_file=params.val_json_file, include_mask=False)
  elif params.type == 'box_and_mask':
    evaluator = coco_evaluator.COCOEvaluator(
        annotation_file=params.val_json_file, include_mask=True)
  elif params.type == 'oln_xclass_box':
    evaluator = coco_evaluator.OlnXclassEvaluator(
        annotation_file=params.val_json_file,
        include_mask=False,
        use_category=False,
        seen_class=params.seen_class,)
  elif params.type == 'oln_xclass_box_and_mask':
    evaluator = coco_evaluator.OlnXclassEvaluator(
        annotation_file=params.val_json_file,
        include_mask=True,
        use_category=False,
        seen_class=params.seen_class,)
  elif params.type == 'oln_xdata_box':
    evaluator = coco_evaluator.OlnXdataEvaluator(
        annotation_file=params.val_json_file,
        include_mask=False,
        use_category=False,
        seen_class='all',)
  elif params.type == 'shapemask_box_and_mask':
    evaluator = coco_evaluator.ShapeMaskCOCOEvaluator(
        mask_eval_class=params.mask_eval_class,
        annotation_file=params.val_json_file,
        include_mask=True)
  else:
    raise ValueError('Evaluator %s is not supported.' % params.type)

  return coco_evaluator.MetricWrapper(evaluator)
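
A minimal sketch of how this factory is typically driven from a config object. The SimpleNamespace below only stands in for the project's params object, and the annotation path is a placeholder:

from types import SimpleNamespace

# Hypothetical eval config; real runs pass the framework's params object with
# the same attribute names (type, val_json_file, ...).
eval_params = SimpleNamespace(
    type='box',
    val_json_file='/path/to/instances_val2017.json')

metric = evaluator_generator(eval_params)  # returns a MetricWrapper
# Per batch: metric.update_state(labels, outputs); then metric.result()
# runs the COCO evaluation and returns the metrics dict.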
models-2.13.1/official/legacy/detection/executor/__init__.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
models-2.13.1/official/legacy/detection/executor/detection_executor.py
0 → 100644
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An executor class for running model on TensorFlow 2.0."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import logging
import tensorflow as tf

from official.legacy.detection.executor import distributed_executor as executor
from official.vision.utils.object_detection import visualization_utils


class DetectionDistributedExecutor(executor.DistributedExecutor):
  """Detection specific custom training loop executor.

  Subclasses the DistributedExecutor and adds support for numpy based metrics.
  """

  def __init__(self,
               predict_post_process_fn=None,
               trainable_variables_filter=None,
               **kwargs):
    super(DetectionDistributedExecutor, self).__init__(**kwargs)
    if predict_post_process_fn:
      assert callable(predict_post_process_fn)
    if trainable_variables_filter:
      assert callable(trainable_variables_filter)
    self._predict_post_process_fn = predict_post_process_fn
    self._trainable_variables_filter = trainable_variables_filter
    self.eval_steps = tf.Variable(
        0,
        trainable=False,
        dtype=tf.int32,
        synchronization=tf.VariableSynchronization.ON_READ,
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
        shape=[])

  def _create_replicated_step(self,
                              strategy,
                              model,
                              loss_fn,
                              optimizer,
                              metric=None):
    trainable_variables = model.trainable_variables
    if self._trainable_variables_filter:
      trainable_variables = self._trainable_variables_filter(
          trainable_variables)
    logging.info('Filter trainable variables from %d to %d',
                 len(model.trainable_variables), len(trainable_variables))
    update_state_fn = lambda labels, outputs: None
    if isinstance(metric, tf.keras.metrics.Metric):
      update_state_fn = metric.update_state
    else:
      logging.error('Detection: train metric is not an instance of '
                    'tf.keras.metrics.Metric.')

    def _replicated_step(inputs):
      """Replicated training step."""
      inputs, labels = inputs

      with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        all_losses = loss_fn(labels, outputs)
        losses = {}
        for k, v in all_losses.items():
          losses[k] = tf.reduce_mean(v)
        per_replica_loss = losses['total_loss'] / strategy.num_replicas_in_sync
        update_state_fn(labels, outputs)

      grads = tape.gradient(per_replica_loss, trainable_variables)
      clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
      optimizer.apply_gradients(zip(clipped_grads, trainable_variables))
      return losses

    return _replicated_step

  def _create_test_step(self, strategy, model, metric):
    """Creates a distributed test step."""

    @tf.function
    def test_step(iterator, eval_steps):
      """Calculates evaluation metrics on distributed devices."""

      def _test_step_fn(inputs, eval_steps):
        """Replicated accuracy calculation."""
        inputs, labels = inputs
        model_outputs = model(inputs, training=False)
        if self._predict_post_process_fn:
          labels, prediction_outputs = self._predict_post_process_fn(
              labels, model_outputs)
          num_remaining_visualizations = (
              self._params.eval.num_images_to_visualize - eval_steps)
          # If there is a remaining number of visualizations that needs to be
          # done, add the next batch outputs for visualization.
          #
          # TODO(hongjunchoi): Once dynamic slicing is supported on TPU, only
          # write correct slice of outputs to summary file.
          if num_remaining_visualizations > 0:
            visualization_utils.visualize_images_with_bounding_boxes(
                inputs, prediction_outputs['detection_boxes'],
                self.global_train_step, self.eval_summary_writer)

        return labels, prediction_outputs

      labels, outputs = strategy.run(
          _test_step_fn, args=(
              next(iterator),
              eval_steps,
          ))
      outputs = tf.nest.map_structure(strategy.experimental_local_results,
                                      outputs)
      labels = tf.nest.map_structure(strategy.experimental_local_results,
                                     labels)
      eval_steps.assign_add(self._params.eval.batch_size)
      return labels, outputs

    return test_step

  def _run_evaluation(self, test_step, current_training_step, metric,
                      test_iterator):
    """Runs validation steps and aggregate metrics."""
    self.eval_steps.assign(0)
    if not test_iterator or not metric:
      logging.warning(
          'Both test_iterator (%s) and metrics (%s) must not be None.',
          test_iterator, metric)
      return None
    logging.info('Running evaluation after step: %s.', current_training_step)
    while True:
      try:
        labels, outputs = test_step(test_iterator, self.eval_steps)
        if metric:
          metric.update_state(labels, outputs)
      except (StopIteration, tf.errors.OutOfRangeError):
        break

    metric_result = metric.result()
    if isinstance(metric, tf.keras.metrics.Metric):
      metric_result = tf.nest.map_structure(
          lambda x: x.numpy().astype(float), metric_result)
    logging.info('Step: [%d] Validation metric = %s', current_training_step,
                 metric_result)
    return metric_result
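
The core of the replicated step above is a standard clipped-gradient update. A standalone sketch of that pattern with a toy Keras model and loss, independent of the executor and distribution strategy:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(0.1)

@tf.function
def train_step(x, y):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
  grads = tape.gradient(loss, model.trainable_variables)
  # Same global-norm clipping (clip_norm=1.0) as _replicated_step above.
  clipped, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
  optimizer.apply_gradients(zip(clipped, model.trainable_variables))
  return loss

loss = train_step(tf.ones([4, 3]), tf.ones([4, 1]))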