Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
ff88581a
Unverified
Commit
ff88581a
authored
Oct 29, 2017
by
vivek rathod
Committed by
GitHub
Oct 29, 2017
Browse files
Merge pull request #2629 from tombstone/meta_arch_update
update post_processing module, builders, and meta architectures.
parents
018e62f0
aeeaf9a3
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1026 additions
and
265 deletions
+1026
-265
research/object_detection/builders/post_processing_builder.py
...arch/object_detection/builders/post_processing_builder.py
+19
-7
research/object_detection/builders/post_processing_builder_test.py
...object_detection/builders/post_processing_builder_test.py
+37
-3
research/object_detection/core/post_processing.py
research/object_detection/core/post_processing.py
+106
-23
research/object_detection/core/post_processing_test.py
research/object_detection/core/post_processing_test.py
+192
-18
research/object_detection/meta_architectures/BUILD
research/object_detection/meta_architectures/BUILD
+1
-0
research/object_detection/meta_architectures/faster_rcnn_meta_arch.py
...ect_detection/meta_architectures/faster_rcnn_meta_arch.py
+219
-77
research/object_detection/meta_architectures/faster_rcnn_meta_arch_test.py
...etection/meta_architectures/faster_rcnn_meta_arch_test.py
+23
-11
research/object_detection/meta_architectures/faster_rcnn_meta_arch_test_lib.py
...tion/meta_architectures/faster_rcnn_meta_arch_test_lib.py
+268
-81
research/object_detection/meta_architectures/rfcn_meta_arch.py
...rch/object_detection/meta_architectures/rfcn_meta_arch.py
+19
-3
research/object_detection/meta_architectures/rfcn_meta_arch_test.py
...bject_detection/meta_architectures/rfcn_meta_arch_test.py
+9
-0
research/object_detection/meta_architectures/ssd_meta_arch.py
...arch/object_detection/meta_architectures/ssd_meta_arch.py
+124
-38
research/object_detection/meta_architectures/ssd_meta_arch_test.py
...object_detection/meta_architectures/ssd_meta_arch_test.py
+9
-4
No files found.
research/object_detection/builders/post_processing_builder.py
View file @
ff88581a
...
...
@@ -28,8 +28,8 @@ def build(post_processing_config):
configuration.
Non-max suppression callable takes `boxes`, `scores`, and optionally
`clip_window`, `parallel_iterations` and `scope` as inputs. It returns
`nms_boxes`, `nms_scores`, `nms_
nms_
classes` and `num_detections`. See
`clip_window`, `parallel_iterations`
`masks,
and `scope` as inputs. It returns
`nms_boxes`, `nms_scores`, `nms_classes`
`nms_masks`
and `num_detections`. See
post_processing.batch_multiclass_non_max_suppression for the type and shape
of these tensors.
...
...
@@ -55,7 +55,8 @@ def build(post_processing_config):
non_max_suppressor_fn
=
_build_non_max_suppressor
(
post_processing_config
.
batch_non_max_suppression
)
score_converter_fn
=
_build_score_converter
(
post_processing_config
.
score_converter
)
post_processing_config
.
score_converter
,
post_processing_config
.
logit_scale
)
return
non_max_suppressor_fn
,
score_converter_fn
...
...
@@ -87,7 +88,17 @@ def _build_non_max_suppressor(nms_config):
return
non_max_suppressor_fn
def
_build_score_converter
(
score_converter_config
):
def
_score_converter_fn_with_logit_scale
(
tf_score_converter_fn
,
logit_scale
):
"""Create a function to scale logits then apply a Tensorflow function."""
def
score_converter_fn
(
logits
):
scaled_logits
=
tf
.
divide
(
logits
,
logit_scale
,
name
=
'scale_logits'
)
return
tf_score_converter_fn
(
scaled_logits
,
name
=
'convert_scores'
)
score_converter_fn
.
__name__
=
'%s_with_logit_scale'
%
(
tf_score_converter_fn
.
__name__
)
return
score_converter_fn
def
_build_score_converter
(
score_converter_config
,
logit_scale
):
"""Builds score converter based on the config.
Builds one of [tf.identity, tf.sigmoid, tf.softmax] score converters based on
...
...
@@ -95,6 +106,7 @@ def _build_score_converter(score_converter_config):
Args:
score_converter_config: post_processing_pb2.PostProcessing.score_converter.
logit_scale: temperature to use for SOFTMAX score_converter.
Returns:
Callable score converter op.
...
...
@@ -103,9 +115,9 @@ def _build_score_converter(score_converter_config):
ValueError: On unknown score converter.
"""
if
score_converter_config
==
post_processing_pb2
.
PostProcessing
.
IDENTITY
:
return
tf
.
identity
return
_score_converter_fn_with_logit_scale
(
tf
.
identity
,
logit_scale
)
if
score_converter_config
==
post_processing_pb2
.
PostProcessing
.
SIGMOID
:
return
tf
.
sigmoid
return
_score_converter_fn_with_logit_scale
(
tf
.
sigmoid
,
logit_scale
)
if
score_converter_config
==
post_processing_pb2
.
PostProcessing
.
SOFTMAX
:
return
tf
.
nn
.
softmax
return
_score_converter_fn_with_logit_scale
(
tf
.
nn
.
softmax
,
logit_scale
)
raise
ValueError
(
'Unknown score converter.'
)
research/object_detection/builders/post_processing_builder_test.py
View file @
ff88581a
...
...
@@ -48,7 +48,31 @@ class PostProcessingBuilderTest(tf.test.TestCase):
post_processing_config
=
post_processing_pb2
.
PostProcessing
()
text_format
.
Merge
(
post_processing_text_proto
,
post_processing_config
)
_
,
score_converter
=
post_processing_builder
.
build
(
post_processing_config
)
self
.
assertEqual
(
score_converter
,
tf
.
identity
)
self
.
assertEqual
(
score_converter
.
__name__
,
'identity_with_logit_scale'
)
inputs
=
tf
.
constant
([
1
,
1
],
tf
.
float32
)
outputs
=
score_converter
(
inputs
)
with
self
.
test_session
()
as
sess
:
converted_scores
=
sess
.
run
(
outputs
)
expected_converted_scores
=
sess
.
run
(
inputs
)
self
.
assertAllClose
(
converted_scores
,
expected_converted_scores
)
def
test_build_identity_score_converter_with_logit_scale
(
self
):
post_processing_text_proto
=
"""
score_converter: IDENTITY
logit_scale: 2.0
"""
post_processing_config
=
post_processing_pb2
.
PostProcessing
()
text_format
.
Merge
(
post_processing_text_proto
,
post_processing_config
)
_
,
score_converter
=
post_processing_builder
.
build
(
post_processing_config
)
self
.
assertEqual
(
score_converter
.
__name__
,
'identity_with_logit_scale'
)
inputs
=
tf
.
constant
([
1
,
1
],
tf
.
float32
)
outputs
=
score_converter
(
inputs
)
with
self
.
test_session
()
as
sess
:
converted_scores
=
sess
.
run
(
outputs
)
expected_converted_scores
=
sess
.
run
(
tf
.
constant
([.
5
,
.
5
],
tf
.
float32
))
self
.
assertAllClose
(
converted_scores
,
expected_converted_scores
)
def
test_build_sigmoid_score_converter
(
self
):
post_processing_text_proto
=
"""
...
...
@@ -57,7 +81,7 @@ class PostProcessingBuilderTest(tf.test.TestCase):
post_processing_config
=
post_processing_pb2
.
PostProcessing
()
text_format
.
Merge
(
post_processing_text_proto
,
post_processing_config
)
_
,
score_converter
=
post_processing_builder
.
build
(
post_processing_config
)
self
.
assertEqual
(
score_converter
,
tf
.
sigmoid
)
self
.
assertEqual
(
score_converter
.
__name__
,
'sigmoid_with_logit_scale'
)
def
test_build_softmax_score_converter
(
self
):
post_processing_text_proto
=
"""
...
...
@@ -66,7 +90,17 @@ class PostProcessingBuilderTest(tf.test.TestCase):
post_processing_config
=
post_processing_pb2
.
PostProcessing
()
text_format
.
Merge
(
post_processing_text_proto
,
post_processing_config
)
_
,
score_converter
=
post_processing_builder
.
build
(
post_processing_config
)
self
.
assertEqual
(
score_converter
,
tf
.
nn
.
softmax
)
self
.
assertEqual
(
score_converter
.
__name__
,
'softmax_with_logit_scale'
)
def
test_build_softmax_score_converter_with_temperature
(
self
):
post_processing_text_proto
=
"""
score_converter: SOFTMAX
logit_scale: 2.0
"""
post_processing_config
=
post_processing_pb2
.
PostProcessing
()
text_format
.
Merge
(
post_processing_text_proto
,
post_processing_config
)
_
,
score_converter
=
post_processing_builder
.
build
(
post_processing_config
)
self
.
assertEqual
(
score_converter
.
__name__
,
'softmax_with_logit_scale'
)
if
__name__
==
'__main__'
:
...
...
research/object_detection/core/post_processing.py
View file @
ff88581a
...
...
@@ -76,8 +76,6 @@ def multiclass_non_max_suppression(boxes,
a BoxList holding M boxes with a rank-1 scores field representing
corresponding scores for each box with scores sorted in decreasing order
and a rank-1 classes field representing a class label for each box.
If masks, keypoints, keypoint_heatmaps is not None, the boxlist will
contain masks, keypoints, keypoint_heatmaps corresponding to boxes.
Raises:
ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not have
...
...
@@ -174,6 +172,7 @@ def batch_multiclass_non_max_suppression(boxes,
change_coordinate_frame
=
False
,
num_valid_boxes
=
None
,
masks
=
None
,
additional_fields
=
None
,
scope
=
None
,
parallel_iterations
=
32
):
"""Multi-class version of non maximum suppression that operates on a batch.
...
...
@@ -203,11 +202,13 @@ def batch_multiclass_non_max_suppression(boxes,
is provided)
num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape
[batch_size] representing the number of valid boxes to be considered
for each image in the batch. This parameter allows for ignoring zero
paddings.
for each image in the batch. This parameter allows for ignoring zero
paddings.
masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width]
float32 tensor containing box masks. `q` can be either number of classes
or 1 depending on whether a separate mask is predicted per class.
additional_fields: (optional) If not None, a dictionary that maps keys to
tensors whose dimensions are [batch_size, num_anchors, ...].
scope: tf scope name.
parallel_iterations: (optional) number of batch items to process in
parallel.
...
...
@@ -223,9 +224,13 @@ def batch_multiclass_non_max_suppression(boxes,
[batch_size, max_detections, mask_height, mask_width] float32 tensor
containing masks for each selected box. This is set to None if input
`masks` is None.
'nmsed_additional_fields': (optional) a dictionary of
[batch_size, max_detections, ...] float32 tensors corresponding to the
tensors specified in the input `additional_fields`. This is not returned
if input `additional_fields` is None.
'num_detections': A [batch_size] int32 tensor indicating the number of
valid detections per batch item. Only the top num_detections[i] entries in
nms_boxes[i], nms_scores[i] and nms_class[i] are valid.
t
he rest of the
nms_boxes[i], nms_scores[i] and nms_class[i] are valid.
T
he rest of the
entries are zero paddings.
Raises:
...
...
@@ -239,6 +244,7 @@ def batch_multiclass_non_max_suppression(boxes,
'to the third dimension of scores'
)
original_masks
=
masks
original_additional_fields
=
additional_fields
with
tf
.
name_scope
(
scope
,
'BatchMultiClassNonMaxSuppression'
):
boxes_shape
=
boxes
.
shape
batch_size
=
boxes_shape
[
0
].
value
...
...
@@ -255,15 +261,61 @@ def batch_multiclass_non_max_suppression(boxes,
num_valid_boxes
=
tf
.
ones
([
batch_size
],
dtype
=
tf
.
int32
)
*
num_anchors
# If masks aren't provided, create dummy masks so we can only have one copy
# of single_image_nms_fn and discard the dummy masks after map_fn.
# of
_
single_image_nms_fn and discard the dummy masks after map_fn.
if
masks
is
None
:
masks_shape
=
tf
.
stack
([
batch_size
,
num_anchors
,
1
,
0
,
0
])
masks
=
tf
.
zeros
(
masks_shape
)
def
single_image_nms_fn
(
args
):
"""Runs NMS on a single image and returns padded output."""
(
per_image_boxes
,
per_image_scores
,
per_image_masks
,
per_image_num_valid_boxes
)
=
args
if
additional_fields
is
None
:
additional_fields
=
{}
def
_single_image_nms_fn
(
args
):
"""Runs NMS on a single image and returns padded output.
Args:
args: A list of tensors consisting of the following:
per_image_boxes - A [num_anchors, q, 4] float32 tensor containing
detections. If `q` is 1 then same boxes are used for all classes
otherwise, if `q` is equal to number of classes, class-specific
boxes are used.
per_image_scores - A [num_anchors, num_classes] float32 tensor
containing the scores for each of the `num_anchors` detections.
per_image_masks - A [num_anchors, q, mask_height, mask_width] float32
tensor containing box masks. `q` can be either number of classes
or 1 depending on whether a separate mask is predicted per class.
per_image_additional_fields - (optional) A variable number of float32
tensors each with size [num_anchors, ...].
per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of
shape [batch_size] representing the number of valid boxes to be
considered for each image in the batch. This parameter allows for
ignoring zero paddings.
Returns:
'nmsed_boxes': A [max_detections, 4] float32 tensor containing the
non-max suppressed boxes.
'nmsed_scores': A [max_detections] float32 tensor containing the scores
for the boxes.
'nmsed_classes': A [max_detections] float32 tensor containing the class
for boxes.
'nmsed_masks': (optional) a [max_detections, mask_height, mask_width]
float32 tensor containing masks for each selected box. This is set to
None if input `masks` is None.
'nmsed_additional_fields': (optional) A variable number of float32
tensors each with size [max_detections, ...] corresponding to the
input `per_image_additional_fields`.
'num_detections': A [batch_size] int32 tensor indicating the number of
valid detections per batch item. Only the top num_detections[i]
entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The
rest of the entries are zero paddings.
"""
per_image_boxes
=
args
[
0
]
per_image_scores
=
args
[
1
]
per_image_masks
=
args
[
2
]
per_image_additional_fields
=
{
key
:
value
for
key
,
value
in
zip
(
additional_fields
,
args
[
3
:
-
1
])
}
per_image_num_valid_boxes
=
args
[
-
1
]
per_image_boxes
=
tf
.
reshape
(
tf
.
slice
(
per_image_boxes
,
3
*
[
0
],
tf
.
stack
([
per_image_num_valid_boxes
,
-
1
,
-
1
])),
[
-
1
,
q
,
4
])
...
...
@@ -271,12 +323,21 @@ def batch_multiclass_non_max_suppression(boxes,
tf
.
slice
(
per_image_scores
,
[
0
,
0
],
tf
.
stack
([
per_image_num_valid_boxes
,
-
1
])),
[
-
1
,
num_classes
])
per_image_masks
=
tf
.
reshape
(
tf
.
slice
(
per_image_masks
,
4
*
[
0
],
tf
.
stack
([
per_image_num_valid_boxes
,
-
1
,
-
1
,
-
1
])),
[
-
1
,
q
,
per_image_masks
.
shape
[
2
].
value
,
per_image_masks
.
shape
[
3
].
value
])
if
per_image_additional_fields
is
not
None
:
for
key
,
tensor
in
per_image_additional_fields
.
items
():
additional_field_shape
=
tensor
.
get_shape
()
additional_field_dim
=
len
(
additional_field_shape
)
per_image_additional_fields
[
key
]
=
tf
.
reshape
(
tf
.
slice
(
per_image_additional_fields
[
key
],
additional_field_dim
*
[
0
],
tf
.
stack
([
per_image_num_valid_boxes
]
+
(
additional_field_dim
-
1
)
*
[
-
1
])),
[
-
1
]
+
[
dim
.
value
for
dim
in
additional_field_shape
[
1
:]])
nmsed_boxlist
=
multiclass_non_max_suppression
(
per_image_boxes
,
per_image_scores
,
...
...
@@ -284,9 +345,10 @@ def batch_multiclass_non_max_suppression(boxes,
iou_thresh
,
max_size_per_class
,
max_total_size
,
masks
=
per_image_masks
,
clip_window
=
clip_window
,
change_coordinate_frame
=
change_coordinate_frame
)
change_coordinate_frame
=
change_coordinate_frame
,
masks
=
per_image_masks
,
additional_fields
=
per_image_additional_fields
)
padded_boxlist
=
box_list_ops
.
pad_or_clip_box_list
(
nmsed_boxlist
,
max_total_size
)
num_detections
=
nmsed_boxlist
.
num_boxes
()
...
...
@@ -294,19 +356,40 @@ def batch_multiclass_non_max_suppression(boxes,
nmsed_scores
=
padded_boxlist
.
get_field
(
fields
.
BoxListFields
.
scores
)
nmsed_classes
=
padded_boxlist
.
get_field
(
fields
.
BoxListFields
.
classes
)
nmsed_masks
=
padded_boxlist
.
get_field
(
fields
.
BoxListFields
.
masks
)
return
[
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
num_detections
]
nmsed_additional_fields
=
[
padded_boxlist
.
get_field
(
key
)
for
key
in
per_image_additional_fields
]
return
([
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
]
+
nmsed_additional_fields
+
[
num_detections
])
num_additional_fields
=
0
if
additional_fields
is
not
None
:
num_additional_fields
=
len
(
additional_fields
)
num_nmsed_outputs
=
4
+
num_additional_fields
(
batch_nmsed_boxes
,
batch_nmsed_scores
,
batch_nmsed_classes
,
batch_nmsed_masks
,
batch_num_detections
)
=
tf
.
map_fn
(
single_image_nms_fn
,
elems
=
[
boxes
,
scores
,
masks
,
num_valid_boxes
],
dtype
=
[
tf
.
float32
,
tf
.
float32
,
tf
.
float32
,
tf
.
float32
,
tf
.
int32
],
parallel_iterations
=
parallel_iterations
)
batch_outputs
=
tf
.
map_fn
(
_single_image_nms_fn
,
elems
=
([
boxes
,
scores
,
masks
]
+
list
(
additional_fields
.
values
())
+
[
num_valid_boxes
]),
dtype
=
(
num_nmsed_outputs
*
[
tf
.
float32
]
+
[
tf
.
int32
]),
parallel_iterations
=
parallel_iterations
)
batch_nmsed_boxes
=
batch_outputs
[
0
]
batch_nmsed_scores
=
batch_outputs
[
1
]
batch_nmsed_classes
=
batch_outputs
[
2
]
batch_nmsed_masks
=
batch_outputs
[
3
]
batch_nmsed_additional_fields
=
{
key
:
value
for
key
,
value
in
zip
(
additional_fields
,
batch_outputs
[
4
:
-
1
])
}
batch_num_detections
=
batch_outputs
[
-
1
]
if
original_masks
is
None
:
batch_nmsed_masks
=
None
if
original_additional_fields
is
None
:
batch_nmsed_additional_fields
=
None
return
(
batch_nmsed_boxes
,
batch_nmsed_scores
,
batch_nmsed_classes
,
batch_nmsed_masks
,
batch_num_detections
)
batch_nmsed_masks
,
batch_nmsed_additional_fields
,
batch_num_detections
)
research/object_detection/core/post_processing_test.py
View file @
ff88581a
...
...
@@ -497,11 +497,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
exp_nms_classes
=
[[
0
,
0
,
1
,
0
]]
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
)
nmsed_additional_fields
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
)
self
.
assertIsNone
(
nmsed_masks
)
self
.
assertIsNone
(
nmsed_additional_fields
)
with
self
.
test_session
()
as
sess
:
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
...
...
@@ -544,11 +546,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
[
1
,
0
,
0
,
0
]])
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
)
nmsed_additional_fields
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
)
self
.
assertIsNone
(
nmsed_masks
)
self
.
assertIsNone
(
nmsed_additional_fields
)
# Check static shapes
self
.
assertAllEqual
(
nmsed_boxes
.
shape
.
as_list
(),
exp_nms_corners
.
shape
)
...
...
@@ -616,11 +620,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
[[
0
,
0
],
[
0
,
0
]]]])
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
masks
=
masks
)
nmsed_additional_fields
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
masks
=
masks
)
self
.
assertIsNone
(
nmsed_additional_fields
)
# Check static shapes
self
.
assertAllEqual
(
nmsed_boxes
.
shape
.
as_list
(),
exp_nms_corners
.
shape
)
self
.
assertAllEqual
(
nmsed_scores
.
shape
.
as_list
(),
exp_nms_scores
.
shape
)
...
...
@@ -639,6 +645,91 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
self
.
assertAllClose
(
num_detections
,
[
2
,
3
])
self
.
assertAllClose
(
nmsed_masks
,
exp_nms_masks
)
def
test_batch_multiclass_nms_with_additional_fields
(
self
):
boxes
=
tf
.
constant
([[[[
0
,
0
,
1
,
1
],
[
0
,
0
,
4
,
5
]],
[[
0
,
0.1
,
1
,
1.1
],
[
0
,
0.1
,
2
,
1.1
]],
[[
0
,
-
0.1
,
1
,
0.9
],
[
0
,
-
0.1
,
1
,
0.9
]],
[[
0
,
10
,
1
,
11
],
[
0
,
10
,
1
,
11
]]],
[[[
0
,
10.1
,
1
,
11.1
],
[
0
,
10.1
,
1
,
11.1
]],
[[
0
,
100
,
1
,
101
],
[
0
,
100
,
1
,
101
]],
[[
0
,
1000
,
1
,
1002
],
[
0
,
999
,
2
,
1004
]],
[[
0
,
1000
,
1
,
1002.1
],
[
0
,
999
,
2
,
1002.7
]]]],
tf
.
float32
)
scores
=
tf
.
constant
([[[.
9
,
0.01
],
[.
75
,
0.05
],
[.
6
,
0.01
],
[.
95
,
0
]],
[[.
5
,
0.01
],
[.
3
,
0.01
],
[.
01
,
.
85
],
[.
01
,
.
5
]]])
additional_fields
=
{
'keypoints'
:
tf
.
constant
(
[[[[
6
,
7
],
[
8
,
9
]],
[[
0
,
1
],
[
2
,
3
]],
[[
0
,
0
],
[
0
,
0
]],
[[
0
,
0
],
[
0
,
0
]]],
[[[
13
,
14
],
[
15
,
16
]],
[[
8
,
9
],
[
10
,
11
]],
[[
10
,
11
],
[
12
,
13
]],
[[
0
,
0
],
[
0
,
0
]]]],
tf
.
float32
)
}
score_thresh
=
0.1
iou_thresh
=
.
5
max_output_size
=
4
exp_nms_corners
=
np
.
array
([[[
0
,
10
,
1
,
11
],
[
0
,
0
,
1
,
1
],
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
]],
[[
0
,
999
,
2
,
1004
],
[
0
,
10.1
,
1
,
11.1
],
[
0
,
100
,
1
,
101
],
[
0
,
0
,
0
,
0
]]])
exp_nms_scores
=
np
.
array
([[.
95
,
.
9
,
0
,
0
],
[.
85
,
.
5
,
.
3
,
0
]])
exp_nms_classes
=
np
.
array
([[
0
,
0
,
0
,
0
],
[
1
,
0
,
0
,
0
]])
exp_nms_additional_fields
=
{
'keypoints'
:
np
.
array
([[[[
0
,
0
],
[
0
,
0
]],
[[
6
,
7
],
[
8
,
9
]],
[[
0
,
0
],
[
0
,
0
]],
[[
0
,
0
],
[
0
,
0
]]],
[[[
10
,
11
],
[
12
,
13
]],
[[
13
,
14
],
[
15
,
16
]],
[[
8
,
9
],
[
10
,
11
]],
[[
0
,
0
],
[
0
,
0
]]]])
}
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
nmsed_additional_fields
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
additional_fields
=
additional_fields
)
self
.
assertIsNone
(
nmsed_masks
)
# Check static shapes
self
.
assertAllEqual
(
nmsed_boxes
.
shape
.
as_list
(),
exp_nms_corners
.
shape
)
self
.
assertAllEqual
(
nmsed_scores
.
shape
.
as_list
(),
exp_nms_scores
.
shape
)
self
.
assertAllEqual
(
nmsed_classes
.
shape
.
as_list
(),
exp_nms_classes
.
shape
)
self
.
assertEqual
(
len
(
nmsed_additional_fields
),
len
(
exp_nms_additional_fields
))
for
key
in
exp_nms_additional_fields
:
self
.
assertAllEqual
(
nmsed_additional_fields
[
key
].
shape
.
as_list
(),
exp_nms_additional_fields
[
key
].
shape
)
self
.
assertEqual
(
num_detections
.
shape
.
as_list
(),
[
2
])
with
self
.
test_session
()
as
sess
:
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_additional_fields
,
num_detections
)
=
sess
.
run
([
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_additional_fields
,
num_detections
])
self
.
assertAllClose
(
nmsed_boxes
,
exp_nms_corners
)
self
.
assertAllClose
(
nmsed_scores
,
exp_nms_scores
)
self
.
assertAllClose
(
nmsed_classes
,
exp_nms_classes
)
for
key
in
exp_nms_additional_fields
:
self
.
assertAllClose
(
nmsed_additional_fields
[
key
],
exp_nms_additional_fields
[
key
])
self
.
assertAllClose
(
num_detections
,
[
2
,
3
])
def
test_batch_multiclass_nms_with_dynamic_batch_size
(
self
):
boxes_placeholder
=
tf
.
placeholder
(
tf
.
float32
,
shape
=
(
None
,
None
,
2
,
4
))
scores_placeholder
=
tf
.
placeholder
(
tf
.
float32
,
shape
=
(
None
,
None
,
2
))
...
...
@@ -690,11 +781,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
[[
0
,
0
],
[
0
,
0
]]]])
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes_placeholder
,
scores_placeholder
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
masks
=
masks_placeholder
)
nmsed_additional_fields
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes_placeholder
,
scores_placeholder
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
masks
=
masks_placeholder
)
self
.
assertIsNone
(
nmsed_additional_fields
)
# Check static shapes
self
.
assertAllEqual
(
nmsed_boxes
.
shape
.
as_list
(),
[
None
,
4
,
4
])
self
.
assertAllEqual
(
nmsed_scores
.
shape
.
as_list
(),
[
None
,
4
])
...
...
@@ -765,10 +858,13 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
[[
0
,
0
],
[
0
,
0
]]]]
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
num_valid_boxes
=
num_valid_boxes
,
masks
=
masks
)
nmsed_additional_fields
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
num_valid_boxes
=
num_valid_boxes
,
masks
=
masks
)
self
.
assertIsNone
(
nmsed_additional_fields
)
with
self
.
test_session
()
as
sess
:
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
...
...
@@ -780,6 +876,84 @@ class MulticlassNonMaxSuppressionTest(tf.test.TestCase):
self
.
assertAllClose
(
num_detections
,
[
1
,
1
])
self
.
assertAllClose
(
nmsed_masks
,
exp_nms_masks
)
def
test_batch_multiclass_nms_with_additional_fields_and_num_valid_boxes
(
self
):
boxes
=
tf
.
constant
([[[[
0
,
0
,
1
,
1
],
[
0
,
0
,
4
,
5
]],
[[
0
,
0.1
,
1
,
1.1
],
[
0
,
0.1
,
2
,
1.1
]],
[[
0
,
-
0.1
,
1
,
0.9
],
[
0
,
-
0.1
,
1
,
0.9
]],
[[
0
,
10
,
1
,
11
],
[
0
,
10
,
1
,
11
]]],
[[[
0
,
10.1
,
1
,
11.1
],
[
0
,
10.1
,
1
,
11.1
]],
[[
0
,
100
,
1
,
101
],
[
0
,
100
,
1
,
101
]],
[[
0
,
1000
,
1
,
1002
],
[
0
,
999
,
2
,
1004
]],
[[
0
,
1000
,
1
,
1002.1
],
[
0
,
999
,
2
,
1002.7
]]]],
tf
.
float32
)
scores
=
tf
.
constant
([[[.
9
,
0.01
],
[.
75
,
0.05
],
[.
6
,
0.01
],
[.
95
,
0
]],
[[.
5
,
0.01
],
[.
3
,
0.01
],
[.
01
,
.
85
],
[.
01
,
.
5
]]])
additional_fields
=
{
'keypoints'
:
tf
.
constant
(
[[[[
6
,
7
],
[
8
,
9
]],
[[
0
,
1
],
[
2
,
3
]],
[[
0
,
0
],
[
0
,
0
]],
[[
0
,
0
],
[
0
,
0
]]],
[[[
13
,
14
],
[
15
,
16
]],
[[
8
,
9
],
[
10
,
11
]],
[[
10
,
11
],
[
12
,
13
]],
[[
0
,
0
],
[
0
,
0
]]]],
tf
.
float32
)
}
num_valid_boxes
=
tf
.
constant
([
1
,
1
],
tf
.
int32
)
score_thresh
=
0.1
iou_thresh
=
.
5
max_output_size
=
4
exp_nms_corners
=
[[[
0
,
0
,
1
,
1
],
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
]],
[[
0
,
10.1
,
1
,
11.1
],
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
]]]
exp_nms_scores
=
[[.
9
,
0
,
0
,
0
],
[.
5
,
0
,
0
,
0
]]
exp_nms_classes
=
[[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
]]
exp_nms_additional_fields
=
{
'keypoints'
:
np
.
array
([[[[
6
,
7
],
[
8
,
9
]],
[[
0
,
0
],
[
0
,
0
]],
[[
0
,
0
],
[
0
,
0
]],
[[
0
,
0
],
[
0
,
0
]]],
[[[
13
,
14
],
[
15
,
16
]],
[[
0
,
0
],
[
0
,
0
]],
[[
0
,
0
],
[
0
,
0
]],
[[
0
,
0
],
[
0
,
0
]]]])
}
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
nmsed_additional_fields
,
num_detections
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
boxes
,
scores
,
score_thresh
,
iou_thresh
,
max_size_per_class
=
max_output_size
,
max_total_size
=
max_output_size
,
num_valid_boxes
=
num_valid_boxes
,
additional_fields
=
additional_fields
)
self
.
assertIsNone
(
nmsed_masks
)
with
self
.
test_session
()
as
sess
:
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_additional_fields
,
num_detections
)
=
sess
.
run
([
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_additional_fields
,
num_detections
])
self
.
assertAllClose
(
nmsed_boxes
,
exp_nms_corners
)
self
.
assertAllClose
(
nmsed_scores
,
exp_nms_scores
)
self
.
assertAllClose
(
nmsed_classes
,
exp_nms_classes
)
for
key
in
exp_nms_additional_fields
:
self
.
assertAllClose
(
nmsed_additional_fields
[
key
],
exp_nms_additional_fields
[
key
])
self
.
assertAllClose
(
num_detections
,
[
1
,
1
])
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
research/object_detection/meta_architectures/BUILD
View file @
ff88581a
...
...
@@ -18,6 +18,7 @@ py_library(
"//tensorflow_models/object_detection/core:model"
,
"//tensorflow_models/object_detection/core:target_assigner"
,
"//tensorflow_models/object_detection/utils:shape_utils"
,
"//tensorflow_models/object_detection/utils:visualization_utils"
,
],
)
...
...
research/object_detection/meta_architectures/faster_rcnn_meta_arch.py
View file @
ff88581a
...
...
@@ -62,8 +62,6 @@ Following the API (see model.DetectionModel definition), our outputs after
postprocessing operations are always normalized boxes however, internally, we
sometimes convert to absolute --- e.g. for loss computation. In particular,
anchors and proposal_boxes are both represented as absolute coordinates.
TODO: Support TPU implementations and sigmoid loss.
"""
from
abc
import
abstractmethod
from
functools
import
partial
...
...
@@ -91,6 +89,7 @@ class FasterRCNNFeatureExtractor(object):
def
__init__
(
self
,
is_training
,
first_stage_features_stride
,
batch_norm_trainable
=
False
,
reuse_weights
=
None
,
weight_decay
=
0.0
):
"""Constructor.
...
...
@@ -99,11 +98,15 @@ class FasterRCNNFeatureExtractor(object):
is_training: A boolean indicating whether the training version of the
computation graph should be constructed.
first_stage_features_stride: Output stride of extracted RPN feature map.
batch_norm_trainable: Whether to update batch norm parameters during
training or not. When training with a relative large batch size
(e.g. 8), it could be desirable to enable batch norm update.
reuse_weights: Whether to reuse variables. Default is None.
weight_decay: float weight decay for feature extractor (default: 0.0).
"""
self
.
_is_training
=
is_training
self
.
_first_stage_features_stride
=
first_stage_features_stride
self
.
_train_batch_norm
=
(
batch_norm_trainable
and
is_training
)
self
.
_reuse_weights
=
reuse_weights
self
.
_weight_decay
=
weight_decay
...
...
@@ -214,7 +217,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
second_stage_score_conversion_fn
,
second_stage_localization_loss_weight
,
second_stage_classification_loss_weight
,
hard_example_miner
,
second_stage_classification_loss
,
second_stage_mask_prediction_loss_weight
=
1.0
,
hard_example_miner
=
None
,
parallel_iterations
=
16
):
"""FasterRCNNMetaArch Constructor.
...
...
@@ -225,10 +230,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
image_resizer_fn: A callable for image resizing. This callable always
takes a rank-3 image tensor (corresponding to a single image) and
returns a rank-3 image tensor, possibly with new spatial dimensions.
See builders/image_resizer_builder.py.
image_resizer_fn: A callable for image resizing. This callable
takes a rank-3 image tensor of shape [height, width, channels]
(corresponding to a single image) and returns a rank-3 image tensor,
possibly with new spatial dimensions. See
builders/image_resizer_builder.py.
feature_extractor: A FasterRCNNFeatureExtractor object.
first_stage_only: Whether to construct only the Region Proposal Network
(RPN) part of the model.
...
...
@@ -295,19 +301,28 @@ class FasterRCNNMetaArch(model.DetectionModel):
second_stage_score_conversion_fn: Callable elementwise nonlinearity
(that takes tensors as inputs and returns tensors). This is usually
used to convert logits to probabilities.
second_stage_localization_loss_weight: A float
second_stage_classification_loss_weight: A float
second_stage_localization_loss_weight: A float indicating the scale factor
for second stage localization loss.
second_stage_classification_loss_weight: A float indicating the scale
factor for second stage classification loss.
second_stage_classification_loss: Classification loss used by the second
stage classifier. Either losses.WeightedSigmoidClassificationLoss or
losses.WeightedSoftmaxClassificationLoss.
second_stage_mask_prediction_loss_weight: A float indicating the scale
factor for second stage mask prediction loss. This is applicable only if
second stage box predictor is configured to predict masks.
hard_example_miner: A losses.HardExampleMiner object (can be None).
parallel_iterations: (Optional) The number of iterations allowed to run
in parallel for calls to tf.map_fn.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
training time.
ValueError: If first_stage_anchor_generator is not of type
grid_anchor_generator.GridAnchorGenerator.
"""
super
(
FasterRCNNMetaArch
,
self
).
__init__
(
num_classes
=
num_classes
)
if
second_stage_batch_size
>
first_stage_max_proposals
:
if
is_training
and
second_stage_batch_size
>
first_stage_max_proposals
:
raise
ValueError
(
'second_stage_batch_size should be no greater than '
'first_stage_max_proposals.'
)
if
not
isinstance
(
first_stage_anchor_generator
,
...
...
@@ -375,10 +390,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
self
.
_second_stage_localization_loss
=
(
losses
.
WeightedSmoothL1LocalizationLoss
(
anchorwise_output
=
True
))
self
.
_second_stage_classification_loss
=
(
losses
.
WeightedSoftmaxClassificationLoss
(
anchorwise_output
=
True
))
self
.
_second_stage_classification_loss
=
second_stage_classification_loss
self
.
_second_stage_mask_loss
=
(
losses
.
WeightedSigmoidClassificationLoss
(
anchorwise_output
=
True
))
self
.
_second_stage_loc_loss_weight
=
second_stage_localization_loss_weight
self
.
_second_stage_cls_loss_weight
=
second_stage_classification_loss_weight
self
.
_second_stage_mask_loss_weight
=
(
second_stage_mask_prediction_loss_weight
)
self
.
_hard_example_miner
=
hard_example_miner
self
.
_parallel_iterations
=
parallel_iterations
...
...
@@ -491,7 +509,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
8) class_predictions_with_background: a
2
-D tensor with shape
8) class_predictions_with_background: a
3
-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
...
...
@@ -504,7 +522,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
`self.max_num_proposals` for each image.
10) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes
(
in absolute coordinates
)
.
decoded proposal bounding boxes in absolute coordinates.
11) mask_predictions: (optional) a 4-D tensor with shape
[total_num_padded_proposals, num_classes, mask_height, mask_width]
containing instance mask predictions.
...
...
@@ -553,10 +571,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
"""Predicts the output tensors from second stage of Faster R-CNN.
Args:
rpn_box_encodings:
3
-D float tensor of shape
rpn_box_encodings:
4
-D float tensor of shape
[batch_size, num_valid_anchors, self._box_coder.code_size] containing
predicted boxes.
rpn_objectness_predictions_with_background:
3
-D float tensor of shape
rpn_objectness_predictions_with_background:
2
-D float tensor of shape
[batch_size, num_valid_anchors, 2] containing class
predictions (logits) for each of the anchors. Note that this
tensor *includes* background class predictions (at class index 0).
...
...
@@ -573,7 +591,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
2) class_predictions_with_background: a
2
-D tensor with shape
2) class_predictions_with_background: a
3
-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
...
...
@@ -586,8 +604,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
`self.max_num_proposals` for each image.
4) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes (in absolute coordinates).
5) mask_predictions: (optional) a 4-D tensor with shape
decoded proposal bounding boxes in absolute coordinates.
5) proposal_boxes_normalized: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing decoded proposal
bounding boxes in normalized coordinates. Can be used to override the
boxes proposed by the RPN, thus enabling one to extract features and
get box classification and prediction for externally selected areas
of the image.
6) box_classifier_features: a 4-D float32 tensor representing the
features for each proposal.
7) mask_predictions: (optional) a 4-D tensor with shape
[total_num_padded_proposals, num_classes, mask_height, mask_width]
containing instance mask predictions.
"""
...
...
@@ -622,7 +648,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
class_predictions_with_background
,
'num_proposals'
:
num_proposals
,
'proposal_boxes'
:
absolute_proposal_boxes
,
'box_classifier_features'
:
box_classifier_features
,
'proposal_boxes_normalized'
:
proposal_boxes_normalized
,
}
if
box_predictor
.
MASK_PREDICTIONS
in
box_predictions
:
mask_predictions
=
tf
.
squeeze
(
box_predictions
[
box_predictor
.
MASK_PREDICTIONS
],
axis
=
1
)
prediction_dict
[
'mask_predictions'
]
=
mask_predictions
return
prediction_dict
def
_extract_rpn_feature_maps
(
self
,
preprocessed_inputs
):
...
...
@@ -729,10 +762,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
extent of the window to clip/prune to.
Returns:
box_encodings:
3
-D float tensor of shape
box_encodings:
4
-D float tensor of shape
[batch_size, num_valid_anchors, self._box_coder.code_size] containing
predicted boxes, where num_valid_anchors <= num_anchors
objectness_predictions_with_background:
3
-D float tensor of shape
objectness_predictions_with_background:
2
-D float tensor of shape
[batch_size, num_valid_anchors, 2] containing class
predictions (logits) for each of the anchors, where
num_valid_anchors <= num_anchors. Note that this
...
...
@@ -813,7 +846,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
return
{
'detection_boxes'
:
proposal_boxes
,
'detection_scores'
:
proposal_scores
,
'num_detections'
:
num_proposals
'num_detections'
:
tf
.
to_float
(
num_proposals
)
}
with
tf
.
name_scope
(
'SecondStagePostprocessor'
):
mask_predictions
=
prediction_dict
.
get
(
box_predictor
.
MASK_PREDICTIONS
)
...
...
@@ -877,7 +910,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
rpn_objectness_softmax_without_background
=
tf
.
nn
.
softmax
(
rpn_objectness_predictions_with_background_batch
)[:,
:,
1
]
clip_window
=
tf
.
to_float
(
tf
.
stack
([
0
,
0
,
image_shape
[
1
],
image_shape
[
2
]]))
(
proposal_boxes
,
proposal_scores
,
_
,
_
,
(
proposal_boxes
,
proposal_scores
,
_
,
_
,
_
,
num_proposals
)
=
post_processing
.
batch_multiclass_non_max_suppression
(
tf
.
expand_dims
(
proposal_boxes
,
axis
=
2
),
tf
.
expand_dims
(
rpn_objectness_softmax_without_background
,
...
...
@@ -891,7 +924,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes
=
tf
.
stop_gradient
(
proposal_boxes
)
if
not
self
.
_hard_example_miner
:
(
groundtruth_boxlists
,
groundtruth_classes_with_background_list
,
)
=
self
.
_format_groundtruth_data
(
image_shape
)
_
)
=
self
.
_format_groundtruth_data
(
image_shape
)
(
proposal_boxes
,
proposal_scores
,
num_proposals
)
=
self
.
_unpad_proposals_and_sample_box_classifier_batch
(
proposal_boxes
,
proposal_scores
,
num_proposals
,
...
...
@@ -998,6 +1031,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
for target assignment, we:
1) convert boxes to absolute coordinates,
2) add a background class at class index 0
3) groundtruth instance masks, if available, are resized to match
image_shape.
Args:
image_shape: A 1-D int32 tensor of shape [4] representing the shape of the
...
...
@@ -1009,6 +1044,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
groundtruth_classes_with_background_list: A list of 2-D one-hot
(or k-hot) tensors of shape [num_boxes, num_classes+1] containing the
class targets with the 0th index assumed to map to the background class.
groundtruth_masks_list: If present, a list of 3-D tf.float32 tensors of
shape [num_boxes, image_height, image_width] containing instance masks.
This is set to None if no masks exist in the provided groundtruth.
"""
groundtruth_boxlists
=
[
box_list_ops
.
to_absolute_coordinates
(
...
...
@@ -1019,7 +1057,22 @@ class FasterRCNNMetaArch(model.DetectionModel):
tf
.
pad
(
one_hot_encoding
,
[[
0
,
0
],
[
1
,
0
]],
mode
=
'CONSTANT'
))
for
one_hot_encoding
in
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
classes
)]
return
groundtruth_boxlists
,
groundtruth_classes_with_background_list
groundtruth_masks_list
=
self
.
_groundtruth_lists
.
get
(
fields
.
BoxListFields
.
masks
)
if
groundtruth_masks_list
is
not
None
:
resized_masks_list
=
[]
for
mask
in
groundtruth_masks_list
:
resized_4d_mask
=
tf
.
image
.
resize_images
(
tf
.
expand_dims
(
mask
,
axis
=
3
),
image_shape
[
1
:
3
],
method
=
tf
.
image
.
ResizeMethod
.
NEAREST_NEIGHBOR
,
align_corners
=
True
)
resized_masks_list
.
append
(
tf
.
squeeze
(
resized_4d_mask
,
axis
=
3
))
groundtruth_masks_list
=
resized_masks_list
return
(
groundtruth_boxlists
,
groundtruth_classes_with_background_list
,
groundtruth_masks_list
)
def
_sample_box_classifier_minibatch
(
self
,
proposal_boxlist
,
...
...
@@ -1100,29 +1153,26 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes
,
num_proposals
,
image_shape
,
mask_predictions
=
None
,
mask_threshold
=
0.5
):
mask_predictions
=
None
):
"""Converts predictions from the second stage box classifier to detections.
Args:
refined_box_encodings: a 3-D tensor with shape
refined_box_encodings: a 3-D
float
tensor with shape
[total_num_padded_proposals, num_classes, 4] representing predicted
(final) refined box encodings.
class_predictions_with_background: a 3-D tensor with shape
class_predictions_with_background: a 3-D float tensor with shape
[total_num_padded_proposals, num_classes + 1] containing class
predictions (logits) for each of the proposals. Note that this tensor
*includes* background class predictions (at class index 0).
proposal_boxes:
[batch_size, self.max_num_proposals, 4] representing
decoded proposal
bounding boxes.
num_proposals: A Tensor of type `int32`. A 1-D tensor of shape [batch]
representing the number of proposals predicted for each image in
the batch.
image_shape: a 1-D tensor representing the input image shape.
mask_predictions: (optional) a 4-D tensor with shape
proposal_boxes:
a 3-D float tensor with shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal
bounding boxes in absolute coordinates.
num_proposals: a 1-D int32 tensor of shape [batch] representing the number
of proposals predicted for each image in
the batch.
image_shape: a 1-D
int32
tensor representing the input image shape.
mask_predictions: (optional) a 4-D
float
tensor with shape
[total_num_padded_proposals, num_classes, mask_height, mask_width]
containing instance mask predictions.
mask_threshold: a scalar threshold determining which mask values are
rounded to 0 or 1.
containing instance mask prediction logits.
Returns:
A dictionary containing:
...
...
@@ -1131,7 +1181,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
`detection_classes`: [batch, max_detections]
`num_detections`: [batch]
`detection_masks`:
(optional) [batch, max_detections, mask_height, mask_width]
(optional) [batch, max_detections, mask_height, mask_width]. Note
that a pixel-wise sigmoid score converter is applied to the detection
masks.
"""
refined_box_encodings_batch
=
tf
.
reshape
(
refined_box_encodings
,
[
-
1
,
self
.
max_num_proposals
,
...
...
@@ -1156,10 +1208,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
if
mask_predictions
is
not
None
:
mask_height
=
mask_predictions
.
shape
[
2
].
value
mask_width
=
mask_predictions
.
shape
[
3
].
value
mask_predictions
=
tf
.
sigmoid
(
mask_predictions
)
mask_predictions_batch
=
tf
.
reshape
(
mask_predictions
,
[
-
1
,
self
.
max_num_proposals
,
self
.
num_classes
,
mask_height
,
mask_width
])
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
nmsed_masks
,
_
,
num_detections
)
=
self
.
_second_stage_nms_fn
(
refined_decoded_boxes_batch
,
class_predictions_batch
,
...
...
@@ -1173,26 +1226,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
'num_detections'
:
tf
.
to_float
(
num_detections
)}
if
nmsed_masks
is
not
None
:
detections
[
'detection_masks'
]
=
nmsed_masks
if
mask_predictions
is
not
None
:
detections
[
'detection_masks'
]
=
tf
.
to_float
(
tf
.
greater_equal
(
detections
[
'detection_masks'
],
mask_threshold
))
return
detections
def
_batch_decode_boxes
(
self
,
box_encodings
,
anchor_boxes
):
"""Decode tensor of refined box encodings.
Args:
refined_box_encodings: a 4-D tensor with shape
[batch_size, max_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings.
proposal_boxes: [batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes.
Returns:
refined_box_predictions: a [batch_size, max_num_proposals, num_classes, 4]
float tensor representing (padded) refined bounding box predictions
(for each image in batch, proposal and class).
"""
"""Decodes box encodings with respect to the anchor boxes.
Args:
...
...
@@ -1246,7 +1282,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
corresponding loss values.
"""
with
tf
.
name_scope
(
scope
,
'Loss'
,
prediction_dict
.
values
()):
(
groundtruth_boxlists
,
groundtruth_classes_with_background_list
(
groundtruth_boxlists
,
groundtruth_classes_with_background_list
,
groundtruth_masks_list
)
=
self
.
_format_groundtruth_data
(
prediction_dict
[
'image_shape'
])
loss_dict
=
self
.
_loss_rpn
(
prediction_dict
[
'rpn_box_encodings'
],
...
...
@@ -1262,7 +1299,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
prediction_dict
[
'proposal_boxes'
],
prediction_dict
[
'num_proposals'
],
groundtruth_boxlists
,
groundtruth_classes_with_background_list
))
groundtruth_classes_with_background_list
,
prediction_dict
[
'image_shape'
],
prediction_dict
.
get
(
'mask_predictions'
),
groundtruth_masks_list
,
))
return
loss_dict
def
_loss_rpn
(
self
,
...
...
@@ -1278,10 +1319,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
participate in the loss computation, and returns the RPN losses.
Args:
rpn_box_encodings: A
3
-D float tensor of shape
rpn_box_encodings: A
4
-D float tensor of shape
[batch_size, num_anchors, self._box_coder.code_size] containing
predicted proposal box encodings.
rpn_objectness_predictions_with_background: A
3
-D float tensor of shape
rpn_objectness_predictions_with_background: A
2
-D float tensor of shape
[batch_size, num_anchors, 2] containing objectness predictions
(logits) for each of the anchors with 0 corresponding to background
and 1 corresponding to object.
...
...
@@ -1334,12 +1375,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
tf
.
reduce_sum
(
localization_losses
,
axis
=
1
)
/
normalizer
)
objectness_loss
=
tf
.
reduce_mean
(
tf
.
reduce_sum
(
objectness_losses
,
axis
=
1
)
/
normalizer
)
loss_dict
=
{
'first_stage_localization_loss'
:
self
.
_first_stage_loc_loss_weight
*
localization_loss
,
'first_stage_objectness_loss'
:
self
.
_first_stage_obj_loss_weight
*
objectness_loss
,
}
loss_dict
=
{}
with
tf
.
name_scope
(
'localization_loss'
):
loss_dict
[
'first_stage_localization_loss'
]
=
(
self
.
_first_stage_loc_loss_weight
*
localization_loss
)
with
tf
.
name_scope
(
'objectness_loss'
):
loss_dict
[
'first_stage_objectness_loss'
]
=
(
self
.
_first_stage_obj_loss_weight
*
objectness_loss
)
return
loss_dict
def
_loss_box_classifier
(
self
,
...
...
@@ -1348,17 +1391,23 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes
,
num_proposals
,
groundtruth_boxlists
,
groundtruth_classes_with_background_list
):
groundtruth_classes_with_background_list
,
image_shape
,
prediction_masks
=
None
,
groundtruth_masks_list
=
None
):
"""Computes scalar box classifier loss tensors.
Uses self._detector_target_assigner to obtain regression and classification
targets for the second stage box classifier, optionally performs
hard mining, and returns losses. All losses are computed independently
for each image and then averaged across the batch.
Please note that for boxes and masks with multiple labels, the box
regression and mask prediction losses are only computed for one label.
This function assumes that the proposal boxes in the "padded" regions are
actually zero (and thus should not be matched to).
Args:
refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, box_coder.code_size] representing
...
...
@@ -1377,11 +1426,23 @@ class FasterRCNNMetaArch(model.DetectionModel):
groundtruth_classes_with_background_list: a list of 2-D one-hot
(or k-hot) tensors of shape [num_boxes, num_classes + 1] containing the
class targets with the 0th index assumed to map to the background class.
image_shape: a 1-D tensor of shape [4] representing the image shape.
prediction_masks: an optional 4-D tensor with shape [total_num_proposals,
num_classes, mask_height, mask_width] containing the instance masks for
each box.
groundtruth_masks_list: an optional list of 3-D tensors of shape
[num_boxes, image_height, image_width] containing the instance masks for
each of the boxes.
Returns:
a dictionary mapping loss keys ('second_stage_localization_loss',
'second_stage_classification_loss') to scalar tensors representing
corresponding loss values.
Raises:
ValueError: if `predict_instance_masks` in
second_stage_mask_rcnn_box_predictor is True and
`groundtruth_masks_list` is not provided.
"""
with
tf
.
name_scope
(
'BoxClassifierLoss'
):
paddings_indicator
=
self
.
_padded_batched_proposals_indicator
(
...
...
@@ -1409,9 +1470,20 @@ class FasterRCNNMetaArch(model.DetectionModel):
[
batch_size
*
self
.
max_num_proposals
,
-
1
])
refined_box_encodings_with_background
=
tf
.
pad
(
refined_box_encodings
,
[[
0
,
0
],
[
1
,
0
],
[
0
,
0
]])
# For anchors with multiple labels, picks refined_location_encodings
# for just one class to avoid over-counting for regression loss and
# (optionally) mask loss.
one_hot_flat_cls_targets_with_background
=
tf
.
argmax
(
flat_cls_targets_with_background
,
axis
=
1
)
one_hot_flat_cls_targets_with_background
=
tf
.
one_hot
(
one_hot_flat_cls_targets_with_background
,
flat_cls_targets_with_background
.
get_shape
()[
1
])
refined_box_encodings_masked_by_class_targets
=
tf
.
boolean_mask
(
refined_box_encodings_with_background
,
tf
.
greater
(
flat_cls_targets_with_background
,
0
))
tf
.
greater
(
one_hot_flat_cls_targets_with_background
,
0
))
class_predictions_with_background
=
tf
.
reshape
(
class_predictions_with_background
,
[
batch_size
,
self
.
max_num_proposals
,
-
1
])
reshaped_refined_box_encodings
=
tf
.
reshape
(
refined_box_encodings_masked_by_class_targets
,
[
batch_size
,
-
1
,
4
])
...
...
@@ -1433,12 +1505,82 @@ class FasterRCNNMetaArch(model.DetectionModel):
)
=
self
.
_unpad_proposals_and_apply_hard_mining
(
proposal_boxlists
,
second_stage_loc_losses
,
second_stage_cls_losses
,
num_proposals
)
loss_dict
=
{
'second_stage_localization_loss'
:
(
self
.
_second_stage_loc_loss_weight
*
second_stage_loc_loss
),
'second_stage_classification_loss'
:
(
self
.
_second_stage_cls_loss_weight
*
second_stage_cls_loss
),
}
loss_dict
=
{}
with
tf
.
name_scope
(
'localization_loss'
):
loss_dict
[
'second_stage_localization_loss'
]
=
(
self
.
_second_stage_loc_loss_weight
*
second_stage_loc_loss
)
with
tf
.
name_scope
(
'classification_loss'
):
loss_dict
[
'second_stage_classification_loss'
]
=
(
self
.
_second_stage_cls_loss_weight
*
second_stage_cls_loss
)
second_stage_mask_loss
=
None
if
prediction_masks
is
not
None
:
if
groundtruth_masks_list
is
None
:
raise
ValueError
(
'Groundtruth instance masks not provided. '
'Please configure input reader.'
)
# Create a new target assigner that matches the proposals to groundtruth
# and returns the mask targets.
# TODO: Move `unmatched_cls_target` from constructor to assign function.
# This will enable reuse of a single target assigner for both class
# targets and mask targets.
mask_target_assigner
=
target_assigner
.
create_target_assigner
(
'FasterRCNN'
,
'detection'
,
unmatched_cls_target
=
tf
.
zeros
(
image_shape
[
1
:
3
],
dtype
=
tf
.
float32
))
(
batch_mask_targets
,
_
,
_
,
batch_mask_target_weights
,
_
)
=
target_assigner
.
batch_assign_targets
(
mask_target_assigner
,
proposal_boxlists
,
groundtruth_boxlists
,
groundtruth_masks_list
)
# Pad the prediction_masks with zeros for the background class to be
# consistent with class predictions.
prediction_masks_with_background
=
tf
.
pad
(
prediction_masks
,
[[
0
,
0
],
[
1
,
0
],
[
0
,
0
],
[
0
,
0
]])
prediction_masks_masked_by_class_targets
=
tf
.
boolean_mask
(
prediction_masks_with_background
,
tf
.
greater
(
one_hot_flat_cls_targets_with_background
,
0
))
mask_height
=
prediction_masks
.
shape
[
2
].
value
mask_width
=
prediction_masks
.
shape
[
3
].
value
reshaped_prediction_masks
=
tf
.
reshape
(
prediction_masks_masked_by_class_targets
,
[
batch_size
,
-
1
,
mask_height
*
mask_width
])
batch_mask_targets_shape
=
tf
.
shape
(
batch_mask_targets
)
flat_gt_masks
=
tf
.
reshape
(
batch_mask_targets
,
[
-
1
,
batch_mask_targets_shape
[
2
],
batch_mask_targets_shape
[
3
]])
# Use normalized proposals to crop mask targets from image masks.
flat_normalized_proposals
=
box_list_ops
.
to_normalized_coordinates
(
box_list
.
BoxList
(
tf
.
reshape
(
proposal_boxes
,
[
-
1
,
4
])),
image_shape
[
1
],
image_shape
[
2
]).
get
()
flat_cropped_gt_mask
=
tf
.
image
.
crop_and_resize
(
tf
.
expand_dims
(
flat_gt_masks
,
-
1
),
flat_normalized_proposals
,
tf
.
range
(
flat_normalized_proposals
.
shape
[
0
].
value
),
[
mask_height
,
mask_width
])
batch_cropped_gt_mask
=
tf
.
reshape
(
flat_cropped_gt_mask
,
[
batch_size
,
-
1
,
mask_height
*
mask_width
])
second_stage_mask_losses
=
self
.
_second_stage_mask_loss
(
reshaped_prediction_masks
,
batch_cropped_gt_mask
,
weights
=
batch_mask_target_weights
)
/
(
mask_height
*
mask_width
*
tf
.
maximum
(
tf
.
reduce_sum
(
batch_mask_target_weights
,
axis
=
1
,
keep_dims
=
True
),
tf
.
ones
((
batch_size
,
1
))))
second_stage_mask_loss
=
tf
.
reduce_sum
(
tf
.
boolean_mask
(
second_stage_mask_losses
,
paddings_indicator
))
if
second_stage_mask_loss
is
not
None
:
with
tf
.
name_scope
(
'mask_loss'
):
loss_dict
[
'second_stage_mask_loss'
]
=
(
self
.
_second_stage_mask_loss_weight
*
second_stage_mask_loss
)
return
loss_dict
def
_padded_batched_proposals_indicator
(
self
,
...
...
research/object_detection/meta_architectures/faster_rcnn_meta_arch_test.py
View file @
ff88581a
...
...
@@ -15,6 +15,7 @@
"""Tests for object_detection.meta_architectures.faster_rcnn_meta_arch."""
import
numpy
as
np
import
tensorflow
as
tf
from
object_detection.meta_architectures
import
faster_rcnn_meta_arch_test_lib
...
...
@@ -46,19 +47,19 @@ class FasterRCNNMetaArchTest(
mask_height
=
2
mask_width
=
2
mask_predictions
=
.
6
*
tf
.
ones
(
mask_predictions
=
30
.
*
tf
.
ones
(
[
total_num_padded_proposals
,
model
.
num_classes
,
mask_height
,
mask_width
],
dtype
=
tf
.
float32
)
exp_detection_masks
=
[[[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]]],
[[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
0
,
0
],
[
0
,
0
]]]]
exp_detection_masks
=
np
.
array
(
[[[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]]],
[[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
1
,
1
],
[
1
,
1
]],
[[
0
,
0
],
[
0
,
0
]]]]
)
detections
=
model
.
postprocess
({
'refined_box_encodings'
:
refined_box_encodings
,
...
...
@@ -79,6 +80,17 @@ class FasterRCNNMetaArchTest(
self
.
assertAllClose
(
detections_out
[
'detection_masks'
],
exp_detection_masks
)
def
_get_box_classifier_features_shape
(
self
,
image_size
,
batch_size
,
max_num_proposals
,
initial_crop_size
,
maxpool_stride
,
num_features
):
return
(
batch_size
*
max_num_proposals
,
initial_crop_size
/
maxpool_stride
,
initial_crop_size
/
maxpool_stride
,
num_features
)
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
research/object_detection/meta_architectures/faster_rcnn_meta_arch_test_lib.py
View file @
ff88581a
...
...
@@ -113,7 +113,8 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
second_stage_batch_size
,
first_stage_max_proposals
=
8
,
num_classes
=
2
,
hard_mining
=
False
):
hard_mining
=
False
,
softmax_second_stage_classification_loss
=
True
):
def
image_resizer_fn
(
image
):
return
tf
.
identity
(
image
)
...
...
@@ -178,6 +179,12 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
second_stage_score_conversion_fn
=
tf
.
identity
second_stage_localization_loss_weight
=
1.0
second_stage_classification_loss_weight
=
1.0
if
softmax_second_stage_classification_loss
:
second_stage_classification_loss
=
(
losses
.
WeightedSoftmaxClassificationLoss
(
anchorwise_output
=
True
))
else
:
second_stage_classification_loss
=
(
losses
.
WeightedSigmoidClassificationLoss
(
anchorwise_output
=
True
))
hard_example_miner
=
None
if
hard_mining
:
...
...
@@ -221,52 +228,68 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
second_stage_localization_loss_weight
,
'second_stage_classification_loss_weight'
:
second_stage_classification_loss_weight
,
'second_stage_classification_loss'
:
second_stage_classification_loss
,
'hard_example_miner'
:
hard_example_miner
}
return
self
.
_get_model
(
self
.
_get_second_stage_box_predictor
(
num_classes
=
num_classes
,
is_training
=
is_training
),
**
common_kwargs
)
def
test_predict_correct_shapes_in_inference_mode_
both
_stage
s
(
def
test_predict_
gives_
correct_shapes_in_inference_mode_
first
_stage
_only
(
self
):
batch_size
=
2
image_size
=
10
input_shapes
=
[(
batch_size
,
image_size
,
image_size
,
3
),
(
None
,
image_size
,
image_size
,
3
),
(
batch_size
,
None
,
None
,
3
),
(
None
,
None
,
None
,
3
)]
expected_num_anchors
=
image_size
*
image_size
*
3
*
3
expected_shapes
=
{
'rpn_box_predictor_features'
:
(
2
,
image_size
,
image_size
,
512
),
'rpn_features_to_crop'
:
(
2
,
image_size
,
image_size
,
3
),
'image_shape'
:
(
4
,),
'rpn_box_encodings'
:
(
2
,
expected_num_anchors
,
4
),
'rpn_objectness_predictions_with_background'
:
(
2
,
expected_num_anchors
,
2
),
'anchors'
:
(
expected_num_anchors
,
4
),
'refined_box_encodings'
:
(
2
*
8
,
2
,
4
),
'class_predictions_with_background'
:
(
2
*
8
,
2
+
1
),
'num_proposals'
:
(
2
,),
'proposal_boxes'
:
(
2
,
8
,
4
),
}
for
input_shape
in
input_shapes
:
test_graph
=
tf
.
Graph
()
with
test_graph
.
as_default
():
model
=
self
.
_build_model
(
is_training
=
False
,
first_stage_only
=
False
,
second_stage_batch_size
=
2
)
preprocessed_inputs
=
tf
.
placeholder
(
tf
.
float32
,
shape
=
input_shape
)
result_tensor_dict
=
model
.
predict
(
preprocessed_inputs
)
init_op
=
tf
.
global_variables_initializer
()
with
self
.
test_session
(
graph
=
test_graph
)
as
sess
:
test_graph
=
tf
.
Graph
()
with
test_graph
.
as_default
():
model
=
self
.
_build_model
(
is_training
=
False
,
first_stage_only
=
True
,
second_stage_batch_size
=
2
)
batch_size
=
2
height
=
10
width
=
12
input_image_shape
=
(
batch_size
,
height
,
width
,
3
)
preprocessed_inputs
=
tf
.
placeholder
(
dtype
=
tf
.
float32
,
shape
=
(
batch_size
,
None
,
None
,
3
))
prediction_dict
=
model
.
predict
(
preprocessed_inputs
)
# In inference mode, anchors are clipped to the image window, but not
# pruned. Since MockFasterRCNN.extract_proposal_features returns a
# tensor with the same shape as its input, the expected number of anchors
# is height * width * the number of anchors per location (i.e. 3x3).
expected_num_anchors
=
height
*
width
*
3
*
3
expected_output_keys
=
set
([
'rpn_box_predictor_features'
,
'rpn_features_to_crop'
,
'image_shape'
,
'rpn_box_encodings'
,
'rpn_objectness_predictions_with_background'
,
'anchors'
])
expected_output_shapes
=
{
'rpn_box_predictor_features'
:
(
batch_size
,
height
,
width
,
512
),
'rpn_features_to_crop'
:
(
batch_size
,
height
,
width
,
3
),
'rpn_box_encodings'
:
(
batch_size
,
expected_num_anchors
,
4
),
'rpn_objectness_predictions_with_background'
:
(
batch_size
,
expected_num_anchors
,
2
),
'anchors'
:
(
expected_num_anchors
,
4
)
}
init_op
=
tf
.
global_variables_initializer
()
with
self
.
test_session
()
as
sess
:
sess
.
run
(
init_op
)
tensor_dict_out
=
sess
.
run
(
result_tensor_dict
,
feed_dict
=
{
preprocessed_inputs
:
np
.
zeros
((
batch_size
,
image_size
,
image_size
,
3
))})
self
.
assertEqual
(
set
(
tensor_dict_out
.
keys
()),
set
(
expected_shapes
.
keys
()))
for
key
in
expected_shapes
:
self
.
assertAllEqual
(
tensor_dict_out
[
key
].
shape
,
expected_shapes
[
key
])
prediction_out
=
sess
.
run
(
prediction_dict
,
feed_dict
=
{
preprocessed_inputs
:
np
.
zeros
(
input_image_shape
)
})
self
.
assertEqual
(
set
(
prediction_out
.
keys
()),
expected_output_keys
)
self
.
assertAllEqual
(
prediction_out
[
'image_shape'
],
input_image_shape
)
for
output_key
,
expected_shape
in
expected_output_shapes
.
items
():
self
.
assertAllEqual
(
prediction_out
[
output_key
].
shape
,
expected_shape
)
# Check that anchors are clipped to window.
anchors
=
prediction_out
[
'anchors'
]
self
.
assertTrue
(
np
.
all
(
np
.
greater_equal
(
anchors
,
0
)))
self
.
assertTrue
(
np
.
all
(
np
.
less_equal
(
anchors
[:,
0
],
height
)))
self
.
assertTrue
(
np
.
all
(
np
.
less_equal
(
anchors
[:,
1
],
width
)))
self
.
assertTrue
(
np
.
all
(
np
.
less_equal
(
anchors
[:,
2
],
height
)))
self
.
assertTrue
(
np
.
all
(
np
.
less_equal
(
anchors
[:,
3
],
width
)))
def
test_predict_gives_valid_anchors_in_training_mode_first_stage_only
(
self
):
test_graph
=
tf
.
Graph
()
...
...
@@ -321,48 +344,73 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
prediction_out
[
'rpn_objectness_predictions_with_background'
].
shape
,
(
batch_size
,
num_anchors_out
,
2
))
def
test_predict_gives_correct_shapes_in_inference_mode_both_stages
(
self
):
test_graph
=
tf
.
Graph
()
with
test_graph
.
as_default
():
model
=
self
.
_build_model
(
is_training
=
False
,
first_stage_only
=
False
,
second_stage_batch_size
=
2
)
batch_size
=
2
image_size
=
10
image_shape
=
(
batch_size
,
image_size
,
image_size
,
3
)
preprocessed_inputs
=
tf
.
zeros
(
image_shape
,
dtype
=
tf
.
float32
)
result_tensor_dict
=
model
.
predict
(
preprocessed_inputs
)
expected_num_anchors
=
image_size
*
image_size
*
3
*
3
def
test_predict_correct_shapes_in_inference_mode_both_stages
(
self
):
batch_size
=
2
image_size
=
10
max_num_proposals
=
8
initial_crop_size
=
3
maxpool_stride
=
1
expected_shapes
=
{
'rpn_box_predictor_features'
:
(
2
,
image_size
,
image_size
,
512
),
'rpn_features_to_crop'
:
(
2
,
image_size
,
image_size
,
3
),
'image_shape'
:
(
4
,),
'rpn_box_encodings'
:
(
2
,
expected_num_anchors
,
4
),
'rpn_objectness_predictions_with_background'
:
(
2
,
expected_num_anchors
,
2
),
'anchors'
:
(
expected_num_anchors
,
4
),
'refined_box_encodings'
:
(
2
*
8
,
2
,
4
),
'class_predictions_with_background'
:
(
2
*
8
,
2
+
1
),
'num_proposals'
:
(
2
,),
'proposal_boxes'
:
(
2
,
8
,
4
),
}
init_op
=
tf
.
global_variables_initializer
()
with
self
.
test_session
()
as
sess
:
input_shapes
=
[(
batch_size
,
image_size
,
image_size
,
3
),
(
None
,
image_size
,
image_size
,
3
),
(
batch_size
,
None
,
None
,
3
),
(
None
,
None
,
None
,
3
)]
expected_num_anchors
=
image_size
*
image_size
*
3
*
3
expected_shapes
=
{
'rpn_box_predictor_features'
:
(
2
,
image_size
,
image_size
,
512
),
'rpn_features_to_crop'
:
(
2
,
image_size
,
image_size
,
3
),
'image_shape'
:
(
4
,),
'rpn_box_encodings'
:
(
2
,
expected_num_anchors
,
4
),
'rpn_objectness_predictions_with_background'
:
(
2
,
expected_num_anchors
,
2
),
'anchors'
:
(
expected_num_anchors
,
4
),
'refined_box_encodings'
:
(
2
*
max_num_proposals
,
2
,
4
),
'class_predictions_with_background'
:
(
2
*
max_num_proposals
,
2
+
1
),
'num_proposals'
:
(
2
,),
'proposal_boxes'
:
(
2
,
max_num_proposals
,
4
),
'proposal_boxes_normalized'
:
(
2
,
max_num_proposals
,
4
),
'box_classifier_features'
:
self
.
_get_box_classifier_features_shape
(
image_size
,
batch_size
,
max_num_proposals
,
initial_crop_size
,
maxpool_stride
,
3
)
}
for
input_shape
in
input_shapes
:
test_graph
=
tf
.
Graph
()
with
test_graph
.
as_default
():
model
=
self
.
_build_model
(
is_training
=
False
,
first_stage_only
=
False
,
second_stage_batch_size
=
2
)
preprocessed_inputs
=
tf
.
placeholder
(
tf
.
float32
,
shape
=
input_shape
)
result_tensor_dict
=
model
.
predict
(
preprocessed_inputs
)
init_op
=
tf
.
global_variables_initializer
()
with
self
.
test_session
(
graph
=
test_graph
)
as
sess
:
sess
.
run
(
init_op
)
tensor_dict_out
=
sess
.
run
(
result_tensor_dict
)
self
.
assertEqual
(
set
(
tensor_dict_out
.
keys
()),
set
(
expected_shapes
.
keys
()))
for
key
in
expected_shapes
:
self
.
assertAllEqual
(
tensor_dict_out
[
key
].
shape
,
expected_shapes
[
key
])
tensor_dict_out
=
sess
.
run
(
result_tensor_dict
,
feed_dict
=
{
preprocessed_inputs
:
np
.
zeros
((
batch_size
,
image_size
,
image_size
,
3
))})
self
.
assertEqual
(
set
(
tensor_dict_out
.
keys
()),
set
(
expected_shapes
.
keys
()))
for
key
in
expected_shapes
:
self
.
assertAllEqual
(
tensor_dict_out
[
key
].
shape
,
expected_shapes
[
key
])
def
test_predict_gives_correct_shapes_in_train_mode_both_stages
(
self
):
test_graph
=
tf
.
Graph
()
with
test_graph
.
as_default
():
model
=
self
.
_build_model
(
is_training
=
True
,
first_stage_only
=
False
,
second_stage_batch_size
=
7
)
batch_size
=
2
image_size
=
10
max_num_proposals
=
7
initial_crop_size
=
3
maxpool_stride
=
1
image_shape
=
(
batch_size
,
image_size
,
image_size
,
3
)
preprocessed_inputs
=
tf
.
zeros
(
image_shape
,
dtype
=
tf
.
float32
)
groundtruth_boxes_list
=
[
...
...
@@ -381,11 +429,20 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
(
2
,
image_size
,
image_size
,
512
),
'rpn_features_to_crop'
:
(
2
,
image_size
,
image_size
,
3
),
'image_shape'
:
(
4
,),
'refined_box_encodings'
:
(
2
*
7
,
2
,
4
),
'class_predictions_with_background'
:
(
2
*
7
,
2
+
1
),
'refined_box_encodings'
:
(
2
*
max_num_proposals
,
2
,
4
),
'class_predictions_with_background'
:
(
2
*
max_num_proposals
,
2
+
1
),
'num_proposals'
:
(
2
,),
'proposal_boxes'
:
(
2
,
7
,
4
),
'proposal_boxes'
:
(
2
,
max_num_proposals
,
4
),
'proposal_boxes_normalized'
:
(
2
,
max_num_proposals
,
4
),
'box_classifier_features'
:
self
.
_get_box_classifier_features_shape
(
image_size
,
batch_size
,
max_num_proposals
,
initial_crop_size
,
maxpool_stride
,
3
)
}
init_op
=
tf
.
global_variables_initializer
()
with
self
.
test_session
()
as
sess
:
sess
.
run
(
init_op
)
...
...
@@ -600,6 +657,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
preprocessed_inputs
=
model
.
preprocess
(
image_placeholder
)
self
.
assertAllEqual
(
preprocessed_inputs
.
shape
.
as_list
(),
image_shape
)
# TODO: Split test into two - with and without masks.
def
test_loss_first_stage_only_mode
(
self
):
model
=
self
.
_build_model
(
is_training
=
True
,
first_stage_only
=
True
,
second_stage_batch_size
=
6
)
...
...
@@ -650,6 +708,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self
.
assertTrue
(
'second_stage_localization_loss'
not
in
loss_dict_out
)
self
.
assertTrue
(
'second_stage_classification_loss'
not
in
loss_dict_out
)
# TODO: Split test into two - with and without masks.
def
test_loss_full
(
self
):
model
=
self
.
_build_model
(
is_training
=
True
,
first_stage_only
=
False
,
second_stage_batch_size
=
6
)
...
...
@@ -702,12 +761,26 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[
10
,
-
10
,
-
10
],
[
-
10
,
10
,
-
10
]],
dtype
=
tf
.
float32
)
mask_predictions_logits
=
20
*
tf
.
ones
((
batch_size
*
model
.
max_num_proposals
,
model
.
num_classes
,
14
,
14
),
dtype
=
tf
.
float32
)
groundtruth_boxes_list
=
[
tf
.
constant
([[
0
,
0
,
.
5
,
.
5
],
[.
5
,
.
5
,
1
,
1
]],
dtype
=
tf
.
float32
),
tf
.
constant
([[
0
,
.
5
,
.
5
,
1
],
[.
5
,
0
,
1
,
.
5
]],
dtype
=
tf
.
float32
)]
groundtruth_classes_list
=
[
tf
.
constant
([[
1
,
0
],
[
0
,
1
]],
dtype
=
tf
.
float32
),
tf
.
constant
([[
1
,
0
],
[
1
,
0
]],
dtype
=
tf
.
float32
)]
# Set all elements of groundtruth mask to 1.0. In this case all proposal
# crops of the groundtruth masks should return a mask that covers the entire
# proposal. Thus, if all mask_predictions_logits values are large and positive
# (20 here), the mask loss should be approximately zero.
groundtruth_masks_list
=
[
tf
.
convert_to_tensor
(
np
.
ones
((
2
,
32
,
32
)),
dtype
=
tf
.
float32
),
tf
.
convert_to_tensor
(
np
.
ones
((
2
,
32
,
32
)),
dtype
=
tf
.
float32
)]
prediction_dict
=
{
'rpn_box_encodings'
:
rpn_box_encodings
,
'rpn_objectness_predictions_with_background'
:
...
...
@@ -717,10 +790,12 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'refined_box_encodings'
:
refined_box_encodings
,
'class_predictions_with_background'
:
class_predictions_with_background
,
'proposal_boxes'
:
proposal_boxes
,
'num_proposals'
:
num_proposals
'num_proposals'
:
num_proposals
,
'mask_predictions'
:
mask_predictions_logits
}
model
.
provide_groundtruth
(
groundtruth_boxes_list
,
groundtruth_classes_list
)
groundtruth_classes_list
,
groundtruth_masks_list
)
loss_dict
=
model
.
loss
(
prediction_dict
)
with
self
.
test_session
()
as
sess
:
...
...
@@ -729,6 +804,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self
.
assertAllClose
(
loss_dict_out
[
'first_stage_objectness_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_localization_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_classification_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_mask_loss'
],
0
)
def
test_loss_full_zero_padded_proposals
(
self
):
model
=
self
.
_build_model
(
...
...
@@ -775,10 +851,23 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[
0
,
0
,
0
],
[
0
,
0
,
0
]],
dtype
=
tf
.
float32
)
mask_predictions_logits
=
20
*
tf
.
ones
((
batch_size
*
model
.
max_num_proposals
,
model
.
num_classes
,
14
,
14
),
dtype
=
tf
.
float32
)
groundtruth_boxes_list
=
[
tf
.
constant
([[
0
,
0
,
.
5
,
.
5
]],
dtype
=
tf
.
float32
)]
groundtruth_classes_list
=
[
tf
.
constant
([[
1
,
0
]],
dtype
=
tf
.
float32
)]
# Set all elements of groundtruth mask to 1.0. In this case all proposal
# crops of the groundtruth masks should return a mask that covers the entire
# proposal. Thus, if all mask_predictions_logits values are large and positive
# (20 here), the mask loss should be approximately zero.
groundtruth_masks_list
=
[
tf
.
convert_to_tensor
(
np
.
ones
((
1
,
32
,
32
)),
dtype
=
tf
.
float32
)]
prediction_dict
=
{
'rpn_box_encodings'
:
rpn_box_encodings
,
'rpn_objectness_predictions_with_background'
:
...
...
@@ -788,10 +877,12 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'refined_box_encodings'
:
refined_box_encodings
,
'class_predictions_with_background'
:
class_predictions_with_background
,
'proposal_boxes'
:
proposal_boxes
,
'num_proposals'
:
num_proposals
'num_proposals'
:
num_proposals
,
'mask_predictions'
:
mask_predictions_logits
}
model
.
provide_groundtruth
(
groundtruth_boxes_list
,
groundtruth_classes_list
)
groundtruth_classes_list
,
groundtruth_masks_list
)
loss_dict
=
model
.
loss
(
prediction_dict
)
with
self
.
test_session
()
as
sess
:
...
...
@@ -800,6 +891,102 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self
.
assertAllClose
(
loss_dict_out
[
'first_stage_objectness_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_localization_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_classification_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_mask_loss'
],
0
)
def
test_loss_full_multiple_label_groundtruth
(
self
):
model
=
self
.
_build_model
(
is_training
=
True
,
first_stage_only
=
False
,
second_stage_batch_size
=
6
,
softmax_second_stage_classification_loss
=
False
)
batch_size
=
1
anchors
=
tf
.
constant
(
[[
0
,
0
,
16
,
16
],
[
0
,
16
,
16
,
32
],
[
16
,
0
,
32
,
16
],
[
16
,
16
,
32
,
32
]],
dtype
=
tf
.
float32
)
rpn_box_encodings
=
tf
.
zeros
(
[
batch_size
,
anchors
.
get_shape
().
as_list
()[
0
],
BOX_CODE_SIZE
],
dtype
=
tf
.
float32
)
# use different numbers for the objectness category to break ties in
# order of boxes returned by NMS
rpn_objectness_predictions_with_background
=
tf
.
constant
([
[[
-
10
,
13
],
[
10
,
-
10
],
[
10
,
-
11
],
[
10
,
-
12
]],],
dtype
=
tf
.
float32
)
image_shape
=
tf
.
constant
([
batch_size
,
32
,
32
,
3
],
dtype
=
tf
.
int32
)
# box_classifier_batch_size is 6, but here we assume that the number of
# actual proposals (not counting zero paddings) is fewer (3).
num_proposals
=
tf
.
constant
([
3
],
dtype
=
tf
.
int32
)
proposal_boxes
=
tf
.
constant
(
[[[
0
,
0
,
16
,
16
],
[
0
,
16
,
16
,
32
],
[
16
,
0
,
32
,
16
],
[
0
,
0
,
0
,
0
],
# begin paddings
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
]]],
dtype
=
tf
.
float32
)
# second_stage_localization_loss should only be computed for predictions
# that match groundtruth. For multiple label groundtruth boxes, the loss
# should only be computed once for the label with the smaller index.
refined_box_encodings
=
tf
.
constant
(
[[[
0
,
0
,
0
,
0
],
[
1
,
1
,
-
1
,
-
1
]],
[[
1
,
1
,
-
1
,
-
1
],
[
1
,
1
,
1
,
1
]],
[[
1
,
1
,
-
1
,
-
1
],
[
1
,
1
,
1
,
1
]],
[[
1
,
1
,
-
1
,
-
1
],
[
1
,
1
,
1
,
1
]],
[[
1
,
1
,
-
1
,
-
1
],
[
1
,
1
,
1
,
1
]],
[[
1
,
1
,
-
1
,
-
1
],
[
1
,
1
,
1
,
1
]]],
dtype
=
tf
.
float32
)
class_predictions_with_background
=
tf
.
constant
(
[[
-
100
,
100
,
100
],
[
100
,
-
100
,
-
100
],
[
100
,
-
100
,
-
100
],
[
0
,
0
,
0
],
# begin paddings
[
0
,
0
,
0
],
[
0
,
0
,
0
]],
dtype
=
tf
.
float32
)
mask_predictions_logits
=
20
*
tf
.
ones
((
batch_size
*
model
.
max_num_proposals
,
model
.
num_classes
,
14
,
14
),
dtype
=
tf
.
float32
)
groundtruth_boxes_list
=
[
tf
.
constant
([[
0
,
0
,
.
5
,
.
5
]],
dtype
=
tf
.
float32
)]
# Box contains two ground truth labels.
groundtruth_classes_list
=
[
tf
.
constant
([[
1
,
1
]],
dtype
=
tf
.
float32
)]
# Set all elements of groundtruth mask to 1.0. In this case all proposal
# crops of the groundtruth masks should return a mask that covers the entire
# proposal. Thus, if all mask_predictions_logits values are large and positive
# (20 here), the mask loss should be approximately zero.
groundtruth_masks_list
=
[
tf
.
convert_to_tensor
(
np
.
ones
((
1
,
32
,
32
)),
dtype
=
tf
.
float32
)]
prediction_dict
=
{
'rpn_box_encodings'
:
rpn_box_encodings
,
'rpn_objectness_predictions_with_background'
:
rpn_objectness_predictions_with_background
,
'image_shape'
:
image_shape
,
'anchors'
:
anchors
,
'refined_box_encodings'
:
refined_box_encodings
,
'class_predictions_with_background'
:
class_predictions_with_background
,
'proposal_boxes'
:
proposal_boxes
,
'num_proposals'
:
num_proposals
,
'mask_predictions'
:
mask_predictions_logits
}
model
.
provide_groundtruth
(
groundtruth_boxes_list
,
groundtruth_classes_list
,
groundtruth_masks_list
)
loss_dict
=
model
.
loss
(
prediction_dict
)
with
self
.
test_session
()
as
sess
:
loss_dict_out
=
sess
.
run
(
loss_dict
)
self
.
assertAllClose
(
loss_dict_out
[
'first_stage_localization_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'first_stage_objectness_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_localization_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_classification_loss'
],
0
)
self
.
assertAllClose
(
loss_dict_out
[
'second_stage_mask_loss'
],
0
)
def
test_loss_full_zero_padded_proposals_nonzero_loss_with_two_images
(
self
):
model
=
self
.
_build_model
(
...
...
@@ -828,7 +1015,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
image_shape
=
tf
.
constant
([
batch_size
,
32
,
32
,
3
],
dtype
=
tf
.
int32
)
# box_classifier_batch_size is 6, but here we assume that the number of
# actual proposals (not counting zero paddings) is fewer
(3)
.
# actual proposals (not counting zero paddings) is fewer.
num_proposals
=
tf
.
constant
([
3
,
2
],
dtype
=
tf
.
int32
)
proposal_boxes
=
tf
.
constant
(
[[[
0
,
0
,
16
,
16
],
...
...
@@ -839,9 +1026,9 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[
0
,
0
,
0
,
0
]],
[[
0
,
0
,
16
,
16
],
[
0
,
16
,
16
,
32
],
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
],
# begin paddings
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
]]],
dtype
=
tf
.
float32
)
refined_box_encodings
=
tf
.
zeros
(
...
...
research/object_detection/meta_architectures/rfcn_meta_arch.py
View file @
ff88581a
...
...
@@ -73,6 +73,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
second_stage_score_conversion_fn
,
second_stage_localization_loss_weight
,
second_stage_classification_loss_weight
,
second_stage_classification_loss
,
hard_example_miner
,
parallel_iterations
=
16
):
"""RFCNMetaArch Constructor.
...
...
@@ -149,6 +150,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
used to convert logits to probabilities.
second_stage_localization_loss_weight: A float
second_stage_classification_loss_weight: A float
second_stage_classification_loss: A string indicating which loss function
to use, supports 'softmax' and 'sigmoid'.
hard_example_miner: A losses.HardExampleMiner object (can be None).
parallel_iterations: (Optional) The number of iterations allowed to run
in parallel for calls to tf.map_fn.
...
...
@@ -185,6 +188,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
second_stage_score_conversion_fn
,
second_stage_localization_loss_weight
,
second_stage_classification_loss_weight
,
second_stage_classification_loss
,
1.0
,
# second stage mask prediction loss weight isn't used in R-FCN.
hard_example_miner
,
parallel_iterations
)
...
...
@@ -198,10 +203,10 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
"""Predicts the output tensors from 2nd stage of FasterRCNN.
Args:
rpn_box_encodings:
3
-D float tensor of shape
rpn_box_encodings:
3
-D float tensor of shape
[batch_size, num_valid_anchors, self._box_coder.code_size] containing
predicted boxes.
rpn_objectness_predictions_with_background:
3
-D float tensor of shape
rpn_objectness_predictions_with_background:
3
-D float tensor of shape
[batch_size, num_valid_anchors, 2] containing class
predictions (logits) for each of the anchors. Note that this
tensor *includes* background class predictions (at class index 0).
...
...
@@ -225,13 +230,22 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
Note that this tensor *includes* background class predictions
(at class index 0).
3) num_proposals: An int32 tensor of shape [batch_size] representing the
number of proposals generated by the RPN.
`num_proposals` allows us
number of proposals generated by the RPN. `num_proposals` allows us
to keep track of which entries are to be treated as zero paddings and
which are not since we always pad the number of proposals to be
`self.max_num_proposals` for each image.
4) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes (in absolute coordinates).
5) proposal_boxes_normalized: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing decoded proposal
bounding boxes (in normalized coordinates). Can be used to override
the boxes proposed by the RPN, thus enabling one to extract box
classification and prediction for externally selected areas of the
image.
6) box_classifier_features: a 4-D float32 tensor, of shape
[batch_size, feature_map_height, feature_map_width, depth],
representing the box classifier features.
"""
proposal_boxes_normalized
,
_
,
num_proposals
=
self
.
_postprocess_rpn
(
rpn_box_encodings
,
rpn_objectness_predictions_with_background
,
...
...
@@ -263,5 +277,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
class_predictions_with_background
,
'num_proposals'
:
num_proposals
,
'proposal_boxes'
:
absolute_proposal_boxes
,
'box_classifier_features'
:
box_classifier_features
,
'proposal_boxes_normalized'
:
proposal_boxes_normalized
,
}
return
prediction_dict
research/object_detection/meta_architectures/rfcn_meta_arch_test.py
View file @
ff88581a
...
...
@@ -51,6 +51,15 @@ class RFCNMetaArchTest(
return
rfcn_meta_arch
.
RFCNMetaArch
(
second_stage_rfcn_box_predictor
=
box_predictor
,
**
common_kwargs
)
def
_get_box_classifier_features_shape
(
self
,
image_size
,
batch_size
,
max_num_proposals
,
initial_crop_size
,
maxpool_stride
,
num_features
):
return
(
batch_size
,
image_size
,
image_size
,
num_features
)
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
research/object_detection/meta_architectures/ssd_meta_arch.py
View file @
ff88581a
...
...
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD Meta-architecture definition.
General tensorflow implementation of convolutional Multibox/SSD detection
...
...
@@ -29,6 +28,7 @@ from object_detection.core import model
from
object_detection.core
import
standard_fields
as
fields
from
object_detection.core
import
target_assigner
from
object_detection.utils
import
shape_utils
from
object_detection.utils
import
visualization_utils
slim
=
tf
.
contrib
.
slim
...
...
@@ -37,13 +37,34 @@ class SSDFeatureExtractor(object):
"""SSD Feature Extractor definition."""
def
__init__
(
self
,
is_training
,
depth_multiplier
,
min_depth
,
pad_to_multiple
,
conv_hyperparams
,
batch_norm_trainable
=
True
,
reuse_weights
=
None
):
"""Constructor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
batch_norm_trainable: Whether to update batch norm parameters during
training or not. When training with a small batch size
(e.g. 1), it is desirable to disable batch norm update and use
pretrained batch norm params.
reuse_weights: whether to reuse variables. Default is None.
"""
self
.
_is_training
=
is_training
self
.
_depth_multiplier
=
depth_multiplier
self
.
_min_depth
=
min_depth
self
.
_pad_to_multiple
=
pad_to_multiple
self
.
_conv_hyperparams
=
conv_hyperparams
self
.
_batch_norm_trainable
=
batch_norm_trainable
self
.
_reuse_weights
=
reuse_weights
@
abstractmethod
...
...
@@ -101,9 +122,9 @@ class SSDMetaArch(model.DetectionModel):
add_summaries
=
True
):
"""SSDMetaArch Constructor.
TODO: group NMS parameters + score converter into
a class and loss
parameters into a class and write config protos for
postprocessing
and losses.
TODO: group NMS parameters + score converter into
a class and loss
parameters into a class and write config protos for
postprocessing
and losses.
Args:
is_training: A boolean indicating whether the training version of the
...
...
@@ -204,8 +225,8 @@ class SSDMetaArch(model.DetectionModel):
if
inputs
.
dtype
is
not
tf
.
float32
:
raise
ValueError
(
'`preprocess` expects a tf.float32 tensor'
)
with
tf
.
name_scope
(
'Preprocessor'
):
# TODO: revisit whether to always use batch size as
the number of
#
parallel
iterations vs allow for dynamic batching.
# TODO: revisit whether to always use batch size as the number of
parallel
# iterations vs allow for dynamic batching.
resized_inputs
=
tf
.
map_fn
(
self
.
_image_resizer_fn
,
elems
=
inputs
,
dtype
=
tf
.
float32
)
...
...
@@ -226,7 +247,7 @@ class SSDMetaArch(model.DetectionModel):
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) box_encodings:
3
-D float tensor of shape [batch_size, num_anchors,
1) box_encodings:
3
-D float tensor of shape [batch_size, num_anchors,
box_code_dimension] containing predicted boxes.
2) class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
...
...
@@ -234,19 +255,26 @@ class SSDMetaArch(model.DetectionModel):
background class predictions (at class index 0).
3) feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i].
4) anchors: 2-D float tensor of shape [num_anchors, 4] containing
the generated anchors in normalized coordinates.
"""
with
tf
.
variable_scope
(
None
,
self
.
_extract_features_scope
,
[
preprocessed_inputs
]):
feature_maps
=
self
.
_feature_extractor
.
extract_features
(
preprocessed_inputs
)
feature_map_spatial_dims
=
self
.
_get_feature_map_spatial_dims
(
feature_maps
)
self
.
_anchors
=
self
.
_anchor_generator
.
generate
(
feature_map_spatial_dims
)
image_shape
=
tf
.
shape
(
preprocessed_inputs
)
self
.
_anchors
=
self
.
_anchor_generator
.
generate
(
feature_map_spatial_dims
,
im_height
=
image_shape
[
1
],
im_width
=
image_shape
[
2
])
(
box_encodings
,
class_predictions_with_background
)
=
self
.
_add_box_predictions_to_feature_maps
(
feature_maps
)
predictions_dict
=
{
'box_encodings'
:
box_encodings
,
'class_predictions_with_background'
:
class_predictions_with_background
,
'feature_maps'
:
feature_maps
'feature_maps'
:
feature_maps
,
'anchors'
:
self
.
_anchors
.
get
()
}
return
predictions_dict
...
...
@@ -351,9 +379,11 @@ class SSDMetaArch(model.DetectionModel):
Returns:
detections: a dictionary containing the following fields
detection_boxes: [batch, max_detection, 4]
detection_boxes: [batch, max_detection
s
, 4]
detection_scores: [batch, max_detections]
detection_classes: [batch, max_detections]
detection_keypoints: [batch, max_detections, num_keypoints, 2] (if
encoded in the prediction_dict 'box_encodings')
num_detections: [batch]
Raises:
ValueError: if prediction_dict does not contain `box_encodings` or
...
...
@@ -365,7 +395,7 @@ class SSDMetaArch(model.DetectionModel):
with
tf
.
name_scope
(
'Postprocessor'
):
box_encodings
=
prediction_dict
[
'box_encodings'
]
class_predictions
=
prediction_dict
[
'class_predictions_with_background'
]
detection_boxes
=
self
.
_batch_decode
(
box_encodings
)
detection_boxes
,
detection_keypoints
=
self
.
_batch_decode
(
box_encodings
)
detection_boxes
=
tf
.
expand_dims
(
detection_boxes
,
axis
=
2
)
class_predictions_without_background
=
tf
.
slice
(
class_predictions
,
...
...
@@ -374,14 +404,25 @@ class SSDMetaArch(model.DetectionModel):
detection_scores
=
self
.
_score_conversion_fn
(
class_predictions_without_background
)
clip_window
=
tf
.
constant
([
0
,
0
,
1
,
1
],
tf
.
float32
)
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
_
,
num_detections
)
=
self
.
_non_max_suppression_fn
(
detection_boxes
,
detection_scores
,
clip_window
=
clip_window
)
return
{
'detection_boxes'
:
nmsed_boxes
,
'detection_scores'
:
nmsed_scores
,
'detection_classes'
:
nmsed_classes
,
'num_detections'
:
tf
.
to_float
(
num_detections
)}
additional_fields
=
None
if
detection_keypoints
is
not
None
:
additional_fields
=
{
fields
.
BoxListFields
.
keypoints
:
detection_keypoints
}
(
nmsed_boxes
,
nmsed_scores
,
nmsed_classes
,
_
,
nmsed_additional_fields
,
num_detections
)
=
self
.
_non_max_suppression_fn
(
detection_boxes
,
detection_scores
,
clip_window
=
clip_window
,
additional_fields
=
additional_fields
)
detection_dict
=
{
'detection_boxes'
:
nmsed_boxes
,
'detection_scores'
:
nmsed_scores
,
'detection_classes'
:
nmsed_classes
,
'num_detections'
:
tf
.
to_float
(
num_detections
)}
if
(
nmsed_additional_fields
is
not
None
and
fields
.
BoxListFields
.
keypoints
in
nmsed_additional_fields
):
detection_dict
[
'detection_keypoints'
]
=
nmsed_additional_fields
[
fields
.
BoxListFields
.
keypoints
]
return
detection_dict
def
loss
(
self
,
prediction_dict
,
scope
=
None
):
"""Compute scalar loss tensors with respect to provided groundtruth.
...
...
@@ -395,7 +436,7 @@ class SSDMetaArch(model.DetectionModel):
box_code_dimension] containing predicted boxes.
2) class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors.
Note that this tensor *includes*
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions.
scope: Optional scope name.
...
...
@@ -405,10 +446,14 @@ class SSDMetaArch(model.DetectionModel):
values.
"""
with
tf
.
name_scope
(
scope
,
'Loss'
,
prediction_dict
.
values
()):
keypoints
=
None
if
self
.
groundtruth_has_field
(
fields
.
BoxListFields
.
keypoints
):
keypoints
=
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
keypoints
)
(
batch_cls_targets
,
batch_cls_weights
,
batch_reg_targets
,
batch_reg_weights
,
match_list
)
=
self
.
_assign_targets
(
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
boxes
),
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
classes
))
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
classes
),
keypoints
)
if
self
.
_add_summaries
:
self
.
_summarize_input
(
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
boxes
),
match_list
)
...
...
@@ -417,35 +462,60 @@ class SSDMetaArch(model.DetectionModel):
location_losses
=
self
.
_localization_loss
(
prediction_dict
[
'box_encodings'
],
batch_reg_targets
,
ignore_nan_targets
=
True
,
weights
=
batch_reg_weights
)
cls_losses
=
self
.
_classification_loss
(
prediction_dict
[
'class_predictions_with_background'
],
batch_cls_targets
,
weights
=
batch_cls_weights
)
# Optionally apply hard mining on top of loss values
localization_loss
=
tf
.
reduce_sum
(
location_losses
)
classification_loss
=
tf
.
reduce_sum
(
cls_losses
)
if
self
.
_hard_example_miner
:
(
localization_loss
,
classification_loss
)
=
self
.
_apply_hard_mining
(
location_losses
,
cls_losses
,
prediction_dict
,
match_list
)
if
self
.
_add_summaries
:
self
.
_hard_example_miner
.
summarize
()
else
:
if
self
.
_add_summaries
:
class_ids
=
tf
.
argmax
(
batch_cls_targets
,
axis
=
2
)
flattened_class_ids
=
tf
.
reshape
(
class_ids
,
[
-
1
])
flattened_classification_losses
=
tf
.
reshape
(
cls_losses
,
[
-
1
])
self
.
_summarize_anchor_classification_loss
(
flattened_class_ids
,
flattened_classification_losses
)
localization_loss
=
tf
.
reduce_sum
(
location_losses
)
classification_loss
=
tf
.
reduce_sum
(
cls_losses
)
# Optionally normalize by number of positive matches
normalizer
=
tf
.
constant
(
1.0
,
dtype
=
tf
.
float32
)
if
self
.
_normalize_loss_by_num_matches
:
normalizer
=
tf
.
maximum
(
tf
.
to_float
(
tf
.
reduce_sum
(
num_matches
)),
1.0
)
with
tf
.
name_scope
(
'localization_loss'
):
localization_loss
=
((
self
.
_localization_loss_weight
/
normalizer
)
*
localization_loss
)
with
tf
.
name_scope
(
'classification_loss'
):
classification_loss
=
((
self
.
_classification_loss_weight
/
normalizer
)
*
classification_loss
)
loss_dict
=
{
'localization_loss'
:
(
self
.
_localization_loss_weight
/
normalizer
)
*
localization_loss
,
'classification_loss'
:
(
self
.
_classification_loss_weight
/
normalizer
)
*
classification_loss
'localization_loss'
:
localization_loss
,
'classification_loss'
:
classification_loss
}
return
loss_dict
def
_assign_targets
(
self
,
groundtruth_boxes_list
,
groundtruth_classes_list
):
def
_summarize_anchor_classification_loss
(
self
,
class_ids
,
cls_losses
):
positive_indices
=
tf
.
where
(
tf
.
greater
(
class_ids
,
0
))
positive_anchor_cls_loss
=
tf
.
squeeze
(
tf
.
gather
(
cls_losses
,
positive_indices
),
axis
=
1
)
visualization_utils
.
add_cdf_image_summary
(
positive_anchor_cls_loss
,
'PositiveAnchorLossCDF'
)
negative_indices
=
tf
.
where
(
tf
.
equal
(
class_ids
,
0
))
negative_anchor_cls_loss
=
tf
.
squeeze
(
tf
.
gather
(
cls_losses
,
negative_indices
),
axis
=
1
)
visualization_utils
.
add_cdf_image_summary
(
negative_anchor_cls_loss
,
'NegativeAnchorLossCDF'
)
def
_assign_targets
(
self
,
groundtruth_boxes_list
,
groundtruth_classes_list
,
groundtruth_keypoints_list
=
None
):
"""Assign groundtruth targets.
Adds a background class to each one-hot encoding of groundtruth classes
...
...
@@ -460,6 +530,8 @@ class SSDMetaArch(model.DetectionModel):
groundtruth_classes_list: a list of 2-D one-hot (or k-hot) tensors of
shape [num_boxes, num_classes] containing the class targets with the 0th
index assumed to map to the first non-background class.
groundtruth_keypoints_list: (optional) a list of 3-D tensors of shape
[num_boxes, num_keypoints, 2]
Returns:
batch_cls_targets: a tensor with shape [batch_size, num_anchors,
...
...
@@ -480,6 +552,10 @@ class SSDMetaArch(model.DetectionModel):
tf
.
pad
(
one_hot_encoding
,
[[
0
,
0
],
[
1
,
0
]],
mode
=
'CONSTANT'
)
for
one_hot_encoding
in
groundtruth_classes_list
]
if
groundtruth_keypoints_list
is
not
None
:
for
boxlist
,
keypoints
in
zip
(
groundtruth_boxlists
,
groundtruth_keypoints_list
):
boxlist
.
add_field
(
fields
.
BoxListFields
.
keypoints
,
keypoints
)
return
target_assigner
.
batch_assign_targets
(
self
.
_target_assigner
,
self
.
anchors
,
groundtruth_boxlists
,
groundtruth_classes_with_background_list
)
...
...
@@ -544,12 +620,11 @@ class SSDMetaArch(model.DetectionModel):
mined_cls_loss: a float scalar with sum of classification losses from
selected hard examples.
"""
class_pred_shape
=
[
-
1
,
self
.
anchors
.
num_boxes_static
(),
self
.
num_classes
]
class_predictions
=
tf
.
reshape
(
tf
.
slice
(
prediction_dict
[
'class_predictions_with_background'
],
[
0
,
0
,
1
],
class_pred_shape
),
class_pred_shape
)
class_predictions
=
tf
.
slice
(
prediction_dict
[
'class_predictions_with_background'
],
[
0
,
0
,
1
],
[
-
1
,
-
1
,
-
1
])
decoded_boxes
=
self
.
_batch_decode
(
prediction_dict
[
'box_encodings'
])
decoded_boxes
,
_
=
self
.
_batch_decode
(
prediction_dict
[
'box_encodings'
])
decoded_box_tensors_list
=
tf
.
unstack
(
decoded_boxes
)
class_prediction_list
=
tf
.
unstack
(
class_predictions
)
decoded_boxlist_list
=
[]
...
...
@@ -574,6 +649,9 @@ class SSDMetaArch(model.DetectionModel):
Returns:
decoded_boxes: A float32 tensor of shape
[batch_size, num_anchors, 4] containing the decoded boxes.
decoded_keypoints: A float32 tensor of shape
[batch_size, num_anchors, num_keypoints, 2] containing the decoded
keypoints if present in the input `box_encodings`, None otherwise.
"""
combined_shape
=
shape_utils
.
combined_static_and_dynamic_shape
(
box_encodings
)
...
...
@@ -581,13 +659,21 @@ class SSDMetaArch(model.DetectionModel):
tiled_anchor_boxes
=
tf
.
tile
(
tf
.
expand_dims
(
self
.
anchors
.
get
(),
0
),
[
batch_size
,
1
,
1
])
tiled_anchors_boxlist
=
box_list
.
BoxList
(
tf
.
reshape
(
tiled_anchor_boxes
,
[
-
1
,
self
.
_box_coder
.
code_size
]))
tf
.
reshape
(
tiled_anchor_boxes
,
[
-
1
,
4
]))
decoded_boxes
=
self
.
_box_coder
.
decode
(
tf
.
reshape
(
box_encodings
,
[
-
1
,
self
.
_box_coder
.
code_size
]),
tiled_anchors_boxlist
)
return
tf
.
reshape
(
decoded_boxes
.
get
(),
tf
.
stack
([
combined_shape
[
0
],
combined_shape
[
1
],
4
]))
decoded_keypoints
=
None
if
decoded_boxes
.
has_field
(
fields
.
BoxListFields
.
keypoints
):
decoded_keypoints
=
decoded_boxes
.
get_field
(
fields
.
BoxListFields
.
keypoints
)
num_keypoints
=
decoded_keypoints
.
get_shape
()[
1
]
decoded_keypoints
=
tf
.
reshape
(
decoded_keypoints
,
tf
.
stack
([
combined_shape
[
0
],
combined_shape
[
1
],
num_keypoints
,
2
]))
decoded_boxes
=
tf
.
reshape
(
decoded_boxes
.
get
(),
tf
.
stack
(
[
combined_shape
[
0
],
combined_shape
[
1
],
4
]))
return
decoded_boxes
,
decoded_keypoints
def
restore_map
(
self
,
from_detection_checkpoint
=
True
):
"""Returns a map of variables to load from a foreign checkpoint.
...
...
research/object_detection/meta_architectures/ssd_meta_arch_test.py
View file @
ff88581a
...
...
@@ -18,7 +18,6 @@ import functools
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.training
import
saver
as
tf_saver
from
object_detection.core
import
anchor_generator
from
object_detection.core
import
box_list
from
object_detection.core
import
losses
...
...
@@ -34,7 +33,12 @@ class FakeSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
def
__init__
(
self
):
super
(
FakeSSDFeatureExtractor
,
self
).
__init__
(
depth_multiplier
=
0
,
min_depth
=
0
,
conv_hyperparams
=
None
)
is_training
=
True
,
depth_multiplier
=
0
,
min_depth
=
0
,
pad_to_multiple
=
1
,
batch_norm_trainable
=
True
,
conv_hyperparams
=
None
)
def
preprocess
(
self
,
resized_inputs
):
return
tf
.
identity
(
resized_inputs
)
...
...
@@ -55,7 +59,7 @@ class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator):
def
num_anchors_per_location
(
self
):
return
[
1
]
def
_generate
(
self
,
feature_map_shape_list
):
def
_generate
(
self
,
feature_map_shape_list
,
im_height
,
im_width
):
return
box_list
.
BoxList
(
tf
.
constant
([[
0
,
0
,
.
5
,
.
5
],
[
0
,
.
5
,
.
5
,
1
],
...
...
@@ -147,6 +151,7 @@ class SsdMetaArchTest(tf.test.TestCase):
self
.
assertTrue
(
'box_encodings'
in
prediction_dict
)
self
.
assertTrue
(
'class_predictions_with_background'
in
prediction_dict
)
self
.
assertTrue
(
'feature_maps'
in
prediction_dict
)
self
.
assertTrue
(
'anchors'
in
prediction_dict
)
init_op
=
tf
.
global_variables_initializer
()
with
self
.
test_session
(
graph
=
tf_graph
)
as
sess
:
...
...
@@ -242,7 +247,7 @@ class SsdMetaArchTest(tf.test.TestCase):
def
test_restore_map_for_detection_ckpt
(
self
):
init_op
=
tf
.
global_variables_initializer
()
saver
=
tf
_saver
.
Saver
()
saver
=
tf
.
train
.
Saver
()
save_path
=
self
.
get_temp_dir
()
with
self
.
test_session
()
as
sess
:
sess
.
run
(
init_op
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment