Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
0225b135
Unverified
Commit
0225b135
authored
Mar 05, 2022
by
Srihari Humbarwadi
Committed by
GitHub
Mar 05, 2022
Browse files
Merge branch 'tensorflow:master' into panoptic-deeplab-modeling
parents
7479dbb8
4c571a3c
Changes
332
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
5293 additions
and
0 deletions
+5293
-0
official/vision/modeling/layers/nn_blocks_test.py
official/vision/modeling/layers/nn_blocks_test.py
+341
-0
official/vision/modeling/layers/nn_layers.py
official/vision/modeling/layers/nn_layers.py
+1277
-0
official/vision/modeling/layers/nn_layers_test.py
official/vision/modeling/layers/nn_layers_test.py
+419
-0
official/vision/modeling/layers/roi_aligner.py
official/vision/modeling/layers/roi_aligner.py
+72
-0
official/vision/modeling/layers/roi_aligner_test.py
official/vision/modeling/layers/roi_aligner_test.py
+42
-0
official/vision/modeling/layers/roi_generator.py
official/vision/modeling/layers/roi_generator.py
+313
-0
official/vision/modeling/layers/roi_sampler.py
official/vision/modeling/layers/roi_sampler.py
+175
-0
official/vision/modeling/maskrcnn_model.py
official/vision/modeling/maskrcnn_model.py
+429
-0
official/vision/modeling/maskrcnn_model_test.py
official/vision/modeling/maskrcnn_model_test.py
+398
-0
official/vision/modeling/retinanet_model.py
official/vision/modeling/retinanet_model.py
+216
-0
official/vision/modeling/retinanet_model_test.py
official/vision/modeling/retinanet_model_test.py
+314
-0
official/vision/modeling/segmentation_model.py
official/vision/modeling/segmentation_model.py
+94
-0
official/vision/modeling/segmentation_model_test.py
official/vision/modeling/segmentation_model_test.py
+86
-0
official/vision/modeling/video_classification_model.py
official/vision/modeling/video_classification_model.py
+128
-0
official/vision/modeling/video_classification_model_test.py
official/vision/modeling/video_classification_model_test.py
+92
-0
official/vision/ops/__init__.py
official/vision/ops/__init__.py
+14
-0
official/vision/ops/anchor.py
official/vision/ops/anchor.py
+378
-0
official/vision/ops/anchor_generator.py
official/vision/ops/anchor_generator.py
+182
-0
official/vision/ops/anchor_generator_test.py
official/vision/ops/anchor_generator_test.py
+137
-0
official/vision/ops/anchor_test.py
official/vision/ops/anchor_test.py
+186
-0
No files found.
official/vision/modeling/layers/nn_blocks_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_blocks."""
from
typing
import
Any
,
Iterable
,
Tuple
# Import libraries
from
absl.testing
import
parameterized
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.distribute
import
strategy_combinations
from
official.vision.modeling.layers
import
nn_blocks
def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
  """Returns the combinations of end-to-end tests to run."""
  # Each returned combination runs the test under a different tf.distribute
  # strategy: default (no distribution), Cloud TPU, and single GPU.
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies,)
class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
  """Output-shape tests for the convolutional blocks in `nn_blocks`."""

  @parameterized.parameters(
      (nn_blocks.ResidualBlock, 1, False, 0.0, None),
      (nn_blocks.ResidualBlock, 2, True, 0.2, 0.25),
  )
  def test_residual_block_creation(self, block_fn, strides, use_projection,
                                   stochastic_depth_drop_rate, se_ratio):
    input_size = 128
    filter_size = 256
    inputs = tf.keras.Input(
        shape=(input_size, input_size, filter_size), batch_size=1)
    block = block_fn(
        filter_size,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate,
    )

    features = block(inputs)

    # Spatial dims shrink by `strides`; channel count is preserved.
    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, filter_size],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.BottleneckBlock, 1, False, 0.0, None),
      (nn_blocks.BottleneckBlock, 2, True, 0.2, 0.25),
  )
  def test_bottleneck_block_creation(self, block_fn, strides, use_projection,
                                     stochastic_depth_drop_rate, se_ratio):
    input_size = 128
    filter_size = 256
    # Bottleneck blocks expand the output channels by 4x.
    inputs = tf.keras.Input(
        shape=(input_size, input_size, filter_size * 4), batch_size=1)
    block = block_fn(
        filter_size,
        strides,
        use_projection=use_projection,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, filter_size * 4],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 6, 1, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 2, None, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None),
      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2),
  )
  def test_invertedbottleneck_block_creation(self, block_fn, expand_ratio,
                                             strides, se_ratio,
                                             stochastic_depth_drop_rate):
    input_size = 128
    in_filters = 24
    out_filters = 40
    inputs = tf.keras.Input(
        shape=(input_size, input_size, in_filters), batch_size=1)
    block = block_fn(
        in_filters=in_filters,
        out_filters=out_filters,
        expand_ratio=expand_ratio,
        strides=strides,
        se_ratio=se_ratio,
        stochastic_depth_drop_rate=stochastic_depth_drop_rate)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, out_filters],
        features.shape.as_list())

  @parameterized.parameters(
      (nn_blocks.TuckerConvBlock, 1, 0.25, 0.25),
      (nn_blocks.TuckerConvBlock, 2, 0.25, 0.25),
  )
  def test_tucker_conv_block(self, block_fn, strides, input_compression_ratio,
                             output_compression_ratio):
    input_size = 128
    in_filters = 24
    out_filters = 24
    inputs = tf.keras.Input(
        shape=(input_size, input_size, in_filters), batch_size=1)
    block = block_fn(
        in_filters=in_filters,
        out_filters=out_filters,
        input_compression_ratio=input_compression_ratio,
        output_compression_ratio=output_compression_ratio,
        strides=strides)

    features = block(inputs)

    self.assertAllEqual(
        [1, input_size // strides, input_size // strides, out_filters],
        features.shape.as_list())
class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
  """Shape tests for `nn_blocks.ResidualInner` under distribution strategies."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      test_layer = nn_blocks.ResidualInner(filters, strides)

    output = test_layer(input_tensor)
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    self.assertEqual(expected_output_shape, output.shape.as_list())
class BottleneckResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
  """Shape tests for `nn_blocks.BottleneckResidualInner`."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      test_layer = nn_blocks.BottleneckResidualInner(filters, strides)

    output = test_layer(input_tensor)
    # Bottleneck variant expands channels by 4x.
    expected_output_shape = [bsz, h // strides, w // strides, filters * 4]
    self.assertEqual(expected_output_shape, output.shape.as_list())
class DepthwiseSeparableConvBlockTest(parameterized.TestCase, tf.test.TestCase):
  """Shape and config round-trip tests for DepthwiseSeparableConvBlock."""

  @combinations.generate(distribution_strategy_combinations())
  def test_shape(self, distribution):
    batch_size, height, width, num_channels = 8, 32, 32, 32
    num_filters = 64
    strides = 2

    input_tensor = tf.random.normal(
        shape=[batch_size, height, width, num_channels])
    with distribution.scope():
      block = nn_blocks.DepthwiseSeparableConvBlock(
          num_filters, strides=strides)
      # Rebuild the block from its own config to verify serializability.
      config_dict = block.get_config()
      recreate_block = nn_blocks.DepthwiseSeparableConvBlock(**config_dict)

    output_tensor = block(input_tensor)
    expected_output_shape = [
        batch_size, height // strides, width // strides, num_filters
    ]
    self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)

    output_tensor = recreate_block(input_tensor)
    self.assertEqual(output_tensor.shape.as_list(), expected_output_shape)
class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase):
  """Forward/backward tests for `nn_blocks.ReversibleLayer`."""

  @combinations.generate(distribution_strategy_combinations())
  def test_downsampling_non_reversible_step(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = 64
    strides = 2

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      f = nn_blocks.ResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=True)
      g = nn_blocks.ResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=True)
      test_layer = nn_blocks.ReversibleLayer(f, g)
      test_layer.build(input_tensor.shape)
      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def step_fn():
      with tf.GradientTape() as tape:
        output = test_layer(input_tensor, training=True)
      grads = tape.gradient(output, test_layer.trainable_variables)
      # Test applying gradients with optimizer works
      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
      return output

    replica_output = distribution.run(step_fn)
    outputs = distribution.experimental_local_results(replica_output)

    # Assert forward pass shape
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    for output in outputs:
      self.assertEqual(expected_output_shape, output.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_reversible_step(self, distribution):
    # Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter
    bsz, h, w, c = 8, 32, 32, 32
    filters = c
    strides = 1

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
    with distribution.scope():
      f = nn_blocks.ResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g = nn_blocks.ResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      test_layer = nn_blocks.ReversibleLayer(f, g)
      test_layer(input_tensor, training=False)  # init weights
      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

    @tf.function
    def step_fn():
      with tf.GradientTape() as tape:
        output = test_layer(input_tensor, training=True)
      grads = tape.gradient(output, test_layer.trainable_variables)
      # Test applying gradients with optimizer works
      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
      return output

    @tf.function
    def fwd():
      test_layer(input_tensor)

    distribution.run(fwd)  # Initialize variables
    prev_variables = tf.identity_n(test_layer.trainable_variables)
    replica_output = distribution.run(step_fn)
    outputs = distribution.experimental_local_results(replica_output)

    # Assert variables values have changed values
    for v0, v1 in zip(prev_variables, test_layer.trainable_variables):
      self.assertNotAllEqual(v0, v1)

    # Assert forward pass shape
    expected_output_shape = [bsz, h // strides, w // strides, filters]
    for output in outputs:
      self.assertEqual(expected_output_shape, output.shape.as_list())

  @combinations.generate(distribution_strategy_combinations())
  def test_manual_gradients_correctness(self, distribution):
    bsz, h, w, c = 8, 32, 32, 32
    filters = c
    strides = 1

    input_tensor = tf.random.uniform(shape=[bsz, h, w, c * 4])  # bottleneck
    with distribution.scope():
      f_manual = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g_manual = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual)
      manual_grad_layer(input_tensor, training=False)  # init weights

      f_auto = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=strides, batch_norm_first=False)
      g_auto = nn_blocks.BottleneckResidualInner(
          filters=filters // 2, strides=1, batch_norm_first=False)
      auto_grad_layer = nn_blocks.ReversibleLayer(
          f_auto, g_auto, manual_grads=False)
      auto_grad_layer(input_tensor)  # init weights
      # Clone all weights (tf.keras.layers.Layer has no .clone())
      auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights())
      auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights())

    @tf.function
    def manual_fn():
      with tf.GradientTape() as tape:
        output = manual_grad_layer(input_tensor, training=True)
      grads = tape.gradient(output, manual_grad_layer.trainable_variables)
      return grads

    @tf.function
    def auto_fn():
      with tf.GradientTape() as tape:
        output = auto_grad_layer(input_tensor, training=True)
      grads = tape.gradient(output, auto_grad_layer.trainable_variables)
      return grads

    manual_grads = distribution.run(manual_fn)
    auto_grads = distribution.run(auto_fn)

    # Assert gradients calculated manually are close to that from autograd
    for manual_grad, auto_grad in zip(manual_grads, auto_grads):
      self.assertAllClose(
          distribution.experimental_local_results(manual_grad),
          distribution.experimental_local_results(auto_grad),
          atol=5e-3,
          rtol=5e-3)

    # Verify that BN moving mean and variance is correct.
    for manual_var, auto_var in zip(manual_grad_layer.non_trainable_variables,
                                    auto_grad_layer.non_trainable_variables):
      self.assertAllClose(manual_var, auto_var)
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/layers/nn_layers.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common building blocks for neural networks."""
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Mapping
,
Optional
,
Tuple
,
Union
from
absl
import
logging
import
tensorflow
as
tf
import
tensorflow_addons
as
tfa
from
official.modeling
import
tf_utils
from
official.vision.ops
import
spatial_transform_ops
# Type annotations.
# `States` maps a state name to its tensor value (used by streaming layers);
# `Activation` is either an activation name or a callable activation function.
States = Dict[str, tf.Tensor]
Activation = Union[str, Callable]
def make_divisible(value: float,
                   divisor: int,
                   min_value: Optional[float] = None,
                   round_down_protect: bool = True,
                   ) -> int:
  """This is to ensure that all layers have channels that are divisible by 8.

  Args:
    value: A `float` of original value.
    divisor: An `int` of the divisor that need to be checked upon.
    min_value: A `float` of minimum value threshold.
    round_down_protect: A `bool` indicating whether round down more than 10%
      will be allowed.

  Returns:
    The adjusted value in `int` that is divisible against divisor.
  """
  if min_value is None:
    min_value = divisor
  # Round to the nearest multiple of `divisor`, never below `min_value`.
  rounded = int(value + divisor / 2) // divisor * divisor
  new_value = max(min_value, rounded)
  # Make sure that round down does not go down by more than 10%.
  if round_down_protect and new_value < 0.9 * value:
    new_value += divisor
  return int(new_value)
def round_filters(filters: int,
                  multiplier: float,
                  divisor: int = 8,
                  min_depth: Optional[int] = None,
                  round_down_protect: bool = True,
                  skip: bool = False) -> int:
  """Rounds number of filters based on width multiplier.

  Args:
    filters: An `int` of the original filter count.
    multiplier: A `float` width multiplier; falsy values (0/None) disable
      scaling.
    divisor: An `int` the result must be divisible by.
    min_depth: An optional `int` lower bound on the result.
    round_down_protect: A `bool`; if True, avoid rounding down by more
      than 10%.
    skip: A `bool`; if True, return `filters` unchanged.

  Returns:
    The scaled filter count as an `int`.
  """
  orig_f = filters
  # No scaling requested: return the original count untouched.
  if skip or not multiplier:
    return filters

  new_filters = make_divisible(
      value=filters * multiplier,
      divisor=divisor,
      min_value=min_depth,
      round_down_protect=round_down_protect)
  logging.info('round_filter input=%s output=%s', orig_f, new_filters)
  return int(new_filters)
def get_padding_for_kernel_size(kernel_size):
  """Compute padding size given kernel size."""
  # Only the two kernel sizes used by this codebase are supported.
  known_paddings = {7: (3, 3), 3: (1, 1)}
  if kernel_size not in known_paddings:
    raise ValueError(
        'Padding for kernel size {} not known.'.format(kernel_size))
  return known_paddings[kernel_size]
@tf.keras.utils.register_keras_serializable(package='Vision')
class SqueezeExcitation(tf.keras.layers.Layer):
  """Creates a squeeze and excitation layer."""

  def __init__(self,
               in_filters,
               out_filters,
               se_ratio,
               divisible_by=1,
               use_3d_input=False,
               kernel_initializer='VarianceScaling',
               kernel_regularizer=None,
               bias_regularizer=None,
               activation='relu',
               gating_activation='sigmoid',
               round_down_protect=True,
               **kwargs):
    """Initializes a squeeze and excitation layer.

    Args:
      in_filters: An `int` number of filters of the input tensor.
      out_filters: An `int` number of filters of the output tensor.
      se_ratio: A `float` or None. If not None, se ratio for the squeeze and
        excitation layer.
      divisible_by: An `int` that ensures all inner dimensions are divisible by
        this number.
      use_3d_input: A `bool` of whether input is 2D or 3D image.
      kernel_initializer: A `str` of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d.
        Default to None.
      activation: A `str` name of the activation function.
      gating_activation: A `str` name of the activation function for final
        gating function.
      round_down_protect: A `bool` of whether round down more than 10% will be
        allowed.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(SqueezeExcitation, self).__init__(**kwargs)

    self._in_filters = in_filters
    self._out_filters = out_filters
    self._se_ratio = se_ratio
    self._divisible_by = divisible_by
    self._round_down_protect = round_down_protect
    self._use_3d_input = use_3d_input
    self._activation = activation
    self._gating_activation = gating_activation
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    # Spatial axes to average over depend on data format and 2D vs 3D input.
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._spatial_axis = [1, 2] if not use_3d_input else [1, 2, 3]
    else:
      self._spatial_axis = [2, 3] if not use_3d_input else [2, 3, 4]
    self._activation_fn = tf_utils.get_activation(activation)
    self._gating_activation_fn = tf_utils.get_activation(gating_activation)

  def build(self, input_shape):
    # Bottleneck width of the squeeze step, kept divisible for hardware.
    num_reduced_filters = make_divisible(
        max(1, int(self._in_filters * self._se_ratio)),
        divisor=self._divisible_by,
        round_down_protect=self._round_down_protect)

    self._se_reduce = tf.keras.layers.Conv2D(
        filters=num_reduced_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=True,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    self._se_expand = tf.keras.layers.Conv2D(
        filters=self._out_filters,
        kernel_size=1,
        strides=1,
        padding='same',
        use_bias=True,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)

    super(SqueezeExcitation, self).build(input_shape)

  def get_config(self):
    config = {
        'in_filters': self._in_filters,
        'out_filters': self._out_filters,
        'se_ratio': self._se_ratio,
        'divisible_by': self._divisible_by,
        'use_3d_input': self._use_3d_input,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_regularizer': self._bias_regularizer,
        'activation': self._activation,
        'gating_activation': self._gating_activation,
        'round_down_protect': self._round_down_protect,
    }
    base_config = super(SqueezeExcitation, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    # Squeeze: global average pool over the spatial axes.
    x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
    # Excite: reduce -> activation -> expand -> gating.
    x = self._activation_fn(self._se_reduce(x))
    x = self._gating_activation_fn(self._se_expand(x))
    # Rescale the input channel-wise by the learned gates.
    return x * inputs
def get_stochastic_depth_rate(init_rate, i, n):
  """Get drop connect rate for the ith block.

  Args:
    init_rate: A `float` of initial drop rate.
    i: An `int` of order of the current block.
    n: An `int` total number of blocks.

  Returns:
    Drop rate of the ith block.
  """
  if init_rate is None:
    return None
  if init_rate < 0 or init_rate > 1:
    raise ValueError('Initial drop rate must be within 0 and 1.')
  # Linearly scale the rate with block depth.
  return init_rate * float(i) / n
@tf.keras.utils.register_keras_serializable(package='Vision')
class StochasticDepth(tf.keras.layers.Layer):
  """Creates a stochastic depth layer."""

  def __init__(self, stochastic_depth_drop_rate, **kwargs):
    """Initializes a stochastic depth layer.

    Args:
      stochastic_depth_drop_rate: A `float` of drop rate.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      A output `tf.Tensor` of which should have the same shape as input.
    """
    super(StochasticDepth, self).__init__(**kwargs)
    self._drop_rate = stochastic_depth_drop_rate

  def get_config(self):
    # NOTE: the config key must match the __init__ argument name so that
    # `from_config(config)` (which calls `cls(**config)`) round-trips; the
    # previous key 'drop_rate' broke deserialization with a TypeError.
    config = {'stochastic_depth_drop_rate': self._drop_rate}
    base_config = super(StochasticDepth, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, training=None):
    # Fall back to the Keras learning phase when `training` is not given.
    if training is None:
      training = tf.keras.backend.learning_phase()
    # Identity at inference time or when dropping is disabled.
    if not training or self._drop_rate is None or self._drop_rate == 0:
      return inputs

    keep_prob = 1.0 - self._drop_rate
    batch_size = tf.shape(inputs)[0]
    # One Bernoulli(keep_prob) sample per example, broadcast over all
    # remaining dims: floor(keep_prob + U[0,1)) is 1 with prob keep_prob.
    random_tensor = keep_prob
    random_tensor += tf.random.uniform(
        [batch_size] + [1] * (inputs.shape.rank - 1), dtype=inputs.dtype)
    binary_tensor = tf.floor(random_tensor)
    # Scale kept examples by 1/keep_prob so the expectation is unchanged.
    output = tf.math.divide(inputs, keep_prob) * binary_tensor
    return output
@tf.keras.utils.register_keras_serializable(package='Vision')
def pyramid_feature_fusion(inputs, target_level):
  """Fuses all feature maps in the feature pyramid at the target level.

  Args:
    inputs: A dictionary containing the feature pyramid. The size of the input
      tensor needs to be fixed.
    target_level: An `int` of the target feature level for feature fusion.

  Returns:
    A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
      feature_channel].
  """
  # Convert keys to int.
  pyramid_feats = {int(k): v for k, v in inputs.items()}
  min_level = min(pyramid_feats.keys())
  max_level = max(pyramid_feats.keys())

  resampled_feats = []
  for l in range(min_level, max_level + 1):
    if l == target_level:
      resampled_feats.append(pyramid_feats[l])
      continue
    feat = pyramid_feats[l]
    # Each level up/down corresponds to a 2x change in spatial size.
    target_size = list(feat.shape[1:3])
    target_size[0] *= 2**(l - target_level)
    target_size[1] *= 2**(l - target_level)
    # Casts feat to float32 so the resize op can be run on TPU.
    feat = tf.cast(feat, tf.float32)
    feat = tf.image.resize(
        feat, size=target_size, method=tf.image.ResizeMethod.BILINEAR)
    # Casts it back to be compatible with the rest of the operations.
    feat = tf.cast(feat, pyramid_feats[l].dtype)
    resampled_feats.append(feat)

  return tf.math.add_n(resampled_feats)
class PanopticFPNFusion(tf.keras.Model):
  """Creates a Panoptic FPN feature Fusion layer.

  This implements feature fusion for semantic segmentation head from the paper:
  Alexander Kirillov, Ross Girshick, Kaiming He and Piotr Dollar.
  Panoptic Feature Pyramid Networks.
  (https://arxiv.org/pdf/1901.02446.pdf)
  """

  def __init__(
      self,
      min_level: int = 2,
      max_level: int = 5,
      target_level: int = 2,
      num_filters: int = 128,
      num_fpn_filters: int = 256,
      activation: str = 'relu',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes panoptic FPN feature fusion layer.

    Args:
      min_level: An `int` of minimum level to use in feature fusion.
      max_level: An `int` of maximum level to use in feature fusion.
      target_level: An `int` of the target feature level for feature fusion.
      num_filters: An `int` number of filters in conv2d layers.
      num_fpn_filters: An `int` number of filters in the FPN outputs
      activation: A `str` name of the activation function.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
        feature_channel].
    """
    if target_level > max_level:
      raise ValueError('target_level should be less than max_level')

    self._config_dict = {
        'min_level': min_level,
        'max_level': max_level,
        'target_level': target_level,
        'num_filters': num_filters,
        'num_fpn_filters': num_fpn_filters,
        'activation': activation,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
    }
    norm = tfa.layers.GroupNormalization
    conv2d = tf.keras.layers.Conv2D
    activation_fn = tf_utils.get_activation(activation)
    norm_axis = -1 if (
        tf.keras.backend.image_data_format() == 'channels_last') else 1
    inputs = self._build_inputs(num_fpn_filters, min_level, max_level)

    upscaled_features = []
    for level in range(min_level, max_level + 1):
      # Each level above the target gets one conv (+2x upsample) per level of
      # difference; the target level itself still gets a single conv.
      num_conv_layers = max(1, level - target_level)
      x = inputs[str(level)]
      for i in range(num_conv_layers):
        x = conv2d(
            filters=num_filters,
            kernel_size=3,
            padding='same',
            kernel_initializer=tf.keras.initializers.VarianceScaling(),
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer)(x)
        x = norm(groups=32, axis=norm_axis)(x)
        x = activation_fn(x)
        if level != target_level:
          x = spatial_transform_ops.nearest_upsampling(x, scale=2)
      upscaled_features.append(x)

    fused_features = tf.math.add_n(upscaled_features)
    self._output_specs = {str(target_level): fused_features.get_shape()}

    super(PanopticFPNFusion, self).__init__(
        inputs=inputs, outputs=fused_features, **kwargs)

  def _build_inputs(self, num_filters: int, min_level: int, max_level: int):
    # One symbolic input per pyramid level, keyed by the stringified level.
    inputs = {}
    for level in range(min_level, max_level + 1):
      inputs[str(level)] = tf.keras.Input(shape=[None, None, num_filters])
    return inputs

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)

  @property
  def output_specs(self) -> Mapping[str, tf.TensorShape]:
    """A dict of {level: TensorShape} pairs for the model output."""
    return self._output_specs
@tf.keras.utils.register_keras_serializable(package='Vision')
class Scale(tf.keras.layers.Layer):
  """Scales the input by a trainable scalar weight.

  This is useful for applying ReZero to layers, which improves convergence
  speed. This implements the paper:
  ReZero is All You Need: Fast Convergence at Large Depth.
  (https://arxiv.org/pdf/2003.04887.pdf).
  """

  def __init__(
      self,
      initializer: tf.keras.initializers.Initializer = 'ones',
      regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes a scale layer.

    Args:
      initializer: A `str` of initializer for the scalar weight.
      regularizer: A `tf.keras.regularizers.Regularizer` for the scalar weight.
      **kwargs: Additional keyword arguments to be passed to this layer.

    Returns:
      An `tf.Tensor` of which should have the same shape as input.
    """
    super(Scale, self).__init__(**kwargs)

    self._initializer = initializer
    self._regularizer = regularizer

    # A single trainable scalar; initialized to 1 by default ('ones').
    self._scale = self.add_weight(
        name='scale',
        shape=[],
        dtype=self.dtype,
        initializer=self._initializer,
        regularizer=self._regularizer,
        trainable=True)

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    config = {
        'initializer': self._initializer,
        'regularizer': self._regularizer,
    }
    base_config = super(Scale, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Calls the layer with the given inputs."""
    # Cast so mixed-precision inputs multiply cleanly with the fp32 weight.
    scale = tf.cast(self._scale, inputs.dtype)
    return scale * inputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class TemporalSoftmaxPool(tf.keras.layers.Layer):
  """Creates a network layer corresponding to temporal softmax pooling.

  This is useful for multi-class logits (used in e.g., Charades). Modified from
  AssembleNet Charades evaluation from:
  Michael S. Ryoo, AJ Piergiovanni, Mingxing Tan, Anelia Angelova.
  AssembleNet: Searching for Multi-Stream Neural Connectivity in Video
  Architectures.
  (https://arxiv.org/pdf/1905.13209.pdf).
  """

  def call(self, inputs):
    """Calls the layer with the given inputs."""
    assert inputs.shape.rank in (3, 4, 5)
    # Axis 1 is the temporal (frame) axis.
    frames = tf.shape(inputs)[1]
    # Temperature-scale logits by sqrt(num_frames) before the softmax.
    pre_logits = inputs / tf.sqrt(tf.cast(frames, inputs.dtype))
    activations = tf.nn.softmax(pre_logits, axis=1)
    # Weight each frame's features by its softmax attention over time.
    outputs = inputs * activations
    return outputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class PositionalEncoding(tf.keras.layers.Layer):
  """Creates a network layer that adds a sinusoidal positional encoding.

  Positional encoding is incremented across frames, and is added to the input.
  The positional encoding is first weighted at 0 so that the network can choose
  to ignore it. This implements:

  Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
  Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
  Attention Is All You Need.
  (https://arxiv.org/pdf/1706.03762.pdf).
  """

  def __init__(self,
               initializer: tf.keras.initializers.Initializer = 'zeros',
               cache_encoding: bool = False,
               state_prefix: Optional[str] = None,
               **kwargs):
    """Initializes positional encoding.

    Args:
      initializer: A `str` of initializer for weighting the positional encoding.
      cache_encoding: A `bool`. If True, cache the positional encoding tensor
        after calling build. Otherwise, rebuild the tensor for every call.
        Setting this to False can be useful when we want to input a variable
        number of frames, so the positional encoding tensor can change shape.
      state_prefix: a prefix string to identify states.
      **kwargs: Additional keyword arguments to be passed to this layer.

    Returns:
      A `tf.Tensor` of which should have the same shape as input.
    """
    super(PositionalEncoding, self).__init__(**kwargs)
    self._initializer = initializer
    self._cache_encoding = cache_encoding
    # Built lazily (in `build`) only when `cache_encoding` is True.
    self._pos_encoding = None
    # Learnable scalar gate on the encoding; with the default 'zeros'
    # initializer the encoding starts fully suppressed.
    self._rezero = Scale(initializer=initializer, name='rezero')

    state_prefix = state_prefix if state_prefix is not None else ''
    self._state_prefix = state_prefix
    # Streaming-state key holding the number of frames seen so far.
    self._frame_count_name = f'{state_prefix}_pos_enc_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    config = {
        'initializer': self._initializer,
        'cache_encoding': self._cache_encoding,
        'state_prefix': self._state_prefix,
    }
    base_config = super(PositionalEncoding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def _positional_encoding(self,
                           num_positions: Union[int, tf.Tensor],
                           hidden_size: Union[int, tf.Tensor],
                           start_position: Union[int, tf.Tensor] = 0,
                           dtype: str = 'float32') -> tf.Tensor:
    """Creates a sequence of sinusoidal positional encoding vectors.

    Args:
      num_positions: the total number of positions (frames).
      hidden_size: the number of channels used for the hidden vectors.
      start_position: the start position.
      dtype: the dtype of the output tensor.

    Returns:
      The positional encoding tensor with shape [num_positions, hidden_size].
    """
    # A rank-1 `start_position` comes from the streaming-state dict; use its
    # scalar element.
    if isinstance(start_position, tf.Tensor) and start_position.shape.rank == 1:
      start_position = start_position[0]

    # Calling `tf.range` with `dtype=tf.bfloat16` results in an error,
    # so we cast afterward.
    positions = tf.range(start_position, start_position + num_positions)
    positions = tf.cast(positions, dtype)[:, tf.newaxis]
    idx = tf.range(hidden_size)[tf.newaxis, :]
    # Standard transformer frequencies: 1 / 10000^(2*(i//2)/hidden_size).
    power = tf.cast(2 * (idx // 2), dtype)
    power /= tf.cast(hidden_size, dtype)
    angles = 1. / tf.math.pow(10_000., power)
    radians = positions * angles
    sin = tf.math.sin(radians[:, 0::2])
    cos = tf.math.cos(radians[:, 1::2])
    pos_encoding = tf.concat([sin, cos], axis=-1)

    return pos_encoding

  def _get_pos_encoding(self,
                        input_shape: tf.Tensor,
                        frame_count: int = 0) -> tf.Tensor:
    """Calculates the positional encoding from the input shape.

    Args:
      input_shape: the shape of the input.
      frame_count: a count of frames that indicates the index of the first
        frame.

    Returns:
      The positional encoding tensor with shape [num_positions, hidden_size].
    """
    frames = input_shape[1]
    channels = input_shape[-1]
    pos_encoding = self._positional_encoding(
        frames, channels, start_position=frame_count, dtype=self.dtype)
    # Reshape to broadcast over the batch and spatial axes of a 5D video
    # tensor [batch, frames, height, width, channels].
    pos_encoding = tf.reshape(pos_encoding, [1, frames, 1, 1, channels])
    return pos_encoding

  def build(self, input_shape):
    """Builds the layer with the given input shape.

    Args:
      input_shape: The input shape.

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    if self._cache_encoding:
      self._pos_encoding = self._get_pos_encoding(input_shape)

    super(PositionalEncoding, self).build(input_shape)

  def call(
      self,
      inputs: tf.Tensor,
      states: Optional[States] = None,
      output_states: bool = True,
  ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
    """Calls the layer with the given inputs.

    Args:
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s). Expected keys
        include `state_prefix + '_pos_enc_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.

    Returns:
      An output `tf.Tensor` (and optionally the states if `output_states=True`).

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    states = dict(states) if states is not None else {}

    # Keep a count of frames encountered across input iterations in
    # num_frames to be able to accurately update the positional encoding.
    num_frames = tf.shape(inputs)[1]
    frame_count = tf.cast(states.get(self._frame_count_name, [0]), tf.int32)
    states[self._frame_count_name] = frame_count + num_frames

    if self._cache_encoding:
      pos_encoding = self._pos_encoding
    else:
      # Rebuild with the running frame count so streamed clips continue the
      # encoding where the previous clip left off.
      pos_encoding = self._get_pos_encoding(
          tf.shape(inputs), frame_count=frame_count)
    pos_encoding = tf.cast(pos_encoding, inputs.dtype)
    pos_encoding = self._rezero(pos_encoding)
    outputs = inputs + pos_encoding

    return (outputs, states) if output_states else outputs
@tf.keras.utils.register_keras_serializable(package='Vision')
class GlobalAveragePool3D(tf.keras.layers.Layer):
  """Creates a global average pooling layer with causal mode.

  Implements causal mode, which runs a cumulative sum (with `tf.cumsum`) across
  frames in the time dimension, allowing the use of a stream buffer. Sums any
  valid input state with the current input to allow state to accumulate over
  several iterations.
  """

  def __init__(self,
               keepdims: bool = False,
               causal: bool = False,
               state_prefix: Optional[str] = None,
               **kwargs):
    """Initializes a global average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      causal: A `bool` of whether to run in causal mode with a cumulative sum
        across frames.
      state_prefix: a prefix string to identify states.
      **kwargs: Additional keyword arguments to be passed to this layer.

    Returns:
      An output `tf.Tensor`.
    """
    super(GlobalAveragePool3D, self).__init__(**kwargs)

    self._keepdims = keepdims
    self._causal = causal
    state_prefix = state_prefix if state_prefix is not None else ''
    self._state_prefix = state_prefix

    # Streaming-state keys: the running-sum buffer and the frame counter.
    self._state_name = f'{state_prefix}_pool_buffer'
    self._frame_count_name = f'{state_prefix}_pool_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
    config = {
        'keepdims': self._keepdims,
        'causal': self._causal,
        'state_prefix': self._state_prefix,
    }
    base_config = super(GlobalAveragePool3D, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self,
           inputs: tf.Tensor,
           states: Optional[States] = None,
           output_states: bool = True
           ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
    """Calls the layer with the given inputs.

    Args:
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s).
        Expected keys include `state_prefix + '_pool_buffer'` and
        `state_prefix + '_pool_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.

    Returns:
      An output `tf.Tensor` (and optionally the states if `output_states=True`).
      If `causal=True`, the output tensor will have shape
      `[batch_size, num_frames, 1, 1, channels]` if `keepdims=True`. We keep
      the frame dimension in this case to simulate a cumulative global average
      as if we are inputting one frame at a time. If `causal=False`, the output
      is equivalent to `tf.keras.layers.GlobalAveragePooling3D` with shape
      `[batch_size, 1, 1, 1, channels]` if `keepdims=True` (plus the optional
      buffer stored in `states`).

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    states = dict(states) if states is not None else {}

    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    # Shape: [batch_size, 1, 1, 1, channels]
    buffer = states.get(self._state_name, None)
    if buffer is None:
      buffer = tf.zeros_like(inputs[:, :1, :1, :1], dtype=inputs.dtype)
      states[self._state_name] = buffer

    # Keep a count of frames encountered across input iterations in
    # num_frames to be able to accurately take a cumulative average across
    # all frames when running in streaming mode
    num_frames = tf.shape(inputs)[1]
    frame_count = states.get(self._frame_count_name, tf.constant([0]))
    frame_count = tf.cast(frame_count, tf.int32)
    states[self._frame_count_name] = frame_count + num_frames

    if self._causal:
      # Take a mean of spatial dimensions to make computation more efficient.
      x = tf.reduce_mean(inputs, axis=[2, 3], keepdims=True)
      x = tf.cumsum(x, axis=1)
      x = x + buffer

      # The last frame will be the value of the next state
      # Shape: [batch_size, 1, 1, 1, channels]
      states[self._state_name] = x[:, -1:]

      # In causal mode, the divisor increments by 1 for every frame to
      # calculate cumulative averages instead of one global average
      mean_divisors = tf.range(num_frames) + frame_count + 1
      mean_divisors = tf.reshape(mean_divisors, [1, num_frames, 1, 1, 1])
      mean_divisors = tf.cast(mean_divisors, x.dtype)

      # Shape: [batch_size, num_frames, 1, 1, channels]
      x = x / mean_divisors
    else:
      # In non-causal mode, we (optionally) sum across frames to take a
      # cumulative average across input iterations rather than individual
      # frames. If no buffer state is passed, this essentially becomes
      # regular global average pooling.
      # Shape: [batch_size, 1, 1, 1, channels]
      x = tf.reduce_sum(inputs, axis=(1, 2, 3), keepdims=True)
      # Divide out only the spatial extent here; the temporal divisor is
      # applied after adding the buffer so the stream average is exact.
      x = x / tf.cast(tf.shape(inputs)[2] * tf.shape(inputs)[3], x.dtype)
      x = x + buffer

      # Shape: [batch_size, 1, 1, 1, channels]
      states[self._state_name] = x

      x = x / tf.cast(frame_count + num_frames, x.dtype)

    if not self._keepdims:
      x = tf.squeeze(x, axis=(1, 2, 3))

    return (x, states) if output_states else x
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialAveragePool3D(tf.keras.layers.Layer):
  """Average-pools a rank-5 video tensor across its spatial dimensions."""

  def __init__(self, keepdims: bool = False, **kwargs):
    """Initializes a spatial average pool layer.

    Args:
      keepdims: A `bool`. If True, keep the averaged dimensions.
      **kwargs: Additional keyword arguments to be passed to this layer.
    """
    super().__init__(**kwargs)
    self._keepdims = keepdims

  def get_config(self):
    """Returns the serializable config of this layer."""
    base_config = super().get_config()
    base_config.update({'keepdims': self._keepdims})
    return base_config

  def build(self, input_shape):
    """Builds the layer; only 'channels_last' data format is supported."""
    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    super().build(input_shape)

  def call(self, inputs):
    """Averages `inputs` over its height and width axes (axes 2 and 3)."""
    if inputs.shape.rank != 5:
      raise ValueError('Input should have rank {}, got {}'.format(
          5, inputs.shape.rank))
    return tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims)
class CausalConvMixin:
  """Mixin class to implement CausalConv for `tf.keras.layers.Conv` layers."""

  @property
  def use_buffered_input(self) -> bool:
    # True when the caller pre-pads the input (stream-buffer mode), so this
    # layer should not apply temporal padding itself.
    return self._use_buffered_input

  @use_buffered_input.setter
  def use_buffered_input(self, variable: bool):
    self._use_buffered_input = variable

  def _compute_buffered_causal_padding(self,
                                       inputs: tf.Tensor,
                                       use_buffered_input: bool = False,
                                       time_axis: int = 1,
                                       ) -> List[List[int]]:
    """Calculates padding for 'causal' option for conv layers.

    Args:
      inputs: An optional input `tf.Tensor` to be padded.
      use_buffered_input: A `bool`. If True, use 'valid' padding along the time
        dimension. This should be set when applying the stream buffer.
      time_axis: An `int` of the axis of the time dimension.

    Returns:
      A list of paddings for `tf.pad`.
    """
    # Spatial-temporal dims only (drop batch and channel axes).
    input_shape = tf.shape(inputs)[1:-1]

    if tf.keras.backend.image_data_format() == 'channels_first':
      raise ValueError('"channels_first" mode is unsupported.')

    # Effective kernel extent once dilation is taken into account.
    kernel_size_effective = [
        (self.kernel_size[i] +
         (self.kernel_size[i] - 1) * (self.dilation_rate[i] - 1))
        for i in range(self.rank)
    ]
    # Time axis (index 0) always pads kernel-1; spatial axes use
    # 'same'-style totals that account for stride overlap.
    pad_total = [kernel_size_effective[0] - 1]
    for i in range(1, self.rank):
      overlap = (input_shape[i] - 1) % self.strides[i] + 1
      pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))
    pad_beg = [pad_total[i] // 2 for i in range(self.rank)]
    pad_end = [pad_total[i] - pad_beg[i] for i in range(self.rank)]
    padding = [[pad_beg[i], pad_end[i]] for i in range(self.rank)]
    # No padding on the batch and channel axes.
    padding = [[0, 0]] + padding + [[0, 0]]

    if use_buffered_input:
      # Stream buffer already supplies past frames; pad nothing in time.
      padding[time_axis] = [0, 0]
    else:
      # Causal: shift all temporal padding to the front (past) side.
      padding[time_axis] = [padding[time_axis][0] + padding[time_axis][1], 0]
    return padding

  def _causal_validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Overriding this method is meant to circumvent unnecessary errors when
    # using causal padding.
    if (self.filters is not None
        and self.filters % self.groups != 0):
      raise ValueError(
          'The number of filters must be evenly divisible by the number of '
          'groups. Received: groups={}, filters={}'.format(
              self.groups, self.filters))
    if not all(self.kernel_size):
      raise ValueError('The argument `kernel_size` cannot contain 0(s). '
                       'Received: %s' % (self.kernel_size,))

  def _buffered_spatial_output_shape(self, spatial_output_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    # When buffer padding, use 'valid' padding across time. The output shape
    # across time should be the input shape minus any padding, assuming
    # the stride across time is 1.
    if self._use_buffered_input and spatial_output_shape[0] is not None:
      padding = self._compute_buffered_causal_padding(
          tf.zeros([1] + spatial_output_shape + [1]),
          use_buffered_input=False)
      spatial_output_shape[0] -= sum(padding[1])
    return spatial_output_shape
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv2D(tf.keras.layers.Conv2D, CausalConvMixin):
  """Conv2D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes conv2d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      An output `tf.Tensor` of the Conv2D operation.
    """
    super().__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

  def get_config(self):
    """Returns the serializable config of this layer."""
    base_config = super().get_config()
    base_config.update({'use_buffered_input': self._use_buffered_input})
    return base_config

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=self._use_buffered_input)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Delegate to the mixin so 'causal' padding is accepted.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    return self._buffered_spatial_output_shape(
        super()._spatial_output_shape(spatial_input_shape))
@tf.keras.utils.register_keras_serializable(package='Vision')
class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, CausalConvMixin):
  """DepthwiseConv2D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes depthwise conv2d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      An output `tf.Tensor` of the DepthwiseConv2D operation.
    """
    super().__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

    # Causal padding is unsupported by default for DepthwiseConv2D,
    # so we resort to valid padding internally. However, we handle
    # causal padding as a special case with `self._is_causal`, which is
    # defined by the super class.
    if self.padding == 'causal':
      self.padding = 'valid'

  def get_config(self):
    """Returns the serializable config of this layer."""
    base_config = super().get_config()
    base_config.update({'use_buffered_input': self._use_buffered_input})
    return base_config

  def call(self, inputs):
    """Applies explicit causal padding (when requested), then convolves."""
    if self._is_causal:
      causal_padding = self._compute_causal_padding(inputs)
      inputs = tf.pad(inputs, causal_padding)
    return super().call(inputs)

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=self._use_buffered_input)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Delegate to the mixin so 'causal' padding is accepted.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    return self._buffered_spatial_output_shape(
        super()._spatial_output_shape(spatial_input_shape))
@tf.keras.utils.register_keras_serializable(package='Vision')
class Conv3D(tf.keras.layers.Conv3D, CausalConvMixin):
  """Conv3D layer supporting CausalConv.

  Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`),
  which applies causal padding to the temporal dimension, and same padding in
  the spatial dimensions.
  """

  def __init__(self, *args, use_buffered_input=False, **kwargs):
    """Initializes conv3d.

    Args:
      *args: Arguments to be passed.
      use_buffered_input: A `bool`. If True, the input is expected to be padded
        beforehand. In effect, calling this layer will use 'valid' padding on
        the temporal dimension to simulate 'causal' padding.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      An output `tf.Tensor` of the Conv3D operation.
    """
    super().__init__(*args, **kwargs)
    self._use_buffered_input = use_buffered_input

  def get_config(self):
    """Returns the serializable config of this layer."""
    base_config = super().get_config()
    base_config.update({'use_buffered_input': self._use_buffered_input})
    return base_config

  def call(self, inputs):
    """Call the layer with the given inputs."""
    # Note: tf.nn.conv3d with depthwise kernels on CPU is currently only
    # supported when compiling with TF graph (XLA) using tf.function, so it
    # is compiled by default here (b/186463870).
    compiled_conv = tf.function(super().call, jit_compile=True)
    return compiled_conv(inputs)

  def _compute_causal_padding(self, inputs):
    """Computes causal padding dimensions for the given inputs."""
    return self._compute_buffered_causal_padding(
        inputs, use_buffered_input=self._use_buffered_input)

  def _validate_init(self):
    """Validates the Conv layer initial configuration."""
    # Delegate to the mixin so 'causal' padding is accepted.
    self._causal_validate_init()

  def _spatial_output_shape(self, spatial_input_shape: List[int]):
    """Computes the spatial output shape from the input shape."""
    return self._buffered_spatial_output_shape(
        super()._spatial_output_shape(spatial_input_shape))
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpatialPyramidPooling(tf.keras.layers.Layer):
  """Implements the Atrous Spatial Pyramid Pooling.

  References:
    [Rethinking Atrous Convolution for Semantic Image Segmentation](
      https://arxiv.org/pdf/1706.05587.pdf)
    [Encoder-Decoder with Atrous Separable Convolution for Semantic Image
      Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
  """

  def __init__(
      self,
      output_channels: int,
      dilation_rates: List[int],
      pool_kernel_size: Optional[List[int]] = None,
      use_sync_bn: bool = False,
      batchnorm_momentum: float = 0.99,
      batchnorm_epsilon: float = 0.001,
      activation: str = 'relu',
      dropout: float = 0.5,
      kernel_initializer: str = 'GlorotUniform',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      interpolation: str = 'bilinear',
      use_depthwise_convolution: bool = False,
      **kwargs):
    """Initializes `SpatialPyramidPooling`.

    Args:
      output_channels: Number of channels produced by SpatialPyramidPooling.
      dilation_rates: A list of integers for parallel dilated conv.
      pool_kernel_size: A list of integers or None. If None, global average
        pooling is applied, otherwise an average pooling of pool_kernel_size is
        applied.
      use_sync_bn: A bool, whether or not to use sync batch normalization.
      batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
        0.99.
      batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
        0.001.
      activation: A `str` for type of activation to be used. Defaults to 'relu'.
      dropout: A float for the dropout rate before output. Defaults to 0.5.
      kernel_initializer: Kernel initializer for conv layers. Defaults to
        `glorot_uniform`.
      kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
      interpolation: The interpolation method for upsampling. Defaults to
        `bilinear`.
      use_depthwise_convolution: Allows spatial pooling to be separable
        depthwise convolusions. [Encoder-Decoder with Atrous Separable
        Convolution for Semantic Image Segmentation](
        https://arxiv.org/pdf/1802.02611.pdf)
      **kwargs: Other keyword arguments for the layer.
    """
    super().__init__(**kwargs)

    self._output_channels = output_channels
    self._dilation_rates = dilation_rates
    self._use_sync_bn = use_sync_bn
    self._batchnorm_momentum = batchnorm_momentum
    self._batchnorm_epsilon = batchnorm_epsilon
    self._activation = activation
    self._dropout = dropout
    self._kernel_initializer = kernel_initializer
    self._kernel_regularizer = kernel_regularizer
    self._interpolation = interpolation
    self._pool_kernel_size = pool_kernel_size
    self._use_depthwise_convolution = use_depthwise_convolution
    self._activation_fn = tf_utils.get_activation(activation)
    if self._use_sync_bn:
      self._bn_op = tf.keras.layers.experimental.SyncBatchNormalization
    else:
      self._bn_op = tf.keras.layers.BatchNormalization

    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1

  def build(self, input_shape):
    """Builds the parallel ASPP branches and the projection head."""
    height = input_shape[1]
    width = input_shape[2]
    channels = input_shape[3]

    self.aspp_layers = []

    # Branch 1: 1x1 conv + BN.
    conv1 = tf.keras.layers.Conv2D(
        filters=self._output_channels,
        kernel_size=(1, 1),
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        use_bias=False)
    norm1 = self._bn_op(
        axis=self._bn_axis,
        momentum=self._batchnorm_momentum,
        epsilon=self._batchnorm_epsilon)

    self.aspp_layers.append([conv1, norm1])

    # Branches 2..N: one (optionally depthwise-separable) dilated 3x3 conv
    # per dilation rate, each followed by BN.
    for dilation_rate in self._dilation_rates:
      leading_layers = []
      kernel_size = (3, 3)
      if self._use_depthwise_convolution:
        leading_layers += [
            tf.keras.layers.DepthwiseConv2D(
                depth_multiplier=1,
                kernel_size=kernel_size,
                padding='same',
                depthwise_regularizer=self._kernel_regularizer,
                depthwise_initializer=self._kernel_initializer,
                dilation_rate=dilation_rate,
                use_bias=False)
        ]
        # The depthwise conv carries the 3x3 receptive field; the pointwise
        # conv that follows only mixes channels.
        kernel_size = (1, 1)
      conv_dilation = leading_layers + [
          tf.keras.layers.Conv2D(
              filters=self._output_channels,
              kernel_size=kernel_size,
              padding='same',
              kernel_regularizer=self._kernel_regularizer,
              kernel_initializer=self._kernel_initializer,
              dilation_rate=dilation_rate,
              use_bias=False)
      ]
      norm_dilation = self._bn_op(
          axis=self._bn_axis,
          momentum=self._batchnorm_momentum,
          epsilon=self._batchnorm_epsilon)

      self.aspp_layers.append(conv_dilation + [norm_dilation])

    # Image-pooling branch: global (or kernel-sized) average pool, 1x1 conv,
    # BN; resized back to the input spatial size in `call`.
    if self._pool_kernel_size is None:
      pooling = [
          tf.keras.layers.GlobalAveragePooling2D(),
          tf.keras.layers.Reshape((1, 1, channels))
      ]
    else:
      pooling = [tf.keras.layers.AveragePooling2D(self._pool_kernel_size)]

    conv2 = tf.keras.layers.Conv2D(
        filters=self._output_channels,
        kernel_size=(1, 1),
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        use_bias=False)
    norm2 = self._bn_op(
        axis=self._bn_axis,
        momentum=self._batchnorm_momentum,
        epsilon=self._batchnorm_epsilon)

    self.aspp_layers.append(pooling + [conv2, norm2])

    # Resize in float32 for numerically stable interpolation under
    # mixed precision.
    self._resizing_layer = tf.keras.layers.Resizing(
        height, width, interpolation=self._interpolation, dtype=tf.float32)

    # Projection of the concatenated branch outputs back to
    # `output_channels`.
    self._projection = [
        tf.keras.layers.Conv2D(
            filters=self._output_channels,
            kernel_size=(1, 1),
            kernel_initializer=self._kernel_initializer,
            kernel_regularizer=self._kernel_regularizer,
            use_bias=False),
        self._bn_op(
            axis=self._bn_axis,
            momentum=self._batchnorm_momentum,
            epsilon=self._batchnorm_epsilon)
    ]
    self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
    self._concat_layer = tf.keras.layers.Concatenate(axis=-1)

  def call(self,
           inputs: tf.Tensor,
           training: Optional[bool] = None) -> tf.Tensor:
    """Runs all ASPP branches on `inputs`, concatenates, and projects.

    Args:
      inputs: A [batch, height, width, channels] `tf.Tensor`.
      training: A `bool` (or None to fall back to the Keras learning phase)
        controlling BatchNorm/Dropout behavior.

    Returns:
      A [batch, height, width, output_channels] `tf.Tensor`.
    """
    if training is None:
      training = tf.keras.backend.learning_phase()
    result = []
    for i, layers in enumerate(self.aspp_layers):
      x = inputs
      for layer in layers:
        # Apply layers sequentially.
        x = layer(x, training=training)
      x = self._activation_fn(x)

      # Apply resize layer to the end of the last set of layers.
      if i == len(self.aspp_layers) - 1:
        x = self._resizing_layer(x)

      result.append(tf.cast(x, inputs.dtype))
    x = self._concat_layer(result)
    for layer in self._projection:
      x = layer(x, training=training)
    x = self._activation_fn(x)
    return self._dropout_layer(x)

  def get_config(self):
    """Returns the serializable config of this layer."""
    config = {
        'output_channels': self._output_channels,
        'dilation_rates': self._dilation_rates,
        'pool_kernel_size': self._pool_kernel_size,
        'use_sync_bn': self._use_sync_bn,
        'batchnorm_momentum': self._batchnorm_momentum,
        'batchnorm_epsilon': self._batchnorm_epsilon,
        'activation': self._activation,
        'dropout': self._dropout,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'interpolation': self._interpolation,
        # Bug fix: this key was previously missing, so layers restored via
        # `from_config`/model deserialization silently dropped the
        # depthwise-separable convolution option.
        'use_depthwise_convolution': self._use_depthwise_convolution,
    }
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(config.items()))
# ==== New file added in commit 0225b135 (mode 100644):
#      official/vision/modeling/layers/nn_layers_test.py ====
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for nn_layers."""
# Import libraries
from
absl.testing
import
parameterized
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
nn_layers
class
NNLayersTest
(
parameterized
.
TestCase
,
tf
.
test
.
TestCase
):
def test_scale(self):
  """A Scale layer initialized to 10 should multiply its input by 10."""
  layer = nn_layers.Scale(initializer=tf.keras.initializers.constant(10.))
  result = layer(3.)
  self.assertAllEqual(result, 30.)
def test_temporal_softmax_pool(self):
  """Checks TemporalSoftmaxPool against precomputed reference values."""
  video = tf.reshape(tf.range(4, dtype=tf.float32) + 1., [1, 4, 1, 1, 1])
  layer = nn_layers.TemporalSoftmaxPool()
  result = layer(video)
  self.assertAllClose(
      result,
      [[[[[0.10153633]]],
        [[[0.33481020]]],
        [[[0.82801306]]],
        [[[1.82021690]]]]])
def test_positional_encoding(self):
  """Cached and uncached encodings should agree with reference values."""
  uncached_layer = nn_layers.PositionalEncoding(
      initializer='ones', cache_encoding=False)
  cached_layer = nn_layers.PositionalEncoding(
      initializer='ones', cache_encoding=True)
  video = tf.ones([1, 4, 1, 1, 3])

  out_uncached, _ = uncached_layer(video)
  out_cached, _ = cached_layer(video)
  expected = tf.constant(
      [[[[[1.0000000, 1.0000000, 2.0000000]]],
        [[[1.8414710, 1.0021545, 1.5403023]]],
        [[[1.9092975, 1.0043088, 0.5838531]]],
        [[[1.1411200, 1.0064633, 0.0100075]]]]])

  self.assertEqual(out_uncached.shape, expected.shape)
  self.assertAllClose(out_uncached, expected)

  self.assertEqual(out_uncached.shape, out_cached.shape)
  self.assertAllClose(out_uncached, out_cached)

  # The uncached layer must also accept a different number of frames.
  _ = uncached_layer(tf.ones([1, 5, 1, 1, 3]))
def test_positional_encoding_bfloat16(self):
  """PositionalEncoding should handle bfloat16 inputs without error."""
  layer = nn_layers.PositionalEncoding(initializer='ones')
  video = tf.ones([1, 4, 1, 1, 3], dtype=tf.bfloat16)
  result, _ = layer(video)
  expected = tf.constant(
      [[[[[1.0000000, 1.0000000, 2.0000000]]],
        [[[1.8414710, 1.0021545, 1.5403023]]],
        [[[1.9092975, 1.0043088, 0.5838531]]],
        [[[1.1411200, 1.0064633, 0.0100075]]]]])
  self.assertEqual(result.shape, expected.shape)
  self.assertAllClose(result, expected)
def test_global_average_pool_basic(self):
  """Pooling an all-ones video should yield an all-ones [1,1,1,1,1] tensor."""
  layer = nn_layers.GlobalAveragePool3D(keepdims=True)
  video = tf.ones([1, 2, 3, 4, 1])
  result = layer(video, output_states=False)
  expected = tf.ones([1, 1, 1, 1, 1])
  self.assertEqual(result.shape, expected.shape)
  self.assertAllEqual(result, expected)
  def test_positional_encoding_stream(self):
    """Checks streaming (frame-by-frame) encoding matches the full-clip pass."""
    pos_encoding = nn_layers.PositionalEncoding(
        initializer='ones', cache_encoding=False)

    # Input values [1, 2, 3, 4] along time, tiled to 3 channels.
    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 1, 1, 3])
    expected, _ = pos_encoding(inputs)

    # Feed the clip in chunks of varying sizes, threading the states dict
    # through successive calls, and compare against the one-shot result.
    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = []
      for frame in frames:
        output, states = pos_encoding(frame, states=states)
        predicted.append(output)
      predicted = tf.concat(predicted, axis=1)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllClose(predicted,
                          [[[[[1.0000000, 1.0000000, 2.0000000]]],
                            [[[2.8414710, 2.0021544, 2.5403023]]],
                            [[[3.9092975, 3.0043090, 2.5838532]]],
                            [[[4.1411200, 4.0064630, 3.0100074]]]]])
def
test_global_average_pool_keras
(
self
):
pool
=
nn_layers
.
GlobalAveragePool3D
(
keepdims
=
False
)
keras_pool
=
tf
.
keras
.
layers
.
GlobalAveragePooling3D
()
inputs
=
10
*
tf
.
random
.
normal
([
1
,
2
,
3
,
4
,
1
])
outputs
=
pool
(
inputs
,
output_states
=
False
)
keras_output
=
keras_pool
(
inputs
)
self
.
assertAllEqual
(
outputs
.
shape
,
keras_output
.
shape
)
self
.
assertAllClose
(
outputs
,
keras_output
)
  def test_stream_global_average_pool(self):
    """Checks streaming non-causal GAP converges to the full-clip average."""
    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=False)

    # Temporal values [1, 2, 3, 4], tiled over a 2x2 spatial grid, 3 channels.
    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 2, 2, 3])

    expected, _ = gap(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = None
      # Only the final streamed output matters: the running state accumulates
      # across chunks and the last call returns the overall average.
      for frame in frames:
        predicted, states = gap(frame, states=states)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      # mean([1, 2, 3, 4]) == 2.5 in every channel.
      self.assertAllClose(predicted, [[[[[2.5, 2.5, 2.5]]]]])
  def test_causal_stream_global_average_pool(self):
    """Checks causal streaming GAP yields per-frame cumulative averages."""
    gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=True)

    # Temporal values [1, 2, 3, 4], tiled over a 2x2 spatial grid, 3 channels.
    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 2, 2, 3])

    expected, _ = gap(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, num_splits, axis=1)
      states = {}
      predicted = []
      for frame in frames:
        x, states = gap(frame, states=states)
        predicted.append(x)
      # Causal pooling emits one output per input frame, so concatenate the
      # chunk outputs back along the time axis.
      predicted = tf.concat(predicted, axis=1)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      # Frame t holds mean(inputs[0..t]): 1, 1.5, 2, 2.5.
      self.assertAllClose(predicted,
                          [[[[[1.0, 1.0, 1.0]]],
                            [[[1.5, 1.5, 1.5]]],
                            [[[2.0, 2.0, 2.0]]],
                            [[[2.5, 2.5, 2.5]]]]])
def
test_spatial_average_pool
(
self
):
pool
=
nn_layers
.
SpatialAveragePool3D
(
keepdims
=
True
)
inputs
=
tf
.
range
(
64
,
dtype
=
tf
.
float32
)
+
1.
inputs
=
tf
.
reshape
(
inputs
,
[
1
,
4
,
4
,
4
,
1
])
output
=
pool
(
inputs
)
self
.
assertEqual
(
output
.
shape
,
[
1
,
4
,
1
,
1
,
1
])
self
.
assertAllClose
(
output
,
[[[[[
8.50
]]],
[[[
24.5
]]],
[[[
40.5
]]],
[[[
56.5
]]]]])
  def test_conv2d_causal(self):
    """Checks buffered and unbuffered causal Conv2D give identical results."""
    conv2d = nn_layers.Conv2D(
        filters=3,
        kernel_size=(3, 3),
        strides=(1, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 4, 2, 3])

    # With use_buffered_input=True the caller supplies the causal padding
    # (2 leading rows for a kernel of height 3) explicitly.
    paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv2d(padded_inputs)

    expected = tf.constant(
        [[[[6.0, 6.0, 6.0]],
          [[12., 12., 12.]],
          [[18., 18., 18.]],
          [[18., 18., 18.]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # With buffering disabled, the layer pads internally; the unpadded input
    # must then produce the same output.
    conv2d.use_buffered_input = False
    predicted = conv2d(inputs)

    self.assertFalse(conv2d.use_buffered_input)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)
  def test_depthwise_conv2d_causal(self):
    """Checks causal DepthwiseConv2D with and without input buffering."""
    conv2d = nn_layers.DepthwiseConv2D(
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='causal',
        use_buffered_input=True,
        depthwise_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 2, 2, 3])

    # Buffered mode: pad 2 leading rows manually (kernel height 3).
    paddings = [[0, 0], [2, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv2d(padded_inputs)

    expected = tf.constant(
        [[[[2., 2., 2.],
           [2., 2., 2.]],
          [[4., 4., 4.],
           [4., 4., 4.]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # Unbuffered mode must reproduce the same output from the raw input.
    conv2d.use_buffered_input = False
    predicted = conv2d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)
  def test_conv3d_causal(self):
    """Checks buffered and unbuffered causal Conv3D give identical results."""
    conv3d = nn_layers.Conv3D(
        filters=3,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 2, 4, 4, 3])

    # Buffered mode: pad 2 leading frames manually (temporal kernel size 3).
    paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv3d(padded_inputs)

    expected = tf.constant(
        [[[[[27., 27., 27.],
            [18., 18., 18.]],
           [[18., 18., 18.],
            [12., 12., 12.]]],
          [[[54., 54., 54.],
            [36., 36., 36.]],
           [[36., 36., 36.],
            [24., 24., 24.]]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # Unbuffered mode must reproduce the same output from the raw input.
    conv3d.use_buffered_input = False
    predicted = conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)
  def test_depthwise_conv3d_causal(self):
    """Checks causal grouped (depthwise-like) Conv3D and its output shapes."""
    # groups=3 with 3 filters makes each output channel see one input channel.
    conv3d = nn_layers.Conv3D(
        filters=3,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=True,
        kernel_initializer='ones',
        use_bias=False,
        groups=3,
    )

    inputs = tf.ones([1, 2, 4, 4, 3])

    # Buffered mode: pad 2 leading frames manually (temporal kernel size 3).
    paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)
    predicted = conv3d(padded_inputs)

    expected = tf.constant(
        [[[[[9.0, 9.0, 9.0],
            [6.0, 6.0, 6.0]],
           [[6.0, 6.0, 6.0],
            [4.0, 4.0, 4.0]]],
          [[[18.0, 18.0, 18.0],
            [12., 12., 12.]],
           [[12., 12., 12.],
            [8., 8., 8.]]]]])

    # Also verify the internal spatial shape computation for stride 2.
    output_shape = conv3d._spatial_output_shape([4, 4, 4])
    self.assertAllClose(output_shape, [2, 2, 2])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    # Unbuffered mode must reproduce the same output from the raw input.
    conv3d.use_buffered_input = False
    predicted = conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)
  def test_conv3d_causal_padding_2d(self):
    """Test to ensure causal padding works like standard padding.

    With a temporal kernel size of 1, 'causal' padding should be spatially
    equivalent to Keras 'same' padding.
    """
    conv3d = nn_layers.Conv3D(
        filters=1,
        kernel_size=(1, 3, 3),
        strides=(1, 2, 2),
        padding='causal',
        use_buffered_input=False,
        kernel_initializer='ones',
        use_bias=False,
    )

    # Reference layer: identical config except standard 'same' padding.
    keras_conv3d = tf.keras.layers.Conv3D(
        filters=1,
        kernel_size=(1, 3, 3),
        strides=(1, 2, 2),
        padding='same',
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 1, 4, 4, 1])

    predicted = conv3d(inputs)
    expected = keras_conv3d(inputs)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    self.assertAllClose(predicted,
                        [[[[[9.], [6.]],
                           [[6.], [4.]]]]])
  def test_conv3d_causal_padding_1d(self):
    """Test to ensure causal padding works like standard padding.

    With spatial kernel sizes of 1, causal Conv3D should match a Keras
    Conv1D with 'causal' padding applied along time.
    """
    conv3d = nn_layers.Conv3D(
        filters=1,
        kernel_size=(3, 1, 1),
        strides=(2, 1, 1),
        padding='causal',
        use_buffered_input=False,
        kernel_initializer='ones',
        use_bias=False,
    )

    # Reference: 1D causal convolution over the temporal dimension only.
    keras_conv1d = tf.keras.layers.Conv1D(
        filters=1,
        kernel_size=3,
        strides=2,
        padding='causal',
        kernel_initializer='ones',
        use_bias=False,
    )

    inputs = tf.ones([1, 4, 1, 1, 1])

    predicted = conv3d(inputs)
    # Drop the singleton spatial dims for the 1D reference, then restore
    # them on the expected output for a shape-exact comparison.
    expected = keras_conv1d(tf.squeeze(inputs, axis=[2, 3]))
    expected = tf.reshape(expected, [1, 2, 1, 1, 1])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

    self.assertAllClose(predicted, [[[[[1.]]], [[[3.]]]]])
  @parameterized.parameters(
      (None, []),
      (None, [6, 12, 18]),
      ([32, 32], [6, 12, 18]),
  )
  def test_aspp(self, pool_kernel_size, dilation_rates):
    """Checks SpatialPyramidPooling output shape for several ASPP configs."""
    inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
    layer = nn_layers.SpatialPyramidPooling(
        output_channels=256,
        dilation_rates=dilation_rates,
        pool_kernel_size=pool_kernel_size)
    output = layer(inputs)
    # Spatial size is preserved; channels become output_channels.
    self.assertAllEqual([None, 64, 64, 256], output.shape)
if __name__ == '__main__':
  # Run all test cases in this module via the TensorFlow test runner.
  tf.test.main()
official/vision/modeling/layers/roi_aligner.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI aligner."""
from
typing
import
Mapping
import
tensorflow
as
tf
from
official.vision.ops
import
spatial_transform_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIAligner(tf.keras.layers.Layer):
  """Performs ROIAlign for the second stage processing."""

  def __init__(self,
               crop_size: int = 7,
               sample_offset: float = 0.5,
               **kwargs):
    """Initializes a ROI aligner.

    Args:
      crop_size: An `int` of the output size of the cropped features.
      sample_offset: A `float` in [0, 1] of the subpixel sample offset.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'crop_size': crop_size,
        'sample_offset': sample_offset,
    }
    super().__init__(**kwargs)

  def call(self,
           features: Mapping[str, tf.Tensor],
           boxes: tf.Tensor,
           training: bool = None):
    """Crops and resizes multilevel features into fixed-size ROI features.

    Args:
      features: A dictionary with key as pyramid level and value as features.
        The features are in shape of
        [batch_size, height_l, width_l, num_filters].
      boxes: A 3-D `tf.Tensor` of shape [batch_size, num_boxes, 4]. Each row
        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
      training: A `bool` of whether it is in training mode.

    Returns:
      A 5-D `tf.Tensor` representing feature crop of shape
      [batch_size, num_boxes, crop_size, crop_size, num_filters].
    """
    return spatial_transform_ops.multilevel_crop_and_resize(
        features,
        boxes,
        output_size=self._config_dict['crop_size'],
        sample_offset=self._config_dict['sample_offset'])

  def get_config(self):
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
official/vision/modeling/layers/roi_aligner_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_aligner.py."""
# Import libraries
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
roi_aligner
class MultilevelROIAlignerTest(tf.test.TestCase):

  def test_serialize_deserialize(self):
    """Round-trips the layer through get_config/from_config."""
    init_kwargs = {'crop_size': 7, 'sample_offset': 0.5}
    aligner = roi_aligner.MultilevelROIAligner(**init_kwargs)

    # get_config must echo back exactly the constructor arguments.
    self.assertEqual(aligner.get_config(), dict(init_kwargs))

    restored = roi_aligner.MultilevelROIAligner.from_config(
        aligner.get_config())
    self.assertAllEqual(aligner.get_config(), restored.get_config())
if __name__ == '__main__':
  # Run all test cases in this module via the TensorFlow test runner.
  tf.test.main()
official/vision/modeling/layers/roi_generator.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI generator."""
from
typing
import
Optional
,
Mapping
# Import libraries
import
tensorflow
as
tf
from
official.vision.ops
import
box_ops
from
official.vision.ops
import
nms
def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor],
                             raw_scores: Mapping[str, tf.Tensor],
                             anchor_boxes: Mapping[str, tf.Tensor],
                             image_shape: tf.Tensor,
                             pre_nms_top_k: int = 2000,
                             pre_nms_score_threshold: float = 0.0,
                             pre_nms_min_size_threshold: float = 0.0,
                             nms_iou_threshold: float = 0.7,
                             num_proposals: int = 1000,
                             use_batched_nms: bool = False,
                             decode_boxes: bool = True,
                             clip_boxes: bool = True,
                             apply_sigmoid_to_score: bool = True):
  """Proposes RoIs given a group of candidates from different FPN levels.

  The following describes the steps:
    1. For each individual level:
      a. Apply sigmoid transform if specified.
      b. Decode boxes if specified.
      c. Clip boxes if specified.
      d. Filter small boxes and those fall outside image if specified.
      e. Apply pre-NMS filtering including pre-NMS top k and score
         thresholding.
      f. Apply NMS.
    2. Aggregate post-NMS boxes from each level.
    3. Apply an overall top k to generate the final selected RoIs.

  Args:
    raw_boxes: A `dict` with keys representing FPN levels and values
      representing box tensors of shape
      [batch_size, feature_h, feature_w, num_anchors * 4].
    raw_scores: A `dict` with keys representing FPN levels and values
      representing logit tensors of shape
      [batch_size, feature_h, feature_w, num_anchors].
    anchor_boxes: A `dict` with keys representing FPN levels and values
      representing anchor box tensors of shape
      [batch_size, feature_h * feature_w * num_anchors, 4].
    image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last
      dimension are [height, width] of the scaled image.
    pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
      before applying NMS. Default: 2000.
    pre_nms_score_threshold: A `float` between 0 and 1 representing the
      minimal box score to keep before applying NMS. This is often used as a
      pre-filtering step for better performance. Default: 0, no filtering is
      applied.
    pre_nms_min_size_threshold: A `float` representing the minimal box size
      in each side (w.r.t. the scaled image) to keep before applying NMS.
      This is often used as a pre-filtering step for better performance.
      Default: 0, no filtering is applied.
    nms_iou_threshold: A `float` between 0 and 1 representing the IoU
      threshold used for NMS. If 0.0, no NMS is applied. Default: 0.7.
    num_proposals: An `int` of top scoring RPN proposals *in total* to keep
      after applying NMS. Default: 1000.
    use_batched_nms: A `bool` indicating whether NMS is applied in batch
      using `tf.image.combined_non_max_suppression`. Currently only available
      in CPU/GPU. Default is False.
    decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
      using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
      `anchor_boxes`. Default is True.
    clip_boxes: A `bool` indicating whether boxes are first clipped to the
      scaled image size before applying NMS. If False, no clipping is applied
      and `image_shape` is ignored. Default is True.
    apply_sigmoid_to_score: A `bool` indicating whether apply sigmoid to
      `raw_scores` before applying NMS. Default is True.

  Returns:
    selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
      representing the box coordinates of the selected proposals w.r.t. the
      scaled image.
    selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals,
      1], representing the scores of the selected proposals.
  """
  with tf.name_scope('multilevel_propose_rois'):
    rois = []
    roi_scores = []
    image_shape = tf.expand_dims(image_shape, axis=1)
    for level in sorted(raw_scores.keys()):
      with tf.name_scope('level_%s' % level):
        _, feature_h, feature_w, num_anchors_per_location = (
            raw_scores[level].get_shape().as_list())

        # Flatten the per-level feature map into a per-box layout.
        num_boxes = feature_h * feature_w * num_anchors_per_location
        this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
        this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
        this_level_anchors = tf.cast(
            tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
            dtype=this_level_scores.dtype)

        if apply_sigmoid_to_score:
          this_level_scores = tf.sigmoid(this_level_scores)

        if decode_boxes:
          this_level_boxes = box_ops.decode_boxes(this_level_boxes,
                                                  this_level_anchors)
        if clip_boxes:
          this_level_boxes = box_ops.clip_boxes(this_level_boxes,
                                                image_shape)

        if pre_nms_min_size_threshold > 0.0:
          this_level_boxes, this_level_scores = box_ops.filter_boxes(
              this_level_boxes, this_level_scores, image_shape,
              pre_nms_min_size_threshold)

        # Per-level top-k budgets can never exceed the number of candidates.
        this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
        this_level_post_nms_top_k = min(num_boxes, num_proposals)
        if nms_iou_threshold > 0.0:
          if use_batched_nms:
            # combined_non_max_suppression performs score thresholding,
            # top-k selection and NMS in a single fused op.
            this_level_rois, this_level_roi_scores, _, _ = (
                tf.image.combined_non_max_suppression(
                    tf.expand_dims(this_level_boxes, axis=2),
                    tf.expand_dims(this_level_scores, axis=-1),
                    max_output_size_per_class=this_level_pre_nms_top_k,
                    max_total_size=this_level_post_nms_top_k,
                    iou_threshold=nms_iou_threshold,
                    score_threshold=pre_nms_score_threshold,
                    pad_per_class=False,
                    clip_boxes=False))
          else:
            # Manual pipeline: score filter -> top-k -> sorted NMS.
            if pre_nms_score_threshold > 0.0:
              this_level_boxes, this_level_scores = (
                  box_ops.filter_boxes_by_scores(this_level_boxes,
                                                 this_level_scores,
                                                 pre_nms_score_threshold))
            this_level_boxes, this_level_scores = box_ops.top_k_boxes(
                this_level_boxes, this_level_scores,
                k=this_level_pre_nms_top_k)
            this_level_roi_scores, this_level_rois = (
                nms.sorted_non_max_suppression_padded(
                    this_level_scores,
                    this_level_boxes,
                    max_output_size=this_level_post_nms_top_k,
                    iou_threshold=nms_iou_threshold))
        else:
          # NMS disabled: simply keep the top scoring boxes.
          this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
              this_level_boxes, this_level_scores,
              k=this_level_post_nms_top_k)

        rois.append(this_level_rois)
        roi_scores.append(this_level_roi_scores)

    # Merge the survivors of all levels and take the overall top-k.
    all_rois = tf.concat(rois, axis=1)
    all_roi_scores = tf.concat(roi_scores, axis=1)

    with tf.name_scope('top_k_rois'):
      _, num_valid_rois = all_roi_scores.get_shape().as_list()
      overall_top_k = min(num_valid_rois, num_proposals)

      selected_rois, selected_roi_scores = box_ops.top_k_boxes(
          all_rois, all_roi_scores, k=overall_top_k)

  return selected_rois, selected_roi_scores
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIGenerator(tf.keras.layers.Layer):
  """Proposes RoIs for the second stage processing."""

  def __init__(self,
               pre_nms_top_k: int = 2000,
               pre_nms_score_threshold: float = 0.0,
               pre_nms_min_size_threshold: float = 0.0,
               nms_iou_threshold: float = 0.7,
               num_proposals: int = 1000,
               test_pre_nms_top_k: int = 1000,
               test_pre_nms_score_threshold: float = 0.0,
               test_pre_nms_min_size_threshold: float = 0.0,
               test_nms_iou_threshold: float = 0.7,
               test_num_proposals: int = 1000,
               use_batched_nms: bool = False,
               **kwargs):
    """Initializes a ROI generator.

    The ROI generator transforms the raw predictions from RPN to ROIs.
    Training and evaluation each have their own full set of NMS parameters;
    the evaluation variants carry a `test_` prefix.

    Args:
      pre_nms_top_k: An `int` of the number of top scores proposals to be
        kept before applying NMS.
      pre_nms_score_threshold: A `float` of the score threshold to apply
        before applying NMS. Proposals whose scores are below this threshold
        are thrown away.
      pre_nms_min_size_threshold: A `float` of the threshold of each side of
        the box (w.r.t. the scaled image). Proposals whose sides are below
        this threshold are thrown away.
      nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
      num_proposals: An `int` of the final number of proposals to generate.
      test_pre_nms_top_k: An `int` of the number of top scores proposals to
        be kept before applying NMS in testing.
      test_pre_nms_score_threshold: A `float` of the score threshold to apply
        before applying NMS in testing. Proposals whose scores are below this
        threshold are thrown away.
      test_pre_nms_min_size_threshold: A `float` of the threshold of each
        side of the box (w.r.t. the scaled image) in testing. Proposals whose
        sides are below this threshold are thrown away.
      test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
        testing.
      test_num_proposals: An `int` of the final number of proposals to
        generate in testing.
      use_batched_nms: A `bool` of whether or not use
        `tf.image.combined_non_max_suppression`.
      **kwargs: Additional keyword arguments passed to Layer.
    """
    self._config_dict = {
        'pre_nms_top_k': pre_nms_top_k,
        'pre_nms_score_threshold': pre_nms_score_threshold,
        'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
        'nms_iou_threshold': nms_iou_threshold,
        'num_proposals': num_proposals,
        'test_pre_nms_top_k': test_pre_nms_top_k,
        'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
        'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
        'test_nms_iou_threshold': test_nms_iou_threshold,
        'test_num_proposals': test_num_proposals,
        'use_batched_nms': use_batched_nms,
    }
    super().__init__(**kwargs)

  def call(self,
           raw_boxes: Mapping[str, tf.Tensor],
           raw_scores: Mapping[str, tf.Tensor],
           anchor_boxes: Mapping[str, tf.Tensor],
           image_shape: tf.Tensor,
           training: Optional[bool] = None):
    """Proposes RoIs given a group of candidates from different FPN levels.

    The following describes the steps:
      1. For each individual level:
        a. Apply sigmoid transform if specified.
        b. Decode boxes if specified.
        c. Clip boxes if specified.
        d. Filter small boxes and those fall outside image if specified.
        e. Apply pre-NMS filtering including pre-NMS top k and score
           thresholding.
        f. Apply NMS.
      2. Aggregate post-NMS boxes from each level.
      3. Apply an overall top k to generate the final selected RoIs.

    Args:
      raw_boxes: A `dict` with keys representing FPN levels and values
        representing box tensors of shape
        [batch, feature_h, feature_w, num_anchors * 4].
      raw_scores: A `dict` with keys representing FPN levels and values
        representing logit tensors of shape
        [batch, feature_h, feature_w, num_anchors].
      anchor_boxes: A `dict` with keys representing FPN levels and values
        representing anchor box tensors of shape
        [batch, feature_h * feature_w * num_anchors, 4].
      image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
        are [height, width] of the scaled image.
      training: A `bool` that indicates whether it is in training mode.

    Returns:
      roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the
        proposed ROIs in the scaled image coordinate.
      roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of
        the proposed ROIs.
    """
    cfg = self._config_dict
    # Select the training or evaluation parameter set; evaluation values are
    # stored under keys with a 'test_' prefix. A `None` training flag (e.g.
    # during export) falls through to the evaluation parameters.
    key_prefix = '' if training else 'test_'
    return _multilevel_propose_rois(
        raw_boxes,
        raw_scores,
        anchor_boxes,
        image_shape,
        pre_nms_top_k=cfg[key_prefix + 'pre_nms_top_k'],
        pre_nms_score_threshold=cfg[key_prefix + 'pre_nms_score_threshold'],
        pre_nms_min_size_threshold=(
            cfg[key_prefix + 'pre_nms_min_size_threshold']),
        nms_iou_threshold=cfg[key_prefix + 'nms_iou_threshold'],
        num_proposals=cfg[key_prefix + 'num_proposals'],
        use_batched_nms=cfg['use_batched_nms'],
        decode_boxes=True,
        clip_boxes=True,
        apply_sigmoid_to_score=True)

  def get_config(self):
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
official/vision/modeling/layers/roi_sampler.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI sampler."""
# Import libraries
import
tensorflow
as
tf
from
official.vision.modeling.layers
import
box_sampler
from
official.vision.ops
import
box_matcher
from
official.vision.ops
import
iou_similarity
from
official.vision.ops
import
target_gather
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
'Vision'
)
class
ROISampler
(
tf
.
keras
.
layers
.
Layer
):
"""Samples ROIs and assigns targets to the sampled ROIs."""
def
__init__
(
self
,
mix_gt_boxes
:
bool
=
True
,
num_sampled_rois
:
int
=
512
,
foreground_fraction
:
float
=
0.25
,
foreground_iou_threshold
:
float
=
0.5
,
background_iou_high_threshold
:
float
=
0.5
,
background_iou_low_threshold
:
float
=
0
,
skip_subsampling
:
bool
=
False
,
**
kwargs
):
"""Initializes a ROI sampler.
Args:
mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with
proposed ROIs.
num_sampled_rois: An `int` of the number of sampled ROIs per image.
foreground_fraction: A `float` in [0, 1], what percentage of proposed ROIs
should be sampled from the foreground boxes.
foreground_iou_threshold: A `float` that represents the IoU threshold for
a box to be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`]).
background_iou_low_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`])
skip_subsampling: a bool that determines if we want to skip the sampling
procedure than balances the fg/bg classes. Used for upper frcnn layers
in cascade RCNN.
**kwargs: Additional keyword arguments passed to Layer.
"""
self
.
_config_dict
=
{
'mix_gt_boxes'
:
mix_gt_boxes
,
'num_sampled_rois'
:
num_sampled_rois
,
'foreground_fraction'
:
foreground_fraction
,
'foreground_iou_threshold'
:
foreground_iou_threshold
,
'background_iou_high_threshold'
:
background_iou_high_threshold
,
'background_iou_low_threshold'
:
background_iou_low_threshold
,
'skip_subsampling'
:
skip_subsampling
,
}
self
.
_sim_calc
=
iou_similarity
.
IouSimilarity
()
self
.
_box_matcher
=
box_matcher
.
BoxMatcher
(
thresholds
=
[
background_iou_low_threshold
,
background_iou_high_threshold
,
foreground_iou_threshold
],
indicators
=
[
-
3
,
-
1
,
-
2
,
1
])
self
.
_target_gather
=
target_gather
.
TargetGather
()
self
.
_sampler
=
box_sampler
.
BoxSampler
(
num_sampled_rois
,
foreground_fraction
)
super
(
ROISampler
,
self
).
__init__
(
**
kwargs
)
def
call
(
self
,
boxes
:
tf
.
Tensor
,
gt_boxes
:
tf
.
Tensor
,
gt_classes
:
tf
.
Tensor
):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
boxes: A `tf.Tensor` of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
gt_boxes: A `tf.Tensor` of shape of [batch_size, MAX_NUM_INSTANCES, 4].
The coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES].
This tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: A `tf.Tensor` of shape of [batch_size, K, 4], representing
the coordinates of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: A `tf.Tensor` of shape of [batch_size, K, 4], storing
the box coordinates of the matched groundtruth boxes of the samples
RoIs.
sampled_gt_classes: A `tf.Tensor` of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: A `tf.Tensor` of shape of [batch_size, K], storing the
indices of the sampled groudntruth boxes in the original `gt_boxes`
tensor, i.e.,
gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
gt_boxes
=
tf
.
cast
(
gt_boxes
,
dtype
=
boxes
.
dtype
)
if
self
.
_config_dict
[
'mix_gt_boxes'
]:
boxes
=
tf
.
concat
([
boxes
,
gt_boxes
],
axis
=
1
)
boxes_invalid_mask
=
tf
.
less
(
tf
.
reduce_max
(
boxes
,
axis
=-
1
,
keepdims
=
True
),
0.0
)
gt_invalid_mask
=
tf
.
less
(
tf
.
reduce_max
(
gt_boxes
,
axis
=-
1
,
keepdims
=
True
),
0.0
)
similarity_matrix
=
self
.
_sim_calc
(
boxes
,
gt_boxes
,
boxes_invalid_mask
,
gt_invalid_mask
)
matched_gt_indices
,
match_indicators
=
self
.
_box_matcher
(
similarity_matrix
)
positive_matches
=
tf
.
greater_equal
(
match_indicators
,
0
)
negative_matches
=
tf
.
equal
(
match_indicators
,
-
1
)
ignored_matches
=
tf
.
equal
(
match_indicators
,
-
2
)
invalid_matches
=
tf
.
equal
(
match_indicators
,
-
3
)
background_mask
=
tf
.
expand_dims
(
tf
.
logical_or
(
negative_matches
,
invalid_matches
),
-
1
)
gt_classes
=
tf
.
expand_dims
(
gt_classes
,
axis
=-
1
)
matched_gt_classes
=
self
.
_target_gather
(
gt_classes
,
matched_gt_indices
,
background_mask
)
matched_gt_classes
=
tf
.
where
(
background_mask
,
tf
.
zeros_like
(
matched_gt_classes
),
matched_gt_classes
)
matched_gt_boxes
=
self
.
_target_gather
(
gt_boxes
,
matched_gt_indices
,
tf
.
tile
(
background_mask
,
[
1
,
1
,
4
]))
matched_gt_boxes
=
tf
.
where
(
background_mask
,
tf
.
zeros_like
(
matched_gt_boxes
),
matched_gt_boxes
)
matched_gt_indices
=
tf
.
where
(
tf
.
squeeze
(
background_mask
,
-
1
),
-
tf
.
ones_like
(
matched_gt_indices
),
matched_gt_indices
)
if
self
.
_config_dict
[
'skip_subsampling'
]:
return
(
boxes
,
matched_gt_boxes
,
tf
.
squeeze
(
matched_gt_classes
,
axis
=-
1
),
matched_gt_indices
)
sampled_indices
=
self
.
_sampler
(
positive_matches
,
negative_matches
,
ignored_matches
)
sampled_rois
=
self
.
_target_gather
(
boxes
,
sampled_indices
)
sampled_gt_boxes
=
self
.
_target_gather
(
matched_gt_boxes
,
sampled_indices
)
sampled_gt_classes
=
tf
.
squeeze
(
self
.
_target_gather
(
matched_gt_classes
,
sampled_indices
),
axis
=-
1
)
sampled_gt_indices
=
tf
.
squeeze
(
self
.
_target_gather
(
tf
.
expand_dims
(
matched_gt_indices
,
-
1
),
sampled_indices
),
axis
=-
1
)
return
(
sampled_rois
,
sampled_gt_boxes
,
sampled_gt_classes
,
sampled_gt_indices
)
def
get_config
(
self
):
return
self
.
_config_dict
@classmethod
def from_config(cls, config):
  """Re-creates an instance from a `get_config`-style dictionary."""
  # Expand the stored constructor arguments back into `__init__`.
  instance = cls(**config)
  return instance
official/vision/modeling/maskrcnn_model.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""R-CNN(-RS) models."""
from
typing
import
Any
,
List
,
Mapping
,
Optional
,
Tuple
,
Union
import
tensorflow
as
tf
from
official.vision.ops
import
anchor
from
official.vision.ops
import
box_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskRCNNModel(tf.keras.Model):
  """The Mask R-CNN(-RS) and Cascade RCNN-RS models."""

  def __init__(self,
               backbone: tf.keras.Model,
               decoder: tf.keras.Model,
               rpn_head: tf.keras.layers.Layer,
               detection_head: Union[tf.keras.layers.Layer,
                                     List[tf.keras.layers.Layer]],
               roi_generator: tf.keras.layers.Layer,
               roi_sampler: Union[tf.keras.layers.Layer,
                                  List[tf.keras.layers.Layer]],
               roi_aligner: tf.keras.layers.Layer,
               detection_generator: tf.keras.layers.Layer,
               mask_head: Optional[tf.keras.layers.Layer] = None,
               mask_sampler: Optional[tf.keras.layers.Layer] = None,
               mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
               class_agnostic_bbox_pred: bool = False,
               cascade_class_ensemble: bool = False,
               min_level: Optional[int] = None,
               max_level: Optional[int] = None,
               num_scales: Optional[int] = None,
               aspect_ratios: Optional[List[float]] = None,
               anchor_size: Optional[float] = None,
               **kwargs):
    """Initializes the R-CNN(-RS) model.

    Args:
      backbone: `tf.keras.Model`, the backbone network.
      decoder: `tf.keras.Model`, the decoder network.
      rpn_head: the RPN head.
      detection_head: the detection head or a list of heads.
      roi_generator: the ROI generator.
      roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
        detection heads.
      roi_aligner: the ROI aligner.
      detection_generator: the detection generator.
      mask_head: the mask head.
      mask_sampler: the mask sampler.
      mask_roi_aligner: the ROI aligner for mask prediction.
      class_agnostic_bbox_pred: if True, perform class agnostic bounding box
        prediction. Needs to be `True` for Cascade RCNN models.
      cascade_class_ensemble: if True, ensemble classification scores over all
        detection heads.
      min_level: Minimum level in output feature maps.
      max_level: Maximum level in output feature maps.
      num_scales: A number representing intermediate scales added on each level.
        For instance, num_scales=2 adds one additional intermediate anchor
        scales [2^0, 2^0.5] on each level.
      aspect_ratios: A list representing the aspect ratio anchors added on each
        level. The number indicates the ratio of width to height. For instance,
        aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
      anchor_size: A number representing the scale of size of the base anchor to
        the feature stride 2^level.
      **kwargs: keyword arguments to be passed.
    """
    super(MaskRCNNModel, self).__init__(**kwargs)
    # Full constructor arguments, kept verbatim so `get_config`/`from_config`
    # can round-trip the model.
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'rpn_head': rpn_head,
        'detection_head': detection_head,
        'roi_generator': roi_generator,
        'roi_sampler': roi_sampler,
        'roi_aligner': roi_aligner,
        'detection_generator': detection_generator,
        'mask_head': mask_head,
        'mask_sampler': mask_sampler,
        'mask_roi_aligner': mask_roi_aligner,
        'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
        'cascade_class_ensemble': cascade_class_ensemble,
        'min_level': min_level,
        'max_level': max_level,
        'num_scales': num_scales,
        'aspect_ratios': aspect_ratios,
        'anchor_size': anchor_size,
    }
    self.backbone = backbone
    self.decoder = decoder
    self.rpn_head = rpn_head
    # Normalize heads and samplers to lists so cascade and single-stage models
    # share one code path (one head/sampler per cascade stage).
    if not isinstance(detection_head, (list, tuple)):
      self.detection_head = [detection_head]
    else:
      self.detection_head = detection_head
    self.roi_generator = roi_generator
    if not isinstance(roi_sampler, (list, tuple)):
      self.roi_sampler = [roi_sampler]
    else:
      self.roi_sampler = roi_sampler
    if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
      raise ValueError(
          '`class_agnostic_bbox_pred` needs to be True if multiple detection heads are specified.'
      )
    self.roi_aligner = roi_aligner
    self.detection_generator = detection_generator
    # The mask branch is optional; providing `mask_head` turns the model from
    # Faster R-CNN into Mask R-CNN and requires the sampler/aligner below.
    self._include_mask = mask_head is not None
    self.mask_head = mask_head
    if self._include_mask and mask_sampler is None:
      raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
    self.mask_sampler = mask_sampler
    if self._include_mask and mask_roi_aligner is None:
      raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
    self.mask_roi_aligner = mask_roi_aligner
    # Weights for the regression losses for each FRCNN layer.
    # TODO(xianzhi): Make the weights configurable.
    self._cascade_layer_to_weights = [
        [10.0, 10.0, 5.0, 5.0],
        [20.0, 20.0, 10.0, 10.0],
        [30.0, 30.0, 15.0, 15.0],
    ]

  def call(self,
           images: tf.Tensor,
           image_shape: tf.Tensor,
           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
           gt_boxes: Optional[tf.Tensor] = None,
           gt_classes: Optional[tf.Tensor] = None,
           gt_masks: Optional[tf.Tensor] = None,
           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    """Runs the box branch, then (if configured) the mask branch.

    The box branch also returns the intermediate ROI/groundtruth matches that
    the mask branch consumes during training.
    """
    model_outputs, intermediate_outputs = self._call_box_outputs(
        images=images,
        image_shape=image_shape,
        anchor_boxes=anchor_boxes,
        gt_boxes=gt_boxes,
        gt_classes=gt_classes,
        training=training)
    if not self._include_mask:
      return model_outputs
    model_mask_outputs = self._call_mask_outputs(
        model_box_outputs=model_outputs,
        features=model_outputs['decoder_features'],
        current_rois=intermediate_outputs['current_rois'],
        matched_gt_indices=intermediate_outputs['matched_gt_indices'],
        matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
        matched_gt_classes=intermediate_outputs['matched_gt_classes'],
        gt_masks=gt_masks,
        training=training)
    model_outputs.update(model_mask_outputs)
    return model_outputs

  def _get_backbone_and_decoder_features(self, images):
    """Returns (backbone_features, decoder_features) for `images`.

    When no decoder is configured, the backbone features are used directly.
    """
    backbone_features = self.backbone(images)
    if self.decoder:
      features = self.decoder(backbone_features)
    else:
      features = backbone_features
    return backbone_features, features

  def _call_box_outputs(
      self,
      images: tf.Tensor,
      image_shape: tf.Tensor,
      anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
      gt_boxes: Optional[tf.Tensor] = None,
      gt_classes: Optional[tf.Tensor] = None,
      training: Optional[bool] = None
  ) -> Tuple[Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
    """Implementation of the Faster-RCNN logic for boxes."""
    model_outputs = {}

    # Feature extraction.
    (backbone_features,
     decoder_features) = self._get_backbone_and_decoder_features(images)

    # Region proposal network.
    rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

    model_outputs.update({
        'backbone_features': backbone_features,
        'decoder_features': decoder_features,
        'rpn_boxes': rpn_boxes,
        'rpn_scores': rpn_scores
    })

    # Generate anchor boxes for this batch if not provided.
    if anchor_boxes is None:
      _, image_height, image_width, _ = images.get_shape().as_list()
      anchor_boxes = anchor.Anchor(
          min_level=self._config_dict['min_level'],
          max_level=self._config_dict['max_level'],
          num_scales=self._config_dict['num_scales'],
          aspect_ratios=self._config_dict['aspect_ratios'],
          anchor_size=self._config_dict['anchor_size'],
          image_size=(image_height, image_width)).multilevel_boxes
      # Broadcast the per-image anchors over the batch dimension.
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0),
            [tf.shape(images)[0], 1, 1, 1])

    # Generate RoIs.
    current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                         image_shape, training)

    next_rois = current_rois
    all_class_outputs = []
    for cascade_num in range(len(self.roi_sampler)):
      # In cascade RCNN we want the higher layers to have different regression
      # weights as the predicted deltas become smaller and smaller.
      regression_weights = self._cascade_layer_to_weights[cascade_num]
      current_rois = next_rois

      (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
       matched_gt_classes, matched_gt_indices,
       current_rois) = self._run_frcnn_head(
           features=decoder_features,
           rois=current_rois,
           gt_boxes=gt_boxes,
           gt_classes=gt_classes,
           training=training,
           model_outputs=model_outputs,
           cascade_num=cascade_num,
           regression_weights=regression_weights)
      all_class_outputs.append(class_outputs)

      # Generate ROIs for the next cascade head if there is any.
      if cascade_num < len(self.roi_sampler) - 1:
        next_rois = box_ops.decode_boxes(
            tf.cast(box_outputs, tf.float32),
            current_rois,
            weights=regression_weights)
        next_rois = box_ops.clip_boxes(next_rois,
                                       tf.expand_dims(image_shape, axis=1))

    if not training:
      if self._config_dict['cascade_class_ensemble']:
        # Average the classification scores across all cascade heads.
        class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

      detections = self.detection_generator(
          box_outputs,
          class_outputs,
          current_rois,
          image_shape,
          regression_weights,
          bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
      model_outputs.update({
          'cls_outputs': class_outputs,
          'box_outputs': box_outputs,
      })
      if self.detection_generator.get_config()['apply_nms']:
        model_outputs.update({
            'detection_boxes': detections['detection_boxes'],
            'detection_scores': detections['detection_scores'],
            'detection_classes': detections['detection_classes'],
            'num_detections': detections['num_detections']
        })
      else:
        model_outputs.update({
            'decoded_boxes': detections['decoded_boxes'],
            'decoded_box_scores': detections['decoded_box_scores']
        })

    # Matches from the final cascade stage, consumed by the mask branch.
    # NOTE: the matched_gt_* entries are None when not training.
    intermediate_outputs = {
        'matched_gt_boxes': matched_gt_boxes,
        'matched_gt_indices': matched_gt_indices,
        'matched_gt_classes': matched_gt_classes,
        'current_rois': current_rois,
    }
    return (model_outputs, intermediate_outputs)

  def _call_mask_outputs(
      self,
      model_box_outputs: Mapping[str, tf.Tensor],
      features: tf.Tensor,
      current_rois: tf.Tensor,
      matched_gt_indices: tf.Tensor,
      matched_gt_boxes: tf.Tensor,
      matched_gt_classes: tf.Tensor,
      gt_masks: tf.Tensor,
      training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    """Implementation of Mask-RCNN mask prediction logic."""
    model_outputs = dict(model_box_outputs)
    if training:
      current_rois, roi_classes, roi_masks = self.mask_sampler(
          current_rois, matched_gt_boxes, matched_gt_classes,
          matched_gt_indices, gt_masks)
      # Mask targets are labels, not activations; block gradients through them.
      roi_masks = tf.stop_gradient(roi_masks)

      model_outputs.update({
          'mask_class_targets': roi_classes,
          'mask_targets': roi_masks,
      })
    else:
      # At inference, predict masks for the final detections instead of the
      # sampled training ROIs.
      current_rois = model_outputs['detection_boxes']
      roi_classes = model_outputs['detection_classes']

    mask_logits, mask_probs = self._features_to_mask_outputs(
        features, current_rois, roi_classes)

    if training:
      model_outputs.update({
          'mask_outputs': mask_logits,
      })
    else:
      model_outputs.update({
          'detection_masks': mask_probs,
      })
    return model_outputs

  def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training,
                      model_outputs, cascade_num, regression_weights):
    """Runs the frcnn head that does both class and box prediction.

    Args:
      features: `list` of features from the feature extractor.
      rois: `list` of current rois that will be used to predict bbox refinement
        and classes from.
      gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
        This tensor might have paddings with a negative value.
      gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
        classes. It is padded with -1s to indicate the invalid classes.
      training: `bool`, if model is training or being evaluated.
      model_outputs: `dict`, used for storing outputs used for eval and losses.
      cascade_num: `int`, the current frcnn layer in the cascade.
      regression_weights: `list`, weights used for l1 loss in bounding box
        regression.

    Returns:
      class_outputs: Class predictions for rois.
      box_outputs: Box predictions for rois. These are formatted for the
        regression loss and need to be converted before being used as rois
        in the next stage.
      model_outputs: Updated dict with predictions used for losses and eval.
      matched_gt_boxes: If `is_training` is true, then these give the gt box
        location of its positive match.
      matched_gt_classes: If `is_training` is true, then these give the gt class
        of the predicted box.
      matched_gt_indices: If `is_training` is true, then gives the index of
        the positive box match. Used for mask prediction.
      rois: The sampled rois used for this layer.
    """
    # Only used during training.
    matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
                                                                None)
    if training and gt_boxes is not None:
      # The ROI sampling/target assignment must not backpropagate into the RPN.
      rois = tf.stop_gradient(rois)

      current_roi_sampler = self.roi_sampler[cascade_num]
      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
          current_roi_sampler(rois, gt_boxes, gt_classes))
      # Create bounding box training targets.
      box_targets = box_ops.encode_boxes(
          matched_gt_boxes, rois, weights=regression_weights)
      # If the target is background, the box target is set to all 0s.
      box_targets = tf.where(
          tf.tile(
              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
              [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
      # Cascade stages after the first get a '_<stage>' suffix on their keys.
      model_outputs.update({
          'class_targets_{}'.format(cascade_num)
          if cascade_num else 'class_targets':
              matched_gt_classes,
          'box_targets_{}'.format(cascade_num)
          if cascade_num else 'box_targets':
              box_targets,
      })

    # Get roi features.
    roi_features = self.roi_aligner(features, rois)

    # Run frcnn head to get class and bbox predictions.
    current_detection_head = self.detection_head[cascade_num]
    class_outputs, box_outputs = current_detection_head(roi_features)

    model_outputs.update({
        'class_outputs_{}'.format(cascade_num)
        if cascade_num else 'class_outputs':
            class_outputs,
        'box_outputs_{}'.format(cascade_num) if cascade_num else 'box_outputs':
            box_outputs,
    })
    return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
            matched_gt_classes, matched_gt_indices, rois)

  def _features_to_mask_outputs(self, features, rois, roi_classes):
    """Returns (mask_logits, mask_probabilities) for the given `rois`."""
    # Mask RoI align.
    mask_roi_features = self.mask_roi_aligner(features, rois)

    # Mask head.
    raw_masks = self.mask_head([mask_roi_features, roi_classes])

    return raw_masks, tf.nn.sigmoid(raw_masks)

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(
        backbone=self.backbone,
        rpn_head=self.rpn_head,
        detection_head=self.detection_head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    if self._include_mask:
      items.update(mask_head=self.mask_head)
    return items

  def get_config(self) -> Mapping[str, Any]:
    """Returns the constructor configuration for serialization."""
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    """Re-creates the model from a `get_config` dictionary."""
    return cls(**config)
official/vision/modeling/maskrcnn_model_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for maskrcnn_model.py."""
import
os
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.distribute
import
strategy_combinations
from
official.vision.modeling
import
maskrcnn_model
from
official.vision.modeling.backbones
import
resnet
from
official.vision.modeling.decoders
import
fpn
from
official.vision.modeling.heads
import
dense_prediction_heads
from
official.vision.modeling.heads
import
instance_heads
from
official.vision.modeling.layers
import
detection_generator
from
official.vision.modeling.layers
import
mask_sampler
from
official.vision.modeling.layers
import
roi_aligner
from
official.vision.modeling.layers
import
roi_generator
from
official.vision.modeling.layers
import
roi_sampler
from
official.vision.ops
import
anchor
class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for building, running, serializing and checkpointing MaskRCNNModel."""

  @combinations.generate(
      combinations.combine(
          include_mask=[True, False],
          use_separable_conv=[True, False],
          build_anchor_boxes=[True, False],
          is_training=[True, False]))
  def test_build_model(self, include_mask, use_separable_conv,
                       build_anchor_boxes, is_training):
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    resnet_model_id = 50
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    images = np.random.rand(2, image_size, image_size, 3)
    image_shape = np.array([[image_size, image_size], [image_size, image_size]])
    if build_anchor_boxes:
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=3,
          image_size=(image_size, image_size)).multilevel_boxes
      # Broadcast per-image anchors over the batch of 2 images.
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
    else:
      anchor_boxes = None

    backbone = resnet.ResNet(model_id=resnet_model_id)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        use_separable_conv=use_separable_conv)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=min_level,
        max_level=max_level,
        num_anchors_per_location=num_anchors_per_location,
        num_convs=1)
    detection_head = instance_heads.DetectionHead(num_classes=num_classes)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    if include_mask:
      mask_head = instance_heads.MaskHead(
          num_classes=num_classes, upsample_factor=2)
      mask_sampler_obj = mask_sampler.MaskSampler(
          mask_target_size=28, num_sampled_masks=1)
      mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
    else:
      mask_head = None
      mask_sampler_obj = None
      mask_roi_aligner_obj = None
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    # Groundtruth boxes/classes are padded with -1 to mark invalid entries.
    gt_boxes = np.array(
        [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
         [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
        dtype=np.float32)
    gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
    if include_mask:
      gt_masks = np.ones((2, 3, 100, 100))
    else:
      gt_masks = None

    # Results will be checked in test_forward.
    _ = model(
        images,
        image_shape,
        anchor_boxes,
        gt_boxes,
        gt_classes,
        gt_masks,
        training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          include_mask=[True, False],
          build_anchor_boxes=[True, False],
          use_cascade_heads=[True, False],
          training=[True, False],
      ))
  def test_forward(self, strategy, include_mask, build_anchor_boxes, training,
                   use_cascade_heads):
    num_classes = 3
    min_level = 3
    max_level = 4
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    if use_cascade_heads:
      cascade_iou_thresholds = [0.6]
      # Cascade models require class-agnostic box prediction.
      class_agnostic_bbox_pred = True
      cascade_class_ensemble = True
    else:
      cascade_iou_thresholds = None
      class_agnostic_bbox_pred = False
      cascade_class_ensemble = False

    image_size = (256, 256)
    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array([[224, 100], [100, 224]])
    with strategy.scope():
      if build_anchor_boxes:
        anchor_boxes = anchor.Anchor(
            min_level=min_level,
            max_level=max_level,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size,
            image_size=image_size).multilevel_boxes
      else:
        anchor_boxes = None
      num_anchors_per_location = len(aspect_ratios) * num_scales

      input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
      backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
      decoder = fpn.FPN(
          min_level=min_level,
          max_level=max_level,
          input_specs=backbone.output_specs)
      rpn_head = dense_prediction_heads.RPNHead(
          min_level=min_level,
          max_level=max_level,
          num_anchors_per_location=num_anchors_per_location)
      detection_head = instance_heads.DetectionHead(
          num_classes=num_classes,
          class_agnostic_bbox_pred=class_agnostic_bbox_pred)
      roi_generator_obj = roi_generator.MultilevelROIGenerator()

      # Build one ROI sampler per cascade stage: the base sampler plus one
      # subsampling-free sampler per cascade IoU threshold.
      roi_sampler_cascade = []
      roi_sampler_obj = roi_sampler.ROISampler()
      roi_sampler_cascade.append(roi_sampler_obj)
      if cascade_iou_thresholds:
        for iou in cascade_iou_thresholds:
          roi_sampler_obj = roi_sampler.ROISampler(
              mix_gt_boxes=False,
              foreground_iou_threshold=iou,
              background_iou_high_threshold=iou,
              background_iou_low_threshold=0.0,
              skip_subsampling=True)
          roi_sampler_cascade.append(roi_sampler_obj)
      roi_aligner_obj = roi_aligner.MultilevelROIAligner()
      detection_generator_obj = detection_generator.DetectionGenerator()
      if include_mask:
        mask_head = instance_heads.MaskHead(
            num_classes=num_classes, upsample_factor=2)
        mask_sampler_obj = mask_sampler.MaskSampler(
            mask_target_size=28, num_sampled_masks=1)
        mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
      else:
        mask_head = None
        mask_sampler_obj = None
        mask_roi_aligner_obj = None
      # Pass the full cascade list (not just the last sampler) so that
      # `use_cascade_heads=True` actually exercises multiple cascade stages.
      model = maskrcnn_model.MaskRCNNModel(
          backbone,
          decoder,
          rpn_head,
          detection_head,
          roi_generator_obj,
          roi_sampler_cascade,
          roi_aligner_obj,
          detection_generator_obj,
          mask_head,
          mask_sampler_obj,
          mask_roi_aligner_obj,
          class_agnostic_bbox_pred=class_agnostic_bbox_pred,
          cascade_class_ensemble=cascade_class_ensemble,
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size)

      # Groundtruth padded with -1 to mark invalid entries.
      gt_boxes = np.array(
          [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
           [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
          dtype=np.float32)
      gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
      if include_mask:
        gt_masks = np.ones((2, 3, 100, 100))
      else:
        gt_masks = None

      results = model(
          images,
          image_shape,
          anchor_boxes,
          gt_boxes,
          gt_classes,
          gt_masks,
          training=training)

      self.assertIn('rpn_boxes', results)
      self.assertIn('rpn_scores', results)
      if training:
        self.assertIn('class_targets', results)
        self.assertIn('box_targets', results)
        self.assertIn('class_outputs', results)
        self.assertIn('box_outputs', results)
        if include_mask:
          self.assertIn('mask_outputs', results)
      else:
        self.assertIn('detection_boxes', results)
        self.assertIn('detection_scores', results)
        self.assertIn('detection_classes', results)
        self.assertIn('num_detections', results)
        if include_mask:
          self.assertIn('detection_masks', results)

  @parameterized.parameters(
      (False,),
      (True,),
  )
  def test_serialize_deserialize(self, include_mask):
    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=3, max_level=7, input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=3, max_level=7, num_anchors_per_location=3)
    detection_head = instance_heads.DetectionHead(num_classes=2)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    if include_mask:
      mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
      mask_sampler_obj = mask_sampler.MaskSampler(
          mask_target_size=28, num_sampled_masks=1)
      mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
    else:
      mask_head = None
      mask_sampler_obj = None
      mask_roi_aligner_obj = None
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        min_level=3,
        max_level=7,
        num_scales=3,
        aspect_ratios=[1.0],
        anchor_size=3)

    config = model.get_config()
    new_model = maskrcnn_model.MaskRCNNModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())

  @parameterized.parameters(
      (False,),
      (True,),
  )
  def test_checkpoint(self, include_mask):
    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=3, max_level=7, input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=3, max_level=7, num_anchors_per_location=3)
    detection_head = instance_heads.DetectionHead(num_classes=2)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    if include_mask:
      mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
      mask_sampler_obj = mask_sampler.MaskSampler(
          mask_target_size=28, num_sampled_masks=1)
      mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
    else:
      mask_head = None
      mask_sampler_obj = None
      mask_roi_aligner_obj = None
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        min_level=3,
        max_level=7,
        num_scales=3,
        aspect_ratios=[1.0],
        anchor_size=3)
    expect_checkpoint_items = dict(
        backbone=backbone,
        decoder=decoder,
        rpn_head=rpn_head,
        detection_head=[detection_head])
    if include_mask:
      expect_checkpoint_items['mask_head'] = mask_head
    self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)

    # Test save and load checkpoints.
    ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
    save_dir = self.create_tempdir().full_path
    ckpt.save(os.path.join(save_dir, 'ckpt'))

    # Restoring only a subset of the checkpointed objects should succeed.
    partial_ckpt = tf.train.Checkpoint(backbone=backbone)
    partial_ckpt.read(tf.train.latest_checkpoint(
        save_dir)).expect_partial().assert_existing_objects_matched()

    if include_mask:
      partial_ckpt_mask = tf.train.Checkpoint(
          backbone=backbone, mask_head=mask_head)
      partial_ckpt_mask.restore(tf.train.latest_checkpoint(
          save_dir)).expect_partial().assert_existing_objects_matched()
if __name__ == '__main__':
  # Discover and run all test cases when executed as a script.
  tf.test.main()
official/vision/modeling/retinanet_model.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RetinaNet."""
from
typing
import
Any
,
Mapping
,
List
,
Optional
,
Union
# Import libraries
import
tensorflow
as
tf
from
official.vision.ops
import
anchor
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
  """The RetinaNet model class."""

  def __init__(self,
               backbone: tf.keras.Model,
               decoder: tf.keras.Model,
               head: tf.keras.layers.Layer,
               detection_generator: tf.keras.layers.Layer,
               min_level: Optional[int] = None,
               max_level: Optional[int] = None,
               num_scales: Optional[int] = None,
               aspect_ratios: Optional[List[float]] = None,
               anchor_size: Optional[float] = None,
               **kwargs):
    """Classification initialization function.

    Args:
      backbone: `tf.keras.Model` a backbone network.
      decoder: `tf.keras.Model` a decoder network.
      head: `RetinaNetHead`, the RetinaNet head.
      detection_generator: the detection generator.
      min_level: Minimum level in output feature maps.
      max_level: Maximum level in output feature maps.
      num_scales: A number representing intermediate scales added
        on each level. For instances, num_scales=2 adds one additional
        intermediate anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: A list representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: A number representing the scale of size of the base
        anchor to the feature stride 2^level.
      **kwargs: keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    # Constructor arguments are retained verbatim so that
    # get_config()/from_config() can round-trip the model for serialization.
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'head': head,
        'detection_generator': detection_generator,
        'min_level': min_level,
        'max_level': max_level,
        'num_scales': num_scales,
        'aspect_ratios': aspect_ratios,
        'anchor_size': anchor_size,
    }
    self._backbone = backbone
    self._decoder = decoder
    self._head = head
    self._detection_generator = detection_generator

  def call(self,
           images: tf.Tensor,
           image_shape: Optional[tf.Tensor] = None,
           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
           output_intermediate_features: bool = False,
           training: bool = None) -> Mapping[str, tf.Tensor]:
    """Forward pass of the RetinaNet model.

    Args:
      images: `Tensor`, the input batched images, whose shape is
        [batch, height, width, 3].
      image_shape: `Tensor`, the actual shape of the input images, whose shape
        is [batch, 2] where the last dimension is [height, width]. Note that
        this is the actual image shape excluding paddings. For example, images
        in the batch may be resized into different shapes before padding to the
        fixed size.
      anchor_boxes: a dict of tensors which includes multilevel anchors.
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the anchor coordinates of a particular feature
          level, whose shape is [height_l, width_l, num_anchors_per_location].
      output_intermediate_features: `bool` indicating whether to return the
        intermediate feature maps generated by backbone and decoder.
      training: `bool`, indicating whether it is in training mode.

    Returns:
      scores: a dict of tensors which includes scores of the predictions.
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the box scores predicted from a particular feature
          level, whose shape is
          [batch, height_l, width_l, num_classes * num_anchors_per_location].
      boxes: a dict of tensors which includes coordinates of the predictions.
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the box coordinates predicted from a particular
          feature level, whose shape is
          [batch, height_l, width_l, 4 * num_anchors_per_location].
      attributes: a dict of (attribute_name, attribute_predictions). Each
        attribute prediction is a dict that includes:
        - key: `str`, the level of the multilevel predictions.
        - values: `Tensor`, the attribute predictions from a particular
          feature level, whose shape is
          [batch, height_l, width_l, att_size * num_anchors_per_location].
    """
    outputs = {}
    # Feature extraction.
    features = self.backbone(images)
    if output_intermediate_features:
      outputs.update(
          {f'backbone_{k}': v for k, v in features.items()})
    if self.decoder:
      features = self.decoder(features)
    if output_intermediate_features:
      outputs.update(
          {f'decoder_{k}': v for k, v in features.items()})

    # Dense prediction. `raw_attributes` can be empty.
    raw_scores, raw_boxes, raw_attributes = self.head(features)

    if training:
      # In training mode only the raw head outputs are needed; decoding and
      # NMS are skipped entirely.
      outputs.update({
          'cls_outputs': raw_scores,
          'box_outputs': raw_boxes,
      })
      if raw_attributes:
        outputs.update({'attribute_outputs': raw_attributes})
      return outputs
    else:
      # Generate anchor boxes for this batch if not provided.
      if anchor_boxes is None:
        _, image_height, image_width, _ = images.get_shape().as_list()
        anchor_boxes = anchor.Anchor(
            min_level=self._config_dict['min_level'],
            max_level=self._config_dict['max_level'],
            num_scales=self._config_dict['num_scales'],
            aspect_ratios=self._config_dict['aspect_ratios'],
            anchor_size=self._config_dict['anchor_size'],
            image_size=(image_height, image_width)).multilevel_boxes
        # Broadcast the per-level anchors across the batch dimension.
        for l in anchor_boxes:
          anchor_boxes[l] = tf.tile(
              tf.expand_dims(anchor_boxes[l], axis=0),
              [tf.shape(images)[0], 1, 1, 1])

      # Post-processing.
      final_results = self.detection_generator(raw_boxes, raw_scores,
                                               anchor_boxes, image_shape,
                                               raw_attributes)
      outputs.update({
          'cls_outputs': raw_scores,
          'box_outputs': raw_boxes,
      })
      # Depending on the generator configuration, either fully post-processed
      # detections or only decoded (pre-NMS) boxes are returned.
      if self.detection_generator.get_config()['apply_nms']:
        outputs.update({
            'detection_boxes': final_results['detection_boxes'],
            'detection_scores': final_results['detection_scores'],
            'detection_classes': final_results['detection_classes'],
            'num_detections': final_results['num_detections']
        })
      else:
        outputs.update({
            'decoded_boxes': final_results['decoded_boxes'],
            'decoded_box_scores': final_results['decoded_box_scores']
        })
      if raw_attributes:
        outputs.update({
            'attribute_outputs': raw_attributes,
            'detection_attributes': final_results['detection_attributes'],
        })
      return outputs

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(backbone=self.backbone, head=self.head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    return items

  @property
  def backbone(self) -> tf.keras.Model:
    return self._backbone

  @property
  def decoder(self) -> tf.keras.Model:
    return self._decoder

  @property
  def head(self) -> tf.keras.layers.Layer:
    return self._head

  @property
  def detection_generator(self) -> tf.keras.layers.Layer:
    return self._detection_generator

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    return cls(**config)
official/vision/modeling/retinanet_model_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for RetinaNet models."""
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.distribute
import
strategy_combinations
from
official.vision.modeling
import
retinanet_model
from
official.vision.modeling.backbones
import
resnet
from
official.vision.modeling.decoders
import
fpn
from
official.vision.modeling.heads
import
dense_prediction_heads
from
official.vision.modeling.layers
import
detection_generator
from
official.vision.ops
import
anchor
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
  # Tests for building and running RetinaNetModel with a ResNet-50 backbone
  # and an FPN decoder.

  @parameterized.parameters(
      {'use_separable_conv': True, 'build_anchor_boxes': True,
       'is_training': False, 'has_att_heads': False},
      {'use_separable_conv': False, 'build_anchor_boxes': True,
       'is_training': False, 'has_att_heads': False},
      {'use_separable_conv': False, 'build_anchor_boxes': False,
       'is_training': False, 'has_att_heads': False},
      {'use_separable_conv': False, 'build_anchor_boxes': False,
       'is_training': True, 'has_att_heads': False},
      {'use_separable_conv': False, 'build_anchor_boxes': True,
       'is_training': True, 'has_att_heads': True},
      {'use_separable_conv': False, 'build_anchor_boxes': True,
       'is_training': False, 'has_att_heads': True},
  )
  def test_build_model(self, use_separable_conv, build_anchor_boxes,
                       is_training, has_att_heads):
    # Smoke test: the model builds and a forward pass runs without errors for
    # each combination of convolution type, anchor source, mode and heads.
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    fpn_num_filters = 256
    head_num_convs = 4
    head_num_filters = 256
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    images = np.random.rand(2, image_size, image_size, 3)
    image_shape = np.array([[image_size, image_size], [image_size, image_size]])

    if build_anchor_boxes:
      # Pre-build multilevel anchors and broadcast them to batch size 2;
      # otherwise pass None so the model generates anchors internally.
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=(image_size, image_size)).multilevel_boxes
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
    else:
      anchor_boxes = None

    if has_att_heads:
      attribute_heads = [dict(name='depth', type='regression', size=1)]
    else:
      attribute_heads = None

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        num_filters=fpn_num_filters,
        use_separable_conv=use_separable_conv)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        attribute_heads=attribute_heads,
        num_anchors_per_location=num_anchors_per_location,
        use_separable_conv=use_separable_conv,
        num_convs=head_num_convs,
        num_filters=head_num_filters)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    # Output is discarded; the test only checks that the call succeeds.
    _ = model(images, image_shape, anchor_boxes, training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          image_size=[
              (128, 128),
          ],
          training=[True, False],
          has_att_heads=[True, False],
          output_intermediate_features=[True, False],
          soft_nms_sigma=[None, 0.0, 0.1],
      ))
  def test_forward(self, strategy, image_size, training, has_att_heads,
                   output_intermediate_features, soft_nms_sigma):
    """Test for creation of a R50-FPN RetinaNet."""
    tf.keras.backend.set_image_data_format('channels_last')
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)
    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array(
        [[image_size[0], image_size[1]], [image_size[0], image_size[1]]])
    # Build and run the model under the distribution strategy so variables
    # are created in the strategy's scope.
    with strategy.scope():
      anchor_gen = anchor.build_anchor_generator(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=3)
      anchor_boxes = anchor_gen(image_size)
      # Broadcast per-level anchors across the batch of 2 images.
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
      backbone = resnet.ResNet(model_id=50)
      decoder = fpn.FPN(
          input_specs=backbone.output_specs,
          min_level=min_level,
          max_level=max_level)
      if has_att_heads:
        attribute_heads = [dict(name='depth', type='regression', size=1)]
      else:
        attribute_heads = None
      head = dense_prediction_heads.RetinaNetHead(
          min_level=min_level,
          max_level=max_level,
          num_classes=num_classes,
          attribute_heads=attribute_heads,
          num_anchors_per_location=num_anchors_per_location)
      # use_cpu_nms is enabled whenever soft-NMS is requested, since soft-NMS
      # is only supported by the CPU NMS path.
      generator = detection_generator.MultilevelDetectionGenerator(
          max_num_detections=10,
          nms_version='v1',
          use_cpu_nms=soft_nms_sigma is not None,
          soft_nms_sigma=soft_nms_sigma)
      model = retinanet_model.RetinaNetModel(
          backbone=backbone,
          decoder=decoder,
          head=head,
          detection_generator=generator)

      model_outputs = model(
          images,
          image_shape,
          anchor_boxes,
          output_intermediate_features=output_intermediate_features,
          training=training)

    if training:
      # Training mode: raw multilevel class/box outputs with per-level shapes
      # [2, H/2^level, W/2^level, channels].
      cls_outputs = model_outputs['cls_outputs']
      box_outputs = model_outputs['box_outputs']
      for level in range(min_level, max_level + 1):
        self.assertIn(str(level), cls_outputs)
        self.assertIn(str(level), box_outputs)
        self.assertAllEqual([
            2,
            image_size[0] // 2**level,
            image_size[1] // 2**level,
            num_classes * num_anchors_per_location
        ], cls_outputs[str(level)].numpy().shape)
        self.assertAllEqual([
            2,
            image_size[0] // 2**level,
            image_size[1] // 2**level,
            4 * num_anchors_per_location
        ], box_outputs[str(level)].numpy().shape)
        if has_att_heads:
          att_outputs = model_outputs['attribute_outputs']
          for att in att_outputs.values():
            self.assertAllEqual([
                2,
                image_size[0] // 2**level,
                image_size[1] // 2**level,
                1 * num_anchors_per_location
            ], att[str(level)].numpy().shape)
    else:
      # Inference mode: post-NMS detections capped at max_num_detections=10.
      self.assertIn('detection_boxes', model_outputs)
      self.assertIn('detection_scores', model_outputs)
      self.assertIn('detection_classes', model_outputs)
      self.assertIn('num_detections', model_outputs)
      self.assertAllEqual(
          [2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
      self.assertAllEqual(
          [2, 10], model_outputs['detection_scores'].numpy().shape)
      self.assertAllEqual(
          [2, 10], model_outputs['detection_classes'].numpy().shape)
      self.assertAllEqual(
          [2,], model_outputs['num_detections'].numpy().shape)
      if has_att_heads:
        self.assertIn('detection_attributes', model_outputs)
        self.assertAllEqual(
            [2, 10, 1],
            model_outputs['detection_attributes']['depth'].numpy().shape)
    if output_intermediate_features:
      # Backbone features are checked at levels 2..5; decoder features at
      # min_level..max_level.
      for l in range(2, 6):
        self.assertIn('backbone_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2,
            image_size[0] // 2**l,
            image_size[1] // 2**l,
            backbone.output_specs[str(l)].as_list()[-1]
        ], model_outputs['backbone_{}'.format(l)].numpy().shape)
      for l in range(min_level, max_level + 1):
        self.assertIn('decoder_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2,
            image_size[0] // 2**l,
            image_size[1] // 2**l,
            decoder.output_specs[str(l)].as_list()[-1]
        ], model_outputs['decoder_{}'.format(l)].numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the network can be serialized and deserialized."""
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        num_anchors_per_location=num_anchors_per_location)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3)

    config = model.get_config()
    new_model = retinanet_model.RetinaNetModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())
# Standard test-runner entry point: discovers and runs all tf.test.TestCase
# tests defined in this module.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/segmentation_model.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build segmentation models."""
from
typing
import
Any
,
Mapping
,
Union
,
Optional
,
Dict
# Import libraries
import
tensorflow
as
tf
layers
=
tf
.
keras
.
layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationModel(tf.keras.Model):
  """A Segmentation class model.

  Input images are passed through backbone first. Decoder network is then
  applied, and finally, segmentation head is applied on the output of the
  decoder network. Layers such as ASPP should be part of decoder. Any feature
  fusion is done as part of the segmentation head (i.e. deeplabv3+ feature
  fusion is not part of the decoder, instead it is part of the segmentation
  head). This way, different feature fusion techniques can be combined with
  different backbones, and decoders.
  """

  def __init__(self,
               backbone: tf.keras.Model,
               decoder: tf.keras.Model,
               head: tf.keras.layers.Layer,
               mask_scoring_head: Optional[tf.keras.layers.Layer] = None,
               **kwargs):
    """Segmentation initialization function.

    Args:
      backbone: a backbone network.
      decoder: a decoder network. E.g. FPN.
      head: segmentation head.
      mask_scoring_head: mask scoring head.
      **kwargs: keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    # Constructor arguments are retained verbatim so that
    # get_config()/from_config() can round-trip the model for serialization.
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'head': head,
        'mask_scoring_head': mask_scoring_head,
    }
    self.backbone = backbone
    self.decoder = decoder
    self.head = head
    self.mask_scoring_head = mask_scoring_head

  def call(self, inputs: tf.Tensor, training: bool = None  # pytype: disable=annotation-type-mismatch
           ) -> Dict[str, tf.Tensor]:
    """Forward pass of the segmentation model.

    Args:
      inputs: the input batched images.
      training: `bool`, indicating whether it is in training mode.

    Returns:
      A dict with key 'logits' holding the segmentation head output; when a
      mask scoring head is configured, an additional 'mask_scores' entry
      holds its output on the logits.
    """
    backbone_features = self.backbone(inputs)

    if self.decoder:
      decoder_features = self.decoder(backbone_features)
    else:
      # No decoder configured: feed the backbone features straight through.
      decoder_features = backbone_features

    # The head receives both backbone and decoder features so that feature
    # fusion (e.g. deeplabv3+) can happen inside the head.
    logits = self.head((backbone_features, decoder_features))
    outputs = {'logits': logits}

    if self.mask_scoring_head:
      mask_scores = self.mask_scoring_head(logits)
      outputs.update({'mask_scores': mask_scores})

    return outputs

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(backbone=self.backbone, head=self.head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    if self.mask_scoring_head is not None:
      items.update(mask_scoring_head=self.mask_scoring_head)
    return items

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
official/vision/modeling/segmentation_model_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation network."""
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling
import
backbones
from
official.vision.modeling
import
segmentation_model
from
official.vision.modeling.decoders
import
fpn
from
official.vision.modeling.heads
import
segmentation_heads
class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase):
  # Tests for SegmentationModel built from a ResNet-50 backbone, an FPN
  # decoder, and a segmentation head.

  @parameterized.parameters(
      (128, 2),
      (128, 3),
      (128, 4),
      (256, 2),
      (256, 3),
      (256, 4),
  )
  def test_segmentation_network_creation(self, input_size, level):
    """Test for creation of a segmentation network."""
    num_classes = 10
    inputs = np.random.rand(2, input_size, input_size, 3)
    tf.keras.backend.set_image_data_format('channels_last')
    backbone = backbones.ResNet(model_id=50)

    decoder = fpn.FPN(
        input_specs=backbone.output_specs, min_level=2, max_level=7)
    head = segmentation_heads.SegmentationHead(num_classes, level=level)

    model = segmentation_model.SegmentationModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        mask_scoring_head=None,
    )

    outputs = model(inputs)
    # The head emits logits at the requested pyramid level, so the spatial
    # size is the input size divided by 2^level.
    self.assertAllEqual(
        [2, input_size // (2**level), input_size // (2**level), num_classes],
        outputs['logits'].numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the network can be serialized and deserialized."""
    num_classes = 3
    backbone = backbones.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs, min_level=3, max_level=7)
    head = segmentation_heads.SegmentationHead(num_classes, level=3)
    model = segmentation_model.SegmentationModel(
        backbone=backbone, decoder=decoder, head=head)

    config = model.get_config()
    new_model = segmentation_model.SegmentationModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())
# Standard test-runner entry point: discovers and runs all tf.test.TestCase
# tests defined in this module.
if __name__ == '__main__':
  tf.test.main()
official/vision/modeling/video_classification_model.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build video classification models."""
from
typing
import
Any
,
Mapping
,
Optional
,
Union
,
List
,
Text
import
tensorflow
as
tf
layers
=
tf
.
keras
.
layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
  """A video classification class builder."""

  def __init__(
      self,
      backbone: tf.keras.Model,
      num_classes: int,
      input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
      dropout_rate: float = 0.0,
      aggregate_endpoints: bool = False,
      kernel_initializer: str = 'random_uniform',
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      require_endpoints: Optional[List[Text]] = None,
      **kwargs):
    """Video Classification initialization function.

    Args:
      backbone: a 3d backbone network.
      num_classes: `int` number of classes in classification task.
      input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
      dropout_rate: `float` rate for dropout regularization.
      aggregate_endpoints: `bool` aggregate all end ponits or only use the
        final end point.
      kernel_initializer: kernel initializer for the dense layer.
      kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to
        None.
      bias_regularizer: tf.keras.regularizers.Regularizer object. Default to
        None.
      require_endpoints: the required endpoints for prediction. If None or
        empty, then only uses the final endpoint.
      **kwargs: keyword arguments to be passed.
    """
    if not input_specs:
      # Default to a 5-D video input [batch, time, height, width, 3] with
      # unknown batch/temporal/spatial sizes.
      input_specs = {
          'image': layers.InputSpec(shape=[None, None, None, None, 3])
      }
    # Attribute tracking is disabled because member attributes are assigned
    # before the functional-API `super().__init__` call at the end of this
    # constructor.
    self._self_setattr_tracking = False
    # Constructor arguments are retained verbatim so that
    # get_config()/from_config() can round-trip the model for serialization.
    self._config_dict = {
        'backbone': backbone,
        'num_classes': num_classes,
        'input_specs': input_specs,
        'dropout_rate': dropout_rate,
        'aggregate_endpoints': aggregate_endpoints,
        'kernel_initializer': kernel_initializer,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
        'require_endpoints': require_endpoints,
    }
    self._input_specs = input_specs
    self._kernel_regularizer = kernel_regularizer
    self._bias_regularizer = bias_regularizer
    self._backbone = backbone

    # Build the functional graph: symbolic inputs -> backbone -> pooling ->
    # dropout -> dense classifier.
    inputs = {
        k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items()
    }
    endpoints = backbone(inputs['image'])

    if aggregate_endpoints:
      # Pool every backbone endpoint and concatenate them all.
      pooled_feats = []
      for endpoint in endpoints.values():
        x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint)
        pooled_feats.append(x_pool)
      x = tf.concat(pooled_feats, axis=1)
    else:
      if not require_endpoints:
        # Uses the last endpoint for prediction.
        x = endpoints[max(endpoints.keys())]
        x = tf.keras.layers.GlobalAveragePooling3D()(x)
      else:
        # Concats all the required endpoints for prediction.
        outputs = []
        for name in require_endpoints:
          x = endpoints[name]
          x = tf.keras.layers.GlobalAveragePooling3D()(x)
          outputs.append(x)
        x = tf.concat(outputs, axis=1)

    x = tf.keras.layers.Dropout(dropout_rate)(x)
    x = tf.keras.layers.Dense(
        num_classes, kernel_initializer=kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)(x)

    super().__init__(inputs=inputs, outputs=x, **kwargs)

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    return dict(backbone=self.backbone)

  @property
  def backbone(self) -> tf.keras.Model:
    return self._backbone

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
official/vision/modeling/video_classification_model_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video classification network."""
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.modeling
import
backbones
from
official.vision.modeling
import
video_classification_model
class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for `VideoClassificationModel` with a ResNet3D backbone."""

  @parameterized.parameters(
      (50, 8, 112, 'relu', False),
      (50, 8, 112, 'swish', True),
  )
  def test_resnet3d_network_creation(self, model_id, temporal_size,
                                     spatial_size, activation,
                                     aggregate_endpoints):
    """Test for creation of a ResNet3D-50 classifier."""
    # Input is a 5D video clip: [batch, time, height, width, channels].
    input_specs = tf.keras.layers.InputSpec(
        shape=[None, temporal_size, spatial_size, spatial_size, 3])
    temporal_strides = [1, 1, 1, 1]
    temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                             (1, 3, 1)]

    tf.keras.backend.set_image_data_format('channels_last')

    backbone = backbones.ResNet3D(
        model_id=model_id,
        temporal_strides=temporal_strides,
        temporal_kernel_sizes=temporal_kernel_sizes,
        input_specs=input_specs,
        activation=activation)

    num_classes = 1000
    model = video_classification_model.VideoClassificationModel(
        backbone=backbone,
        num_classes=num_classes,
        input_specs={'image': input_specs},
        dropout_rate=0.2,
        aggregate_endpoints=aggregate_endpoints,
    )

    # A batch of 2 random clips should produce [2, num_classes] logits.
    inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
    logits = model(inputs)
    self.assertAllEqual([2, num_classes], logits.numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the classification network can be serialized and deserialized."""
    model_id = 50
    temporal_strides = [1, 1, 1, 1]
    temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                             (1, 3, 1)]

    backbone = backbones.ResNet3D(
        model_id=model_id,
        temporal_strides=temporal_strides,
        temporal_kernel_sizes=temporal_kernel_sizes)

    model = video_classification_model.VideoClassificationModel(
        backbone=backbone, num_classes=1000)

    config = model.get_config()
    new_model = video_classification_model.VideoClassificationModel.from_config(
        config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())
# Standard TF test entry point.
if __name__ == '__main__':
  tf.test.main()
official/vision/ops/__init__.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/ops/anchor.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Anchor box and labeler definition."""
import
collections
# Import libraries
import
tensorflow
as
tf
from
official.vision.ops
import
anchor_generator
from
official.vision.ops
import
box_matcher
from
official.vision.ops
import
iou_similarity
from
official.vision.ops
import
target_gather
from
official.vision.utils.object_detection
import
balanced_positive_negative_sampler
from
official.vision.utils.object_detection
import
box_list
from
official.vision.utils.object_detection
import
faster_rcnn_box_coder
class Anchor(object):
  """Anchor class for anchor-based object detectors."""

  def __init__(self, min_level, max_level, num_scales, aspect_ratios,
               anchor_size, image_size):
    """Constructs multiscale anchors.

    Args:
      min_level: integer number of minimum level of the output feature pyramid.
      max_level: integer number of maximum level of the output feature pyramid.
      num_scales: integer number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: list of float numbers representing the aspect ratio of
        anchors added on each level. Each number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: float number representing the scale of size of the base
        anchor to the feature stride 2^level.
      image_size: a list of integer numbers or Tensors representing [height,
        width] of the input image size. The image_size should be divisible by
        the largest feature stride 2^max_level.
    """
    self.min_level = min_level
    self.max_level = max_level
    self.num_scales = num_scales
    self.aspect_ratios = aspect_ratios
    self.anchor_size = anchor_size
    self.image_size = image_size
    # Eagerly materialize the flattened [N, 4] anchor boxes for all levels.
    self.boxes = self._generate_boxes()

  def _generate_boxes(self):
    """Generates multiscale anchor boxes.

    Returns:
      a Tensor of shape [N, 4], representing anchor boxes of all levels
      concatenated together.
    """
    boxes_all = []
    for level in range(self.min_level, self.max_level + 1):
      boxes_l = []
      for scale in range(self.num_scales):
        for aspect_ratio in self.aspect_ratios:
          stride = 2**level
          # Intermediate octave scale within the level: 2^(scale/num_scales).
          intermidate_scale = 2**(scale / float(self.num_scales))
          base_anchor_size = self.anchor_size * stride * intermidate_scale
          # sqrt(ar) widens x and narrows y, preserving the anchor area.
          aspect_x = aspect_ratio**0.5
          aspect_y = aspect_ratio**-0.5
          half_anchor_size_x = base_anchor_size * aspect_x / 2.0
          half_anchor_size_y = base_anchor_size * aspect_y / 2.0
          # Anchor centers on a stride-spaced grid, offset by stride/2 so each
          # center sits in the middle of its feature cell.
          x = tf.range(stride / 2, self.image_size[1], stride)
          y = tf.range(stride / 2, self.image_size[0], stride)
          xv, yv = tf.meshgrid(x, y)
          xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
          yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
          # Tensor shape Nx4 in [y0, x0, y1, x1] order.
          boxes = tf.stack([
              yv - half_anchor_size_y, xv - half_anchor_size_x,
              yv + half_anchor_size_y, xv + half_anchor_size_x
          ], axis=1)
          boxes_l.append(boxes)
      # Concat anchors on the same level to tensor shape NxAx4.
      boxes_l = tf.stack(boxes_l, axis=1)
      boxes_l = tf.reshape(boxes_l, [-1, 4])
      boxes_all.append(boxes_l)
    return tf.concat(boxes_all, axis=0)

  def unpack_labels(self, labels):
    """Unpacks an array of labels into multiscales labels."""
    unpacked_labels = collections.OrderedDict()
    count = 0
    for level in range(self.min_level, self.max_level + 1):
      # Feature map size at this level is the image size over the stride.
      feat_size_y = tf.cast(self.image_size[0] / 2**level, tf.int32)
      feat_size_x = tf.cast(self.image_size[1] / 2**level, tf.int32)
      steps = feat_size_y * feat_size_x * self.anchors_per_location
      unpacked_labels[str(level)] = tf.reshape(
          labels[count:count + steps], [feat_size_y, feat_size_x, -1])
      count += steps
    return unpacked_labels

  @property
  def anchors_per_location(self):
    """Number of anchors placed at every feature-map position."""
    return self.num_scales * len(self.aspect_ratios)

  @property
  def multilevel_boxes(self):
    """Per-level view of `self.boxes`, keyed by level number as string."""
    return self.unpack_labels(self.boxes)
class AnchorLabeler(object):
  """Labeler for dense object detector."""

  def __init__(self, match_threshold=0.5, unmatched_threshold=0.5):
    """Constructs anchor labeler to assign labels to anchors.

    Args:
      match_threshold: a float number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: a float number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
    """
    self.similarity_calc = iou_similarity.IouSimilarity()
    self.target_gather = target_gather.TargetGather()
    # Indicators: -1 => negative (below unmatched_threshold), -2 => ignored
    # (between the thresholds), 1 => positive (at or above match_threshold).
    self.matcher = box_matcher.BoxMatcher(
        thresholds=[unmatched_threshold, match_threshold],
        indicators=[-1, -2, 1],
        force_match_for_each_col=True)
    self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()

  def label_anchors(self,
                    anchor_boxes,
                    gt_boxes,
                    gt_labels,
                    gt_attributes=None,
                    gt_weights=None):
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_labels: A integer tensor with shape [N, 1] representing groundtruth
        classes.
      gt_attributes: If not None, a dict of (name, gt_attribute) pairs.
        `gt_attribute` is a float tensor with shape [N, attribute_size]
        representing groundtruth attributes.
      gt_weights: If not None, a float tensor with shape [N] representing
        groundtruth weights.

    Returns:
      cls_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location]. The height_l and
        width_l represent the dimension of class logits at l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
        and width_l represent the dimension of bounding box regression output at
        l-th level.
      attribute_targets_dict: a dict with (name, attribute_targets) pairs. Each
        `attribute_targets` represents an ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location * attribute_size].
        The height_l and width_l represent the dimension of attribute prediction
        output at l-th level.
      cls_weights: A flattened Tensor with shape [batch_size, num_anchors], that
        serves as masking / sample weight for classification loss. Its value
        is 1.0 for positive and negative matched anchors, and 0.0 for ignored
        anchors.
      box_weights: A flattened Tensor with shape [batch_size, num_anchors], that
        serves as masking / sample weight for regression loss. Its value is
        1.0 for positive matched anchors, and 0.0 for negative and ignored
        anchors.
    """
    # Flatten the per-level anchor dict into one [num_anchors, 4] tensor so a
    # single matching pass covers all pyramid levels.
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)

    # mask is True for non-positive anchors (indicator <= 0); gathered targets
    # at masked positions are replaced by the fill value passed to the gather.
    mask = tf.less_equal(match_indicators, 0)
    cls_mask = tf.expand_dims(mask, -1)
    # Fill value -1 marks background / non-matched class targets.
    cls_targets = self.target_gather(gt_labels, match_indices, cls_mask, -1)
    box_mask = tf.tile(cls_mask, [1, 4])
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    att_targets = {}
    if gt_attributes:
      for k, v in gt_attributes.items():
        att_size = v.get_shape().as_list()[-1]
        att_mask = tf.tile(cls_mask, [1, att_size])
        att_targets[k] = self.target_gather(v, match_indices, att_mask, 0.0)
    weights = tf.squeeze(tf.ones_like(gt_labels, dtype=tf.float32), -1)
    if gt_weights is not None:
      weights = tf.math.multiply(weights, gt_weights)
    # Box loss applies only to positive anchors; classification loss applies
    # to everything except ignored anchors (indicator == -2).
    box_weights = self.target_gather(weights, match_indices, mask)
    ignore_mask = tf.equal(match_indicators, -2)
    cls_weights = self.target_gather(weights, match_indices, ignore_mask)
    # Encode raw corner boxes into regression deltas relative to the anchors.
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Unpacks labels into multi-level representations.
    cls_targets_dict = unpack_targets(cls_targets, anchor_boxes)
    box_targets_dict = unpack_targets(box_targets, anchor_boxes)
    attribute_targets_dict = {}
    for k, v in att_targets.items():
      attribute_targets_dict[k] = unpack_targets(v, anchor_boxes)

    return cls_targets_dict, box_targets_dict, attribute_targets_dict, cls_weights, box_weights
class RpnAnchorLabeler(AnchorLabeler):
  """Labeler for Region Proposal Network."""

  def __init__(self, match_threshold=0.7, unmatched_threshold=0.3,
               rpn_batch_size_per_im=256, rpn_fg_fraction=0.5):
    # RPN uses looser thresholds (0.7/0.3) than the dense-detector defaults,
    # plus per-image subsampling of anchors for the RPN loss.
    AnchorLabeler.__init__(self, match_threshold=match_threshold,
                           unmatched_threshold=unmatched_threshold)
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

  def _get_rpn_samples(self, match_results):
    """Computes anchor labels.

    This function performs subsampling for foreground (fg) and background (bg)
    anchors.

    Args:
      match_results: A integer tensor with shape [N] representing the
        matching results of anchors. (1) match_results[i]>=0,
        meaning that column i is matched with row match_results[i].
        (2) match_results[i]=-1, meaning that column i is not matched.
        (3) match_results[i]=-2, meaning that column i is ignored.

    Returns:
      score_targets: a integer tensor with the a shape of [N].
        (1) score_targets[i]=1, the anchor is a positive sample.
        (2) score_targets[i]=0, negative. (3) score_targets[i]=-1, the anchor is
        don't care (ignore).
    """
    sampler = (
        balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
            positive_fraction=self._rpn_fg_fraction, is_static=False))
    # indicator includes both positive and negative labels.
    # labels includes only positives labels.
    # positives = indicator & labels.
    # negatives = indicator & !labels.
    # ignore = !indicator.
    indicator = tf.greater(match_results, -2)
    labels = tf.greater(match_results, -1)

    samples = sampler.subsample(indicator, self._rpn_batch_size_per_im, labels)
    # Sampled positives get 2 and sampled negatives get 1 here; combined with
    # the -1 fill below this yields the documented {1, 0, -1} labels.
    positive_labels = tf.where(
        tf.logical_and(samples, labels),
        tf.constant(2, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    negative_labels = tf.where(
        tf.logical_and(samples, tf.logical_not(labels)),
        tf.constant(1, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    ignore_labels = tf.fill(match_results.shape, -1)

    return (ignore_labels + positive_labels + negative_labels,
            positive_labels, negative_labels)

  def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_labels: A integer tensor with shape [N, 1] representing groundtruth
        classes.

    Returns:
      score_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors]. The height_l and width_l
        represent the dimension of class logits at l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors * 4]. The height_l and
        width_l represent the dimension of bounding box regression output at
        l-th level.
    """
    # Flatten the per-level anchor dict into one [num_anchors, 4] tensor.
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)
    box_mask = tf.tile(
        tf.expand_dims(tf.less_equal(match_indicators, 0), -1), [1, 4])
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Zero out the unmatched and ignored regression targets.
    # Static shape when known; dynamic tf.shape fallback otherwise.
    num_matches = match_indices.shape.as_list()[0] or tf.shape(match_indices)[0]
    unmatched_ignored_box_targets = tf.zeros([num_matches, 4], dtype=tf.float32)
    matched_anchors_mask = tf.greater_equal(match_indicators, 0)
    # To broadcast matched_anchors_mask to the same shape as
    # matched_reg_targets.
    matched_anchors_mask = tf.tile(
        tf.expand_dims(matched_anchors_mask, 1),
        [1, tf.shape(box_targets)[1]])
    box_targets = tf.where(matched_anchors_mask, box_targets,
                           unmatched_ignored_box_targets)

    # score_targets contains the subsampled positive and negative anchors.
    score_targets, _, _ = self._get_rpn_samples(match_indicators)

    # Unpacks labels.
    score_targets_dict = unpack_targets(score_targets, anchor_boxes)
    box_targets_dict = unpack_targets(box_targets, anchor_boxes)

    return score_targets_dict, box_targets_dict
def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios,
                           anchor_size):
  """Build anchor generator from levels."""
  # One octave scale per sub-level: 2^(i / num_scales).
  scales = [2**(i / float(num_scales)) for i in range(num_scales)]
  anchor_sizes = collections.OrderedDict()
  strides = collections.OrderedDict()
  for level in range(min_level, max_level + 1):
    level_stride = 2**level
    strides[str(level)] = level_stride
    # Base anchor at each level scales with the feature stride.
    anchor_sizes[str(level)] = anchor_size * level_stride
  return anchor_generator.AnchorGenerator(
      anchor_sizes=anchor_sizes,
      scales=scales,
      aspect_ratios=aspect_ratios,
      strides=strides)
def unpack_targets(targets, anchor_boxes_dict):
  """Unpacks an array of labels into multiscales labels."""
  unpacked_targets = collections.OrderedDict()
  offset = 0
  for level, level_anchor_boxes in anchor_boxes_dict.items():
    level_shape = level_anchor_boxes.shape.as_list()
    height = level_shape[0]
    width = level_shape[1]
    # The last dimension packs 4 coordinates per anchor.
    num_anchors_per_location = int(level_shape[2] / 4)
    num_targets = height * width * num_anchors_per_location
    # Slice this level's span out of the flat target array and restore the
    # spatial layout of the feature map.
    unpacked_targets[level] = tf.reshape(targets[offset:offset + num_targets],
                                         [height, width, -1])
    offset += num_targets
  return unpacked_targets
official/vision/ops/anchor_generator.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi scale anchor generator definition."""
import
tensorflow
as
tf
# (TODO/tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
  """Utility to generate anchors for a single feature map.

  Example:
  ```python
  anchor_gen = _SingleAnchorGenerator(32, [.5, 1., 2.], stride=16)
  anchors = anchor_gen([512, 512, 3])
  ```
  """

  def __init__(self, anchor_size, scales, aspect_ratios, stride,
               clip_boxes=False):
    """Constructs single scale anchor.

    Args:
      anchor_size: A single int represents the base anchor size. The anchor
        height will be `anchor_size / sqrt(aspect_ratio)`, anchor width will be
        `anchor_size * sqrt(aspect_ratio)`.
      scales: A list/tuple, or a list/tuple of a list/tuple of positive
        floats representing the actual anchor size to the base `anchor_size`.
      aspect_ratios: a list/tuple of positive floats representing the ratio of
        anchor width to anchor height.
      stride: A single int represents the anchor stride size between center of
        each anchor.
      clip_boxes: Boolean to represent whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.

    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors, `[(H / stride) * (W / stride), 4]`
    """
    self.anchor_size = anchor_size
    self.scales = scales
    self.aspect_ratios = aspect_ratios
    self.stride = stride
    self.clip_boxes = clip_boxes

  def __call__(self, image_size):
    image_height = tf.cast(image_size[0], tf.float32)
    image_width = tf.cast(image_size[1], tf.float32)

    # K = number of distinct anchor shapes at every grid position.
    k = len(self.scales) * len(self.aspect_ratios)
    aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32)
    anchor_size = tf.cast(self.anchor_size, tf.float32)

    # [K]
    anchor_heights = []
    anchor_widths = []
    for scale in self.scales:
      anchor_size_t = anchor_size * scale
      # sqrt(ar) keeps the anchor area constant across aspect ratios.
      anchor_height = anchor_size_t / aspect_ratios_sqrt
      anchor_width = anchor_size_t * aspect_ratios_sqrt
      anchor_heights.append(anchor_height)
      anchor_widths.append(anchor_width)
    anchor_heights = tf.concat(anchor_heights, axis=0)
    anchor_widths = tf.concat(anchor_widths, axis=0)
    half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k])
    half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k])

    stride = tf.cast(self.stride, tf.float32)
    # [W] anchor centers, offset by half a stride into each cell.
    cx = tf.range(0.5 * stride, image_width, stride)
    # [H]
    cy = tf.range(0.5 * stride, image_height, stride)
    # [H, W]
    cx_grid, cy_grid = tf.meshgrid(cx, cy)
    # [H, W, 1]
    cx_grid = tf.expand_dims(cx_grid, axis=-1)
    cy_grid = tf.expand_dims(cy_grid, axis=-1)

    # [H, W, K, 1]
    y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1)
    y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1)
    x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1)
    x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1)

    if self.clip_boxes:
      # Clamp every coordinate into [0, image extent].
      y_min = tf.maximum(tf.minimum(y_min, image_height), 0.)
      y_max = tf.maximum(tf.minimum(y_max, image_height), 0.)
      x_min = tf.maximum(tf.minimum(x_min, image_width), 0.)
      x_max = tf.maximum(tf.minimum(x_max, image_width), 0.)

    # [H, W, K, 4]
    result = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
    shape = result.shape.as_list()
    # [H, W, K * 4]
    return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])
class AnchorGenerator():
  """Utility to generate anchors for a multiple feature maps.

  Example:
  ```python
  anchor_gen = AnchorGenerator([32, 64], [.5, 1., 2.],
                               strides=[16, 32])
  anchors = anchor_gen([512, 512, 3])
  ```
  """

  def __init__(self,
               anchor_sizes,
               scales,
               aspect_ratios,
               strides,
               clip_boxes=False):
    """Constructs multiscale anchors.

    Args:
      anchor_sizes: A list of int represents the anchor size for each scale. The
        anchor height will be `anchor_size / sqrt(aspect_ratio)`, anchor width
        will be `anchor_size * sqrt(aspect_ratio)` for each scale.
      scales: A list/tuple, or a list/tuple of a list/tuple of positive
        floats representing the actual anchor size to the base `anchor_size`.
      aspect_ratios: A list/tuple, or a list/tuple of a list/tuple of positive
        floats representing the ratio of anchor width to anchor height.
      strides: A list/tuple of ints represent the anchor stride size between
        center of anchors at each scale.
      clip_boxes: Boolean to represents whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.

    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors concat on each level, `[(H /
      strides) * (W / strides), K * 4]`
    """
    # aspect_ratio is a single list that is the same across all levels.
    aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes)
    scales = maybe_map_structure_for_anchor(scales, anchor_sizes)
    # Build one _SingleAnchorGenerator per level, mirroring the container type
    # of `anchor_sizes` (dict keyed by level, or list/tuple).
    if isinstance(anchor_sizes, dict):
      self.anchor_generators = {}
      for k in anchor_sizes.keys():
        self.anchor_generators[k] = _SingleAnchorGenerator(
            anchor_sizes[k], scales[k], aspect_ratios[k], strides[k],
            clip_boxes)
    elif isinstance(anchor_sizes, (list, tuple)):
      self.anchor_generators = []
      for anchor_size, scale_list, ar_list, stride in zip(
          anchor_sizes, scales, aspect_ratios, strides):
        self.anchor_generators.append(
            _SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride,
                                   clip_boxes))

  def __call__(self, image_size):
    # Run every per-level generator and re-pack results into the same
    # structure (dict or list) as `self.anchor_generators`.
    anchor_generators = tf.nest.flatten(self.anchor_generators)
    results = [anchor_gen(image_size) for anchor_gen in anchor_generators]
    return tf.nest.pack_sequence_as(self.anchor_generators, results)
def maybe_map_structure_for_anchor(params, anchor_sizes):
  """broadcast the params to match anchor_sizes."""
  # Non-scalar params are assumed to already be structured per level.
  if not all(isinstance(param, (int, float)) for param in params):
    return params
  # A flat list of scalars is replicated once per level.
  if isinstance(anchor_sizes, (tuple, list)):
    return [params] * len(anchor_sizes)
  if isinstance(anchor_sizes, dict):
    return tf.nest.map_structure(lambda _: params, anchor_sizes)
  raise ValueError("the structure of `anchor_sizes` must be a tuple, "
                   "list, or dict, given {}".format(anchor_sizes))
official/vision/ops/anchor_generator_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for anchor_generator.py."""
from
absl.testing
import
parameterized
import
tensorflow
as
tf
from
official.vision.ops
import
anchor_generator
class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for `_SingleAnchorGenerator`."""

  @parameterized.parameters(
      # Single scale anchor.
      (5, [1.0], [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
                  [[16., -16., 80., 48.], [16., 16., 80., 80.]]]),
      # # Multi aspect ratio anchor.
      (6, [1.0, 4.0, 0.25],
       [[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]),
  )
  def testAnchorGeneration(self, level, aspect_ratios, expected_boxes):
    """Checks unclipped anchors on a 64x64 image at a single level."""
    image_size = [64, 64]
    anchor_size = 2**(level + 1)
    stride = 2**level
    anchor_gen = anchor_generator._SingleAnchorGenerator(
        anchor_size=anchor_size,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        stride=stride,
        clip_boxes=False)
    anchors = anchor_gen(image_size).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Single scale anchor.
      (5, [1.0], [[[0., 0., 48., 48.], [0., 16., 48., 64.]],
                  [[16., 0., 64., 48.], [16., 16., 64., 64.]]]),
      # # Multi aspect ratio anchor.
      (6, [1.0, 4.0, 0.25],
       [[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]),
  )
  def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes):
    """Checks anchors clipped to the image bounds (clip_boxes=True)."""
    image_size = [64, 64]
    anchor_size = 2**(level + 1)
    stride = 2**level
    anchor_gen = anchor_generator._SingleAnchorGenerator(
        anchor_size=anchor_size,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        stride=stride,
        clip_boxes=True)
    anchors = anchor_gen(image_size).numpy()
    self.assertAllClose(expected_boxes, anchors)
class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for `AnchorGenerator` across multiple pyramid levels."""

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
                              [16, -16, 80, 48], [16, 16, 80, 80],
                              [-32, -32, 96, 96]]),)
  def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
                           expected_boxes):
    """Anchors from list-typed inputs, flattened and concatenated."""
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = [2**(level + 1) for level in levels]
    strides = [2**level for level in levels]
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides)
    anchors = anchor_gen(image_size)
    anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
    anchors = tf.concat(anchors, axis=0).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
                              [16, -16, 80, 48], [16, 16, 80, 80],
                              [-32, -32, 96, 96]]),)
  def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
                                  expected_boxes):
    """Same as above with clip_boxes passed explicitly.

    NOTE(review): despite the name, this passes `clip_boxes=False` and the
    expected boxes are unclipped (negative coordinates) — confirm whether
    `clip_boxes=True` was intended.
    """
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = [2**(level + 1) for level in levels]
    strides = [2**level for level in levels]
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides,
        clip_boxes=False)
    anchors = anchor_gen(image_size)
    anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
    anchors = tf.concat(anchors, axis=0).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [1.0], {
          '5': [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
                [[16., -16., 80., 48.], [16., 16., 80., 80.]]],
          '6': [[[-32, -32, 96, 96]]]
      }),)
  def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
                               expected_boxes):
    """Anchors from dict-typed inputs keyed by level string."""
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = dict((str(level), 2**(level + 1)) for level in levels)
    strides = dict((str(level), 2**level) for level in levels)
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides,
        clip_boxes=False)
    anchors = anchor_gen(image_size)
    for k in expected_boxes.keys():
      self.assertAllClose(expected_boxes[k], anchors[k].numpy())
# Standard TF test entry point.
if __name__ == '__main__':
  tf.test.main()
official/vision/ops/anchor_test.py
0 → 100644
View file @
0225b135
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for anchor.py."""
# Import libraries
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
official.vision.ops
import
anchor
class AnchorTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for anchor generation, anchor labeling and RPN sampling."""

  # The set of parameters are tailored for the MLPerf configuration, where
  # the number of anchors is 495132, rpn_batch_size_per_im=256, and
  # rpn_fg_fraction=0.5.
  @parameterized.parameters(
      (512, 25, 25, 25, 25, (512, 512)),
      (512, 25, 25, 25, 25, (512, 640)),
      (512, 25, 25, 25, 25, (640, 512)),
      (495132, 100, 100, 100, 100, (512, 512)),
      (495132, 200, 100, 128, 100, (512, 512)),
      (495132, 100, 120, 100, 120, (512, 512)),
      (495132, 100, 200, 100, 156, (512, 512)),
      (495132, 200, 200, 128, 128, (512, 512)),
  )
  def testAnchorRpnSample(self, num_anchors, num_positives,
                          num_negatives, expected_positives,
                          expected_negatives, image_size):
    """Checks RPN sampling keeps the expected positive/negative counts."""
    # Build synthetic match results: first `num_positives` anchors matched
    # (value 0), the next `num_negatives` unmatched (-1), the rest ignored (-2).
    match_results_np = np.empty([num_anchors])
    match_results_np.fill(-2)
    match_results_np[:num_positives] = 0
    match_results_np[num_positives:num_positives + num_negatives] = -1
    match_results = tf.convert_to_tensor(value=match_results_np, dtype=tf.int32)
    anchor_labeler = anchor.RpnAnchorLabeler(
        match_threshold=0.7,
        unmatched_threshold=0.3,
        rpn_batch_size_per_im=256,
        rpn_fg_fraction=0.5)
    rpn_sample_op = anchor_labeler._get_rpn_samples(match_results)
    labels = [v.numpy() for v in rpn_sample_op]
    self.assertLen(labels[0], num_anchors)
    # In the sampled labels, 1 marks positives and 0 marks negatives.
    positives = np.sum(np.array(labels[0]) == 1)
    negatives = np.sum(np.array(labels[0]) == 0)
    self.assertEqual(positives, expected_positives)
    self.assertEqual(negatives, expected_negatives)

  @parameterized.parameters(
      # Single scale anchor.
      (5, 5, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80]]),
      # Multi scale anchor.
      (5, 6, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80],
        [-32, -32, 96, 96]]),
      # Multi aspect ratio anchor.
      (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
  )
  def testAnchorGeneration(self, min_level, max_level, num_scales,
                           aspect_ratios, anchor_size, expected_boxes):
    """Checks concatenated anchor boxes produced by anchor.Anchor."""
    image_size = [64, 64]
    anchors = anchor.Anchor(min_level, max_level, num_scales,
                            aspect_ratios, anchor_size, image_size)
    boxes = anchors.boxes.numpy()
    self.assertEqual(expected_boxes, boxes.tolist())

  @parameterized.parameters(
      # Single scale anchor.
      (5, 5, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80]]),
      # Multi scale anchor.
      (5, 6, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80],
        [-32, -32, 96, 96]]),
      # Multi aspect ratio anchor.
      (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
  )
  def testAnchorGenerationWithImageSizeAsTensor(self, min_level, max_level,
                                                num_scales, aspect_ratios,
                                                anchor_size, expected_boxes):
    """Same as testAnchorGeneration but with a tf.Tensor image size."""
    image_size = tf.constant([64, 64], tf.int32)
    anchors = anchor.Anchor(min_level, max_level, num_scales,
                            aspect_ratios, anchor_size, image_size)
    boxes = anchors.boxes.numpy()
    self.assertEqual(expected_boxes, boxes.tolist())

  @parameterized.parameters(
      (3, 6, 2, [1.0], 2.0, False),
      (3, 6, 2, [1.0], 2.0, True),
  )
  def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios,
                       anchor_size, has_attribute):
    """Checks anchor labeling targets (and optional attribute targets)."""
    input_size = [512, 512]
    ground_truth_class_id = 2
    attribute_name = 'depth'
    ground_truth_depth = 3.0
    # The matched anchors are the anchors used as ground truth and the anchors
    # at the next octave scale on the same location.
    expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
    anchor_gen = anchor.build_anchor_generator(min_level, max_level,
                                               num_scales, aspect_ratios,
                                               anchor_size)
    anchor_boxes = anchor_gen(input_size)
    anchor_labeler = anchor.AnchorLabeler()
    # Uses the first anchors as ground truth. The ground truth should map to
    # two anchors with two intermediate scales at the same location.
    gt_boxes = anchor_boxes['3'][0:1, 0, 0:4]
    gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
    gt_attributes = {
        attribute_name: tf.constant([[ground_truth_depth]], dtype=tf.float32)
    } if has_attribute else {}
    (cls_targets, box_targets, att_targets, _,
     box_weights) = anchor_labeler.label_anchors(anchor_boxes, gt_boxes,
                                                 gt_classes, gt_attributes)
    # Convert per-level target tensors to numpy for comparison.
    for k, v in cls_targets.items():
      cls_targets[k] = v.numpy()
    for k, v in box_targets.items():
      box_targets[k] = v.numpy()
    box_weights = box_weights.numpy()
    # Locations (in index coordinates) where class targets indicate a match.
    anchor_locations = np.vstack(
        np.where(cls_targets[str(min_level)] > -1)).transpose()
    self.assertAllClose(expected_anchor_locations, anchor_locations)
    # Two anchor boxes on min_level got matched to the gt_boxes.
    self.assertAllClose(tf.reduce_sum(box_weights), 2)
    if has_attribute:
      self.assertIn(attribute_name, att_targets)
      for k, v in att_targets[attribute_name].items():
        att_targets[attribute_name][k] = v.numpy()
      # Attribute targets should be set at exactly the matched locations.
      anchor_locations = np.vstack(
          np.where(
              att_targets[attribute_name][str(min_level)] > 0.0)).transpose()
      self.assertAllClose(expected_anchor_locations, anchor_locations)
    else:
      self.assertEmpty(att_targets)

  @parameterized.parameters(
      (3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
      (3, 8, [1.], 3, 32, (512, 512)),
      (3, 3, [1.], 2, 4, (32, 32)),
  )
  def testEquivalentResult(self, min_level, max_level, aspect_ratios,
                           num_scales, anchor_size, image_size):
    """Checks build_anchor_generator matches anchor.Anchor per level."""
    anchor_gen = anchor.build_anchor_generator(
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)
    anchors = anchor_gen(image_size)
    expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
                                        aspect_ratios, anchor_size, image_size)
    expected_anchors = expected_anchor_gen.multilevel_boxes
    for k in expected_anchors.keys():
      self.assertAllClose(expected_anchors[k], anchors[k])
# Run all tests in this file when executed as a script.
if __name__ == '__main__':
  tf.test.main()
Prev
1
…
10
11
12
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment