ModelZoo / ResNet50_tensorflow / Commits

Commit 965cc3ee (unverified)
Authored Apr 21, 2020 by Ayushman Kumar; committed by GitHub on Apr 21, 2020
Parents: 1f3247f4, 1f685c54
Changes: 222

    Merge pull request #7 from tensorflow/master

    updated
Showing 20 changed files with 681 additions and 471 deletions (+681, -471):
official/vision/detection/configs/retinanet_config.py (+8, -120)
official/vision/detection/dataloader/factory.py (+10, -10)
official/vision/detection/dataloader/input_reader.py (+2, -1)
official/vision/detection/main.py (+27, -11)
official/vision/detection/modeling/architecture/factory.py (+59, -67)
official/vision/detection/modeling/architecture/fpn.py (+17, -9)
official/vision/detection/modeling/architecture/heads.py (+68, -46)
official/vision/detection/modeling/architecture/nn_ops.py (+36, -13)
official/vision/detection/modeling/architecture/resnet.py (+24, -19)
official/vision/detection/modeling/base_model.py (+5, -34)
official/vision/detection/modeling/learning_rates.py (+8, -6)
official/vision/detection/modeling/losses.py (+2, -2)
official/vision/detection/modeling/maskrcnn_model.py (+8, -6)
official/vision/detection/modeling/optimizers.py (+50, -0)
official/vision/detection/modeling/retinanet_model.py (+5, -2)
official/vision/detection/ops/__init__.py (+14, -0)
official/vision/detection/ops/postprocess_ops.py (+3, -3)
official/vision/detection/ops/sampling_ops.py (+3, -3)
official/vision/detection/ops/spatial_transform_ops.py (+300, -115)
official/vision/image_classification/README.md (+32, -4)
official/vision/detection/configs/retinanet_config.py

@@ -14,84 +14,18 @@
 # ==============================================================================
 """Config template to train Retinanet."""

-# pylint: disable=line-too-long
-# For ResNet-50, this freezes the variables of the first conv1 and conv2_x
-# layers [1], which leads to higher training speed and slightly better testing
-# accuracy. The intuition is that the low-level architecture (e.g., ResNet-50)
-# is able to capture low-level features such as edges; therefore, it does not
-# need to be fine-tuned for the detection task.
-# Note that we need the trailing `/` to avoid the incorrect match.
-# [1]: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L198
-RESNET_FROZEN_VAR_PREFIX = r'(resnet\d+)\/(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'
-REGULARIZATION_VAR_REGEX = r'.*(kernel|weight):0$'
+from official.modeling.hyperparams import params_dict
+from official.vision.detection.configs import base_config

-RETINANET_CFG = {
+# pylint: disable=line-too-long
+RETINANET_CFG = params_dict.ParamsDict(base_config.BASE_CFG)
+RETINANET_CFG.override({
     'type': 'retinanet',
-    'model_dir': '',
-    'use_tpu': True,
-    'strategy_type': 'tpu',
-    'train': {
-        'batch_size': 64,
-        'iterations_per_loop': 500,
-        'total_steps': 22500,
-        'optimizer': {
-            'type': 'momentum',
-            'momentum': 0.9,
-            'nesterov': True,  # `False` is better for TPU v3-128.
-        },
-        'learning_rate': {
-            'type': 'step',
-            'warmup_learning_rate': 0.0067,
-            'warmup_steps': 500,
-            'init_learning_rate': 0.08,
-            'learning_rate_levels': [0.008, 0.0008],
-            'learning_rate_steps': [15000, 20000],
-        },
-        'checkpoint': {
-            'path': '',
-            'prefix': '',
-        },
-        'frozen_variable_prefix': RESNET_FROZEN_VAR_PREFIX,
-        'train_file_pattern': '',
-        # TODO(b/142174042): Support transpose_input option.
-        'transpose_input': False,
-        'regularization_variable_regex': REGULARIZATION_VAR_REGEX,
-        'l2_weight_decay': 0.0001,
-        'input_sharding': False,
-    },
-    'eval': {
-        'batch_size': 8,
-        'min_eval_interval': 180,
-        'eval_timeout': None,
-        'eval_samples': 5000,
-        'type': 'box',
-        'val_json_file': '',
-        'eval_file_pattern': '',
-        'input_sharding': True,
-        # When visualizing images, set evaluation batch size to 40 to avoid
-        # potential OOM.
-        'num_images_to_visualize': 0,
-    },
-    'predict': {
-        'predict_batch_size': 8,
-    },
     'architecture': {
         'parser': 'retinanet_parser',
-        'backbone': 'resnet',
-        'multilevel_features': 'fpn',
-        'use_bfloat16': False,
     },
-    'anchor': {
-        'min_level': 3,
-        'max_level': 7,
-        'num_scales': 3,
-        'aspect_ratios': [1.0, 2.0, 0.5],
-        'anchor_size': 4.0,
-    },
     'retinanet_parser': {
-        'use_bfloat16': False,
         'output_size': [640, 640],
         'num_channels': 3,
         'match_threshold': 0.5,

@@ -104,68 +38,22 @@ RETINANET_CFG = {
         'skip_crowd_during_training': True,
         'max_num_instances': 100,
     },
-    'resnet': {
-        'resnet_depth': 50,
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-        },
-    },
-    'fpn': {
-        'min_level': 3,
-        'max_level': 7,
-        'fpn_feat_dims': 256,
-        'use_separable_conv': False,
-        'use_batch_norm': True,
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-        },
-    },
     'retinanet_head': {
-        'min_level': 3,
-        'max_level': 7,
-        # Note that `num_classes` is the total number of classes including
-        # one background class whose index is 0.
-        'num_classes': 91,
         'anchors_per_location': 9,
-        'retinanet_head_num_convs': 4,
-        'retinanet_head_num_filters': 256,
+        'num_convs': 4,
+        'num_filters': 256,
         'use_separable_conv': False,
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-        },
     },
     'retinanet_loss': {
-        'num_classes': 91,
         'focal_loss_alpha': 0.25,
         'focal_loss_gamma': 1.5,
         'huber_loss_delta': 0.1,
         'box_loss_weight': 50,
     },
-    'postprocess': {
-        'use_batched_nms': False,
-        'min_level': 3,
-        'max_level': 7,
-        'max_total_size': 100,
-        'nms_iou_threshold': 0.5,
-        'score_threshold': 0.05,
-        'pre_nms_num_boxes': 5000,
-    },
-    'enable_summary': False,
-}
+    'enable_summary': True,
+}, is_strict=False)

 RETINANET_RESTRICTIONS = [
     'architecture.use_bfloat16 == retinanet_parser.use_bfloat16',
     'anchor.min_level == retinanet_head.min_level',
     'anchor.max_level == retinanet_head.max_level',
     'anchor.min_level == postprocess.min_level',
     'anchor.max_level == postprocess.max_level',
     'retinanet_head.num_classes == retinanet_loss.num_classes',
 ]
 # pylint: enable=line-too-long
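The new config is built by layering RetinaNet-specific keys over the shared base config rather than restating everything. A minimal sketch of the ParamsDict override semantics used above, assuming the official.modeling.hyperparams package from tensorflow/models is importable; the keys here are illustrative stand-ins, not the real BASE_CFG:

from official.modeling.hyperparams import params_dict

base = params_dict.ParamsDict({'train': {'batch_size': 64}, 'enable_summary': False})
cfg = params_dict.ParamsDict(base)                       # start from the shared base
cfg.override({'enable_summary': True}, is_strict=False)  # layer model-specific keys on top
print(cfg.train.batch_size, cfg.enable_summary)          # 64 True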
official/vision/detection/dataloader/factory.py

@@ -29,8 +29,8 @@ def parser_generator(params, mode):
     parser_params = params.retinanet_parser
     parser_fn = retinanet_parser.Parser(
         output_size=parser_params.output_size,
-        min_level=anchor_params.min_level,
-        max_level=anchor_params.max_level,
+        min_level=params.architecture.min_level,
+        max_level=params.architecture.max_level,
         num_scales=anchor_params.num_scales,
         aspect_ratios=anchor_params.aspect_ratios,
         anchor_size=anchor_params.anchor_size,

@@ -43,15 +43,15 @@ def parser_generator(params, mode):
         autoaugment_policy_name=parser_params.autoaugment_policy_name,
         skip_crowd_during_training=parser_params.skip_crowd_during_training,
         max_num_instances=parser_params.max_num_instances,
-        use_bfloat16=parser_params.use_bfloat16,
+        use_bfloat16=params.architecture.use_bfloat16,
         mode=mode)
   elif params.architecture.parser == 'maskrcnn_parser':
     anchor_params = params.anchor
     parser_params = params.maskrcnn_parser
     parser_fn = maskrcnn_parser.Parser(
         output_size=parser_params.output_size,
-        min_level=anchor_params.min_level,
-        max_level=anchor_params.max_level,
+        min_level=params.architecture.min_level,
+        max_level=params.architecture.max_level,
         num_scales=anchor_params.num_scales,
         aspect_ratios=anchor_params.aspect_ratios,
         anchor_size=anchor_params.anchor_size,

@@ -64,17 +64,17 @@ def parser_generator(params, mode):
         aug_scale_max=parser_params.aug_scale_max,
         skip_crowd_during_training=parser_params.skip_crowd_during_training,
         max_num_instances=parser_params.max_num_instances,
-        include_mask=parser_params.include_mask,
+        include_mask=params.architecture.include_mask,
         mask_crop_size=parser_params.mask_crop_size,
-        use_bfloat16=parser_params.use_bfloat16,
+        use_bfloat16=params.architecture.use_bfloat16,
         mode=mode)
   elif params.architecture.parser == 'shapemask_parser':
     anchor_params = params.anchor
     parser_params = params.shapemask_parser
     parser_fn = shapemask_parser.Parser(
         output_size=parser_params.output_size,
-        min_level=anchor_params.min_level,
-        max_level=anchor_params.max_level,
+        min_level=params.architecture.min_level,
+        max_level=params.architecture.max_level,
         num_scales=anchor_params.num_scales,
         aspect_ratios=anchor_params.aspect_ratios,
         anchor_size=anchor_params.anchor_size,

@@ -93,7 +93,7 @@ def parser_generator(params, mode):
         aug_scale_max=parser_params.aug_scale_max,
         skip_crowd_during_training=parser_params.skip_crowd_during_training,
         max_num_instances=parser_params.max_num_instances,
-        use_bfloat16=parser_params.use_bfloat16,
+        use_bfloat16=params.architecture.use_bfloat16,
         mask_train_class=parser_params.mask_train_class,
         mode=mode)
   else:
official/vision/detection/dataloader/input_reader.py

@@ -85,13 +85,14 @@ class InputFn(object):
     if self._input_sharding and ctx and ctx.num_input_pipelines > 1:
       dataset = dataset.shard(ctx.num_input_pipelines, ctx.input_pipeline_id)
-    dataset = dataset.cache()

     if self._is_training:
       dataset = dataset.repeat()

     dataset = dataset.interleave(
         map_func=lambda file_name: self._dataset_fn(file_name),
         cycle_length=32,
         num_parallel_calls=tf.data.experimental.AUTOTUNE)
+    dataset = dataset.cache()

     if self._is_training:
       # Large shuffle size is critical for 2vm input pipeline. Can use small
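For context, tf.data's Dataset.cache() memoizes whatever the pipeline has produced up to that point, so its position determines whether file names or parsed records get cached. A standalone illustration with synthetic data (not the repository's input pipeline):

import tensorflow as tf

files = tf.data.Dataset.range(3)
# cache() placed after interleave stores the interleaved records, not the
# file names that feed it.
records = files.interleave(
    lambda f: tf.data.Dataset.range(f * 10, f * 10 + 2), cycle_length=2)
records = records.cache()
print(list(records.as_numpy_iterator()))  # [0, 10, 1, 11, 20, 21]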
official/vision/detection/main.py

@@ -35,10 +35,12 @@ from official.vision.detection.dataloader import input_reader
 from official.vision.detection.dataloader import mode_keys as ModeKeys
 from official.vision.detection.executor.detection_executor import DetectionDistributedExecutor
 from official.vision.detection.modeling import factory as model_factory
+from official.utils.flags import core as flags_core
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils

 hyperparams_flags.initialize_common_flags()
+flags_core.define_log_steps()

 flags.DEFINE_bool('enable_xla',

@@ -67,10 +69,12 @@ FLAGS = flags.FLAGS
 def run_executor(params,
+                 mode,
+                 checkpoint_path=None,
                  train_input_fn=None,
                  eval_input_fn=None,
                  callbacks=None,
-                 strategy=None):
+                 prebuilt_strategy=None):
   """Runs Retinanet model on distribution strategy defined by the user."""
   if params.architecture.use_bfloat16:

@@ -80,7 +84,9 @@ def run_executor(params,
   model_builder = model_factory.model_generator(params)

-  if strategy is None:
+  if prebuilt_strategy is not None:
+    strategy = prebuilt_strategy
+  else:
     strategy_config = params.strategy_config
     distribution_utils.configure_cluster(strategy_config.worker_hosts,
                                          strategy_config.task_index)

@@ -94,7 +100,7 @@ def run_executor(params,
   num_workers = int(strategy.num_replicas_in_sync + 7) // 8
   is_multi_host = (int(num_workers) >= 2)

-  if FLAGS.mode == 'train':
+  if mode == 'train':

     def _model_fn(params):
       return model_builder.build_model(params, mode=ModeKeys.TRAIN)

@@ -126,8 +132,7 @@ def run_executor(params,
         init_checkpoint=model_builder.make_restore_checkpoint_fn(),
         custom_callbacks=callbacks,
         save_config=True)
-  elif FLAGS.mode == 'eval' or FLAGS.mode == 'eval_once':
+  elif mode == 'eval' or mode == 'eval_once':

     def _model_fn(params):
       return model_builder.build_model(params, mode=ModeKeys.PREDICT_WITH_GT)

@@ -150,7 +155,7 @@ def run_executor(params,
         trainable_variables_filter=model_builder.make_filter_trainable_variables_fn())

-    if FLAGS.mode == 'eval':
+    if mode == 'eval':
       results = dist_executor.evaluate_from_model_dir(
           model_dir=params.model_dir,
           eval_input_fn=eval_input_fn,

@@ -160,9 +165,8 @@ def run_executor(params,
           total_steps=params.train.total_steps)
     else:
       # Run evaluation once for a single checkpoint.
-      if not FLAGS.checkpoint_path:
-        raise ValueError('FLAGS.checkpoint_path cannot be empty.')
-      checkpoint_path = FLAGS.checkpoint_path
+      if not checkpoint_path:
+        raise ValueError('checkpoint_path cannot be empty.')
       if tf.io.gfile.isdir(checkpoint_path):
         checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
       summary_writer = executor.SummaryWriter(params.model_dir, 'eval')

@@ -175,7 +179,7 @@ def run_executor(params,
       logging.info('Final eval metric %s: %f', k, v)
     return results
   else:
-    raise ValueError('Mode not found: %s.' % FLAGS.mode)
+    raise ValueError('Mode not found: %s.' % mode)


 def run(callbacks=None):

@@ -224,8 +228,21 @@ def run(callbacks=None):
       mode=input_reader.ModeKeys.PREDICT_WITH_GT,
       batch_size=params.eval.batch_size,
       num_examples=params.eval.eval_samples)

+  if callbacks is None:
+    callbacks = []
+
+  if FLAGS.log_steps:
+    callbacks.append(
+        keras_utils.TimeHistory(
+            batch_size=params.train.batch_size,
+            log_steps=FLAGS.log_steps,
+        ))
+
   return run_executor(
       params,
+      FLAGS.mode,
+      checkpoint_path=FLAGS.checkpoint_path,
       train_input_fn=train_input_fn,
       eval_input_fn=eval_input_fn,
       callbacks=callbacks)

@@ -238,6 +255,5 @@ def main(argv):

 if __name__ == '__main__':
   assert tf.version.VERSION.startswith('2.')
   tf.config.set_soft_device_placement(True)
   app.run(main)
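run_executor now receives mode and checkpoint_path explicitly instead of reading FLAGS internally, and accepts a caller-supplied distribution strategy through prebuilt_strategy, building one from params.strategy_config only as a fallback. A standalone sketch of that fallback pattern; the default strategy chosen here is illustrative:

import tensorflow as tf

def resolve_strategy(prebuilt_strategy=None):
  # Prefer a strategy handed in by the caller; otherwise construct one.
  if prebuilt_strategy is not None:
    return prebuilt_strategy
  return tf.distribute.OneDeviceStrategy('/cpu:0')

strategy = resolve_strategy()
print(strategy.num_replicas_in_sync)  # 1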
official/vision/detection/modeling/architecture/factory.py

@@ -25,16 +25,12 @@ from official.vision.detection.modeling.architecture import nn_ops
 from official.vision.detection.modeling.architecture import resnet


-def batch_norm_relu_generator(params):
-
-  def _batch_norm_op(**kwargs):
-    return nn_ops.BatchNormRelu(
-        momentum=params.batch_norm_momentum,
-        epsilon=params.batch_norm_epsilon,
-        trainable=params.batch_norm_trainable,
-        **kwargs)
-
-  return _batch_norm_op
+def norm_activation_generator(params):
+  return nn_ops.norm_activation_builder(
+      momentum=params.batch_norm_momentum,
+      epsilon=params.batch_norm_epsilon,
+      trainable=params.batch_norm_trainable,
+      activation=params.activation)


 def backbone_generator(params):

@@ -43,10 +39,12 @@ def backbone_generator(params):
     resnet_params = params.resnet
     backbone_fn = resnet.Resnet(
         resnet_depth=resnet_params.resnet_depth,
-        batch_norm_relu=batch_norm_relu_generator(resnet_params.batch_norm))
+        activation=params.norm_activation.activation,
+        norm_activation=norm_activation_generator(params.norm_activation))
   else:
-    raise ValueError('Backbone model %s is not supported.' %
-                     params.architecture.backbone)
+    raise ValueError('Backbone model `{}` is not supported.'
+                     .format(params.architecture.backbone))

   return backbone_fn

@@ -56,81 +54,75 @@ def multilevel_features_generator(params):
   if params.architecture.multilevel_features == 'fpn':
     fpn_params = params.fpn
     fpn_fn = fpn.Fpn(
-        min_level=fpn_params.min_level,
-        max_level=fpn_params.max_level,
+        min_level=params.architecture.min_level,
+        max_level=params.architecture.max_level,
         fpn_feat_dims=fpn_params.fpn_feat_dims,
         use_separable_conv=fpn_params.use_separable_conv,
+        activation=params.norm_activation.activation,
         use_batch_norm=fpn_params.use_batch_norm,
-        batch_norm_relu=batch_norm_relu_generator(fpn_params.batch_norm))
+        norm_activation=norm_activation_generator(params.norm_activation))
   elif params.architecture.multilevel_features == 'identity':
     fpn_fn = identity.Identity()
   else:
-    raise ValueError('The multi-level feature model %s is not supported.'
-                     % params.architecture.multilevel_features)
+    raise ValueError('The multi-level feature model `{}` is not supported.'
+                     .format(params.architecture.multilevel_features))
   return fpn_fn


 def retinanet_head_generator(params):
   """Generator function for RetinaNet head architecture."""
+  head_params = params.retinanet_head
   return heads.RetinanetHead(
-      params.min_level,
-      params.max_level,
-      params.num_classes,
-      params.anchors_per_location,
-      params.retinanet_head_num_convs,
-      params.retinanet_head_num_filters,
-      params.use_separable_conv,
-      batch_norm_relu=batch_norm_relu_generator(params.batch_norm))
+      params.architecture.min_level,
+      params.architecture.max_level,
+      params.architecture.num_classes,
+      head_params.anchors_per_location,
+      head_params.num_convs,
+      head_params.num_filters,
+      head_params.use_separable_conv,
+      norm_activation=norm_activation_generator(params.norm_activation))


 def rpn_head_generator(params):
   """Generator function for RPN head architecture."""
+  head_params = params.rpn_head
-  return heads.RpnHead(
-      params.min_level,
-      params.max_level,
-      params.anchors_per_location,
-      params.num_convs,
-      params.num_filters,
-      params.use_separable_conv,
-      params.use_batch_norm,
-      batch_norm_relu=batch_norm_relu_generator(params.batch_norm))
+  return heads.RpnHead(
+      params.architecture.min_level,
+      params.architecture.max_level,
+      head_params.anchors_per_location,
+      head_params.num_convs,
+      head_params.num_filters,
+      head_params.use_separable_conv,
+      params.norm_activation.activation,
+      head_params.use_batch_norm,
+      norm_activation=norm_activation_generator(params.norm_activation))


 def fast_rcnn_head_generator(params):
   """Generator function for Fast R-CNN head architecture."""
-  return heads.FastrcnnHead(
-      params.num_classes,
-      params.num_convs,
-      params.num_filters,
-      params.use_separable_conv,
-      params.num_fcs,
-      params.fc_dims,
-      params.use_batch_norm,
-      batch_norm_relu=batch_norm_relu_generator(params.batch_norm))
+  head_params = params.frcnn_head
+  return heads.FastrcnnHead(
+      params.architecture.num_classes,
+      head_params.num_convs,
+      head_params.num_filters,
+      head_params.use_separable_conv,
+      head_params.num_fcs,
+      head_params.fc_dims,
+      params.norm_activation.activation,
+      head_params.use_batch_norm,
+      norm_activation=norm_activation_generator(params.norm_activation))


 def mask_rcnn_head_generator(params):
   """Generator function for Mask R-CNN head architecture."""
-  return heads.MaskrcnnHead(
-      params.num_classes,
-      params.mask_target_size,
-      params.num_convs,
-      params.num_filters,
-      params.use_separable_conv,
-      params.use_batch_norm,
-      batch_norm_relu=batch_norm_relu_generator(params.batch_norm))
-
-
-def shapeprior_head_generator(params):
-  """Generator function for Shapemask head architecture."""
-  raise NotImplementedError('Unimplemented')
-
-
-def coarsemask_head_generator(params):
-  """Generator function for Shapemask head architecture."""
-  raise NotImplementedError('Unimplemented')
-
-
-def finemask_head_generator(params):
-  """Generator function for Shapemask head architecture."""
-  raise NotImplementedError('Unimplemented')
+  head_params = params.mrcnn_head
+  return heads.MaskrcnnHead(
+      params.architecture.num_classes,
+      params.architecture.mask_target_size,
+      head_params.num_convs,
+      head_params.num_filters,
+      head_params.use_separable_conv,
+      params.norm_activation.activation,
+      head_params.use_batch_norm,
+      norm_activation=norm_activation_generator(params.norm_activation))
official/vision/detection/modeling/architecture/fpn.py

@@ -41,8 +41,10 @@ class Fpn(object):
                max_level=7,
                fpn_feat_dims=256,
                use_separable_conv=False,
+               activation='relu',
                use_batch_norm=True,
-               batch_norm_relu=nn_ops.BatchNormRelu):
+               norm_activation=nn_ops.norm_activation_builder(
+                   activation='relu')):
     """FPN initialization function.

     Args:

@@ -52,8 +54,8 @@ class Fpn(object):
       use_separable_conv: `bool`, if True use separable convolution for
         convolution in FPN layers.
       use_batch_norm: 'bool', indicating whether batchnorm layers are added.
-      batch_norm_relu: an operation that includes a batch normalization layer
-        followed by a relu layer(optional).
+      norm_activation: an operation that includes a normalization layer
+        followed by an optional activation layer.
     """
     self._min_level = min_level
     self._max_level = max_level

@@ -63,17 +65,23 @@ class Fpn(object):
           tf.keras.layers.SeparableConv2D, depth_multiplier=1)
     else:
       self._conv2d_op = tf.keras.layers.Conv2D
+    if activation == 'relu':
+      self._activation_op = tf.nn.relu
+    elif activation == 'swish':
+      self._activation_op = tf.nn.swish
+    else:
+      raise ValueError('Unsupported activation `{}`.'.format(activation))
     self._use_batch_norm = use_batch_norm
-    self._batch_norm_relu = batch_norm_relu
+    self._norm_activation = norm_activation

-    self._batch_norm_relus = {}
+    self._norm_activations = {}
     self._lateral_conv2d_op = {}
     self._post_hoc_conv2d_op = {}
     self._coarse_conv2d_op = {}
     for level in range(self._min_level, self._max_level + 1):
       if self._use_batch_norm:
-        self._batch_norm_relus[level] = batch_norm_relu(
-            relu=False, name='p%d-bn' % level)
+        self._norm_activations[level] = norm_activation(
+            use_activation=False, name='p%d-bn' % level)
       self._lateral_conv2d_op[level] = self._conv2d_op(
           filters=self._fpn_feat_dims,
           kernel_size=(1, 1),

@@ -133,11 +141,11 @@ class Fpn(object):
       for level in range(backbone_max_level + 1, self._max_level + 1):
         feats_in = feats[level - 1]
         if level > backbone_max_level + 1:
-          feats_in = tf.nn.relu(feats_in)
+          feats_in = self._activation_op(feats_in)
         feats[level] = self._coarse_conv2d_op[level](feats_in)
     if self._use_batch_norm:
       # Adds batch_norm layer.
       for level in range(self._min_level, self._max_level + 1):
-        feats[level] = self._batch_norm_relus[level](
+        feats[level] = self._norm_activations[level](
             feats[level], is_training=is_training)
     return feats
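The same relu/swish dispatch added to Fpn here is repeated in the heads and in Resnet below. Written as a standalone helper it amounts to the sketch that follows; the modules inline the branch rather than sharing one helper:

import tensorflow as tf

def get_activation_op(activation):
  # Mirrors the inlined dispatch: only `relu` and `swish` are supported.
  if activation == 'relu':
    return tf.nn.relu
  elif activation == 'swish':
    return tf.nn.swish
  raise ValueError('Unsupported activation `{}`.'.format(activation))

print(get_activation_op('swish')(tf.constant([-1.0, 0.0, 1.0])))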
official/vision/detection/modeling/architecture/heads.py

@@ -39,8 +39,10 @@ class RpnHead(tf.keras.layers.Layer):
                num_convs=2,
                num_filters=256,
                use_separable_conv=False,
+               activation='relu',
                use_batch_norm=True,
-               batch_norm_relu=nn_ops.BatchNormRelu):
+               norm_activation=nn_ops.norm_activation_builder(
+                   activation='relu')):
     """Initialize params to build Region Proposal Network head.

     Args:

@@ -55,12 +57,18 @@ class RpnHead(tf.keras.layers.Layer):
       use_separable_conv: `bool`, indicating whether the separable conv layers
         are used.
       use_batch_norm: 'bool', indicating whether batchnorm layers are added.
-      batch_norm_relu: an operation that includes a batch normalization layer
-        followed by a relu layer(optional).
+      norm_activation: an operation that includes a normalization layer
+        followed by an optional activation layer.
     """
     self._min_level = min_level
     self._max_level = max_level
     self._anchors_per_location = anchors_per_location
+    if activation == 'relu':
+      self._activation_op = tf.nn.relu
+    elif activation == 'swish':
+      self._activation_op = tf.nn.swish
+    else:
+      raise ValueError('Unsupported activation `{}`.'.format(activation))
     self._use_batch_norm = use_batch_norm

     if use_separable_conv:

@@ -78,7 +86,7 @@ class RpnHead(tf.keras.layers.Layer):
           num_filters,
           kernel_size=(3, 3),
           strides=(1, 1),
-          activation=(None if self._use_batch_norm else tf.nn.relu),
+          activation=(None if self._use_batch_norm else self._activation_op),
           padding='same',
           name='rpn')
       self._rpn_class_conv = self._conv2d_op(

@@ -94,10 +102,10 @@ class RpnHead(tf.keras.layers.Layer):
           padding='valid',
           name='rpn-box')

-    self._batch_norm_relus = {}
+    self._norm_activations = {}
     if self._use_batch_norm:
       for level in range(self._min_level, self._max_level + 1):
-        self._batch_norm_relus[level] = batch_norm_relu(name='rpn-l%d-bn' %
-                                                        level)
+        self._norm_activations[level] = norm_activation(name='rpn-l%d-bn' %
+                                                        level)

   def _shared_rpn_heads(self, features, anchors_per_location, level,

@@ -106,7 +114,7 @@ class RpnHead(tf.keras.layers.Layer):
     features = self._rpn_conv(features)
     if self._use_batch_norm:
       # The batch normalization layers are not shared between levels.
-      features = self._batch_norm_relus[level](
+      features = self._norm_activations[level](
           features, is_training=is_training)
     # Proposal classification scores
     scores = self._rpn_class_conv(features)

@@ -139,8 +147,10 @@ class FastrcnnHead(tf.keras.layers.Layer):
                use_separable_conv=False,
                num_fcs=2,
                fc_dims=1024,
+               activation='relu',
                use_batch_norm=True,
-               batch_norm_relu=nn_ops.BatchNormRelu):
+               norm_activation=nn_ops.norm_activation_builder(
+                   activation='relu')):
     """Initialize params to build Fast R-CNN box head.

     Args:

@@ -156,8 +166,8 @@ class FastrcnnHead(tf.keras.layers.Layer):
       fc_dims: `int` number that represents the number of dimension of the FC
         layers.
       use_batch_norm: 'bool', indicating whether batchnorm layers are added.
-      batch_norm_relu: an operation that includes a batch normalization layer
-        followed by a relu layer(optional).
+      norm_activation: an operation that includes a normalization layer
+        followed by an optional activation layer.
     """
     self._num_classes = num_classes

@@ -177,9 +187,14 @@ class FastrcnnHead(tf.keras.layers.Layer):
     self._num_fcs = num_fcs
     self._fc_dims = fc_dims
+    if activation == 'relu':
+      self._activation_op = tf.nn.relu
+    elif activation == 'swish':
+      self._activation_op = tf.nn.swish
+    else:
+      raise ValueError('Unsupported activation `{}`.'.format(activation))
     self._use_batch_norm = use_batch_norm
-    self._batch_norm_relu = batch_norm_relu
+    self._norm_activation = norm_activation

     self._conv_ops = []
     self._conv_bn_ops = []

@@ -191,10 +206,10 @@ class FastrcnnHead(tf.keras.layers.Layer):
               strides=(1, 1),
               padding='same',
               dilation_rate=(1, 1),
-              activation=(None if self._use_batch_norm else tf.nn.relu),
+              activation=(None if self._use_batch_norm else self._activation_op),
               name='conv_{}'.format(i)))
       if self._use_batch_norm:
-        self._conv_bn_ops.append(self._batch_norm_relu())
+        self._conv_bn_ops.append(self._norm_activation())

     self._fc_ops = []
     self._fc_bn_ops = []

@@ -202,10 +217,10 @@ class FastrcnnHead(tf.keras.layers.Layer):
       self._fc_ops.append(
           tf.keras.layers.Dense(
               units=self._fc_dims,
-              activation=(None if self._use_batch_norm else tf.nn.relu),
+              activation=(None if self._use_batch_norm else self._activation_op),
               name='fc{}'.format(i)))
       if self._use_batch_norm:
-        self._fc_bn_ops.append(self._batch_norm_relu(fused=False))
+        self._fc_bn_ops.append(self._norm_activation(fused=False))

     self._class_predict = tf.keras.layers.Dense(
         self._num_classes,

@@ -266,8 +281,10 @@ class MaskrcnnHead(tf.keras.layers.Layer):
                num_convs=4,
                num_filters=256,
                use_separable_conv=False,
+               activation='relu',
                use_batch_norm=True,
-               batch_norm_relu=nn_ops.BatchNormRelu):
+               norm_activation=nn_ops.norm_activation_builder(
+                   activation='relu')):
     """Initialize params to build Fast R-CNN head.

     Args:

@@ -280,8 +297,8 @@ class MaskrcnnHead(tf.keras.layers.Layer):
       use_separable_conv: `bool`, indicating whether the separable conv layers
         are used.
       use_batch_norm: 'bool', indicating whether batchnorm layers are added.
-      batch_norm_relu: an operation that includes a batch normalization layer
-        followed by a relu layer(optional).
+      norm_activation: an operation that includes a normalization layer
+        followed by an optional activation layer.
     """
     self._num_classes = num_classes
     self._mask_target_size = mask_target_size

@@ -299,9 +316,14 @@ class MaskrcnnHead(tf.keras.layers.Layer):
         kernel_initializer=tf.keras.initializers.VarianceScaling(
             scale=2, mode='fan_out', distribution='untruncated_normal'),
         bias_initializer=tf.zeros_initializer())
+    if activation == 'relu':
+      self._activation_op = tf.nn.relu
+    elif activation == 'swish':
+      self._activation_op = tf.nn.swish
+    else:
+      raise ValueError('Unsupported activation `{}`.'.format(activation))
     self._use_batch_norm = use_batch_norm
-    self._batch_norm_relu = batch_norm_relu
+    self._norm_activation = norm_activation
     self._conv2d_ops = []
     for i in range(self._num_convs):
       self._conv2d_ops.append(

@@ -311,14 +333,14 @@ class MaskrcnnHead(tf.keras.layers.Layer):
               strides=(1, 1),
               padding='same',
               dilation_rate=(1, 1),
-              activation=(None if self._use_batch_norm else tf.nn.relu),
+              activation=(None if self._use_batch_norm else self._activation_op),
               name='mask-conv-l%d' % i))
     self._mask_conv_transpose = tf.keras.layers.Conv2DTranspose(
         self._num_filters,
         kernel_size=(2, 2),
         strides=(2, 2),
         padding='valid',
-        activation=(None if self._use_batch_norm else tf.nn.relu),
+        activation=(None if self._use_batch_norm else self._activation_op),
         kernel_initializer=tf.keras.initializers.VarianceScaling(
             scale=2, mode='fan_out', distribution='untruncated_normal'),
         bias_initializer=tf.zeros_initializer(),

@@ -353,11 +375,11 @@ class MaskrcnnHead(tf.keras.layers.Layer):
       for i in range(self._num_convs):
         net = self._conv2d_ops[i](net)
         if self._use_batch_norm:
-          net = self._batch_norm_relu()(net, is_training=is_training)
+          net = self._norm_activation()(net, is_training=is_training)

       net = self._mask_conv_transpose(net)
       if self._use_batch_norm:
-        net = self._batch_norm_relu()(net, is_training=is_training)
+        net = self._norm_activation()(net, is_training=is_training)

       mask_outputs = self._conv2d_op(
           self._num_classes,

@@ -398,7 +420,8 @@ class RetinanetHead(object):
                num_convs=4,
                num_filters=256,
                use_separable_conv=False,
-               batch_norm_relu=nn_ops.BatchNormRelu):
+               norm_activation=nn_ops.norm_activation_builder(
+                   activation='relu')):
     """Initialize params to build RetinaNet head.

     Args:

@@ -411,8 +434,8 @@ class RetinanetHead(object):
       num_filters: `int` number of filters used in the head architecture.
       use_separable_conv: `bool` to indicate whether to use separable
         convolution.
-      batch_norm_relu: an operation that includes a batch normalization layer
-        followed by a relu layer(optional).
+      norm_activation: an operation that includes a normalization layer
+        followed by an optional activation layer.
     """
     self._min_level = min_level
     self._max_level = max_level

@@ -423,13 +446,12 @@ class RetinanetHead(object):
     self._num_convs = num_convs
     self._num_filters = num_filters
     self._use_separable_conv = use_separable_conv
-
     with tf.name_scope('class_net') as scope_name:
       self._class_name_scope = tf.name_scope(scope_name)
     with tf.name_scope('box_net') as scope_name:
       self._box_name_scope = tf.name_scope(scope_name)
-    self._build_class_net_layers(batch_norm_relu)
-    self._build_box_net_layers(batch_norm_relu)
+    self._build_class_net_layers(norm_activation)
+    self._build_box_net_layers(norm_activation)

   def _class_net_batch_norm_name(self, i, level):
     return 'class-%d-%d' % (i, level)

@@ -437,7 +459,7 @@ class RetinanetHead(object):
   def _box_net_batch_norm_name(self, i, level):
     return 'box-%d-%d' % (i, level)

-  def _build_class_net_layers(self, batch_norm_relu):
+  def _build_class_net_layers(self, norm_activation):
     """Build re-usable layers for class prediction network."""
     if self._use_separable_conv:
       self._class_predict = tf.keras.layers.SeparableConv2D(

@@ -455,7 +477,7 @@ class RetinanetHead(object):
           padding='same',
           name='class-predict')
     self._class_conv = []
-    self._class_batch_norm_relu = {}
+    self._class_norm_activation = {}
     for i in range(self._num_convs):
       if self._use_separable_conv:
         self._class_conv.append(

@@ -479,9 +501,9 @@ class RetinanetHead(object):
               name='class-' + str(i)))
       for level in range(self._min_level, self._max_level + 1):
         name = self._class_net_batch_norm_name(i, level)
-        self._class_batch_norm_relu[name] = batch_norm_relu(name=name)
+        self._class_norm_activation[name] = norm_activation(name=name)

-  def _build_box_net_layers(self, batch_norm_relu):
+  def _build_box_net_layers(self, norm_activation):
     """Build re-usable layers for box prediction network."""
     if self._use_separable_conv:
       self._box_predict = tf.keras.layers.SeparableConv2D(

@@ -499,7 +521,7 @@ class RetinanetHead(object):
           padding='same',
           name='box-predict')
     self._box_conv = []
-    self._box_batch_norm_relu = {}
+    self._box_norm_activation = {}
     for i in range(self._num_convs):
       if self._use_separable_conv:
         self._box_conv.append(

@@ -523,13 +545,13 @@ class RetinanetHead(object):
               name='box-' + str(i)))
       for level in range(self._min_level, self._max_level + 1):
         name = self._box_net_batch_norm_name(i, level)
-        self._box_batch_norm_relu[name] = batch_norm_relu(name=name)
+        self._box_norm_activation[name] = norm_activation(name=name)

   def __call__(self, fpn_features, is_training=None):
     """Returns outputs of RetinaNet head."""
     class_outputs = {}
     box_outputs = {}
-    with backend.get_graph().as_default(), tf.name_scope('retinanet'):
+    with backend.get_graph().as_default(), tf.name_scope('retinanet_head'):
       for level in range(self._min_level, self._max_level + 1):
         features = fpn_features[level]

@@ -548,7 +570,7 @@ class RetinanetHead(object):
           # each level has its batch normalization to capture the statistical
           # difference among different levels.
           name = self._class_net_batch_norm_name(i, level)
-          features = self._class_batch_norm_relu[name](
+          features = self._class_norm_activation[name](
               features, is_training=is_training)

         classes = self._class_predict(features)

@@ -563,7 +585,7 @@ class RetinanetHead(object):
           # each level has its batch normalization to capture the statistical
           # difference among different levels.
           name = self._box_net_batch_norm_name(i, level)
-          features = self._box_batch_norm_relu[name](
+          features = self._box_norm_activation[name](
               features, is_training=is_training)

         boxes = self._box_predict(features)

@@ -953,13 +975,13 @@ class ShapemaskCoarsemaskHead(object):
   def coarsemask_decoder_net(self,
                              images,
                              is_training=None,
-                             batch_norm_relu=nn_ops.BatchNormRelu):
+                             norm_activation=nn_ops.norm_activation_builder()):
     """Coarse mask decoder network architecture.

     Args:
       images: A tensor of size [batch, height_in, width_in, channels_in].
      is_training: Whether batch_norm layers are in training mode.
-      batch_norm_relu: an operation that includes a batch normalization layer
+      norm_activation: an operation that includes a batch normalization layer
        followed by a relu layer(optional).

    Returns:
      images: A feature tensor of size [batch, output_size, output_size,

@@ -975,7 +997,7 @@ class ShapemaskCoarsemaskHead(object):
           padding='same',
           name='coarse-class-%d' % i)(images)
-      images = batch_norm_relu(name='coarse-class-%d-bn' % i)(
+      images = norm_activation(name='coarse-class-%d-bn' % i)(
           images, is_training=is_training)

     return images

@@ -991,7 +1013,7 @@ class ShapemaskFinemaskHead(object):
                num_convs,
                coarse_mask_thr,
                gt_upsample_scale,
-               batch_norm_relu=nn_ops.BatchNormRelu):
+               norm_activation=nn_ops.norm_activation_builder()):
     """Initialize params to build ShapeMask coarse and fine prediction head.

     Args:

@@ -1002,7 +1024,7 @@ class ShapemaskFinemaskHead(object):
         layer.
       coarse_mask_thr: the threshold for suppressing noisy coarse prediction.
       gt_upsample_scale: scale for upsampling groundtruths.
-      batch_norm_relu: an operation that includes a batch normalization layer
+      norm_activation: an operation that includes a batch normalization layer
         followed by a relu layer(optional).
     """
     self._mask_num_classes = num_classes

@@ -1038,7 +1060,7 @@ class ShapemaskFinemaskHead(object):
               activation=None,
               padding='same',
               name='fine-class-%d' % i))
-      self._fine_class_bn.append(batch_norm_relu(name='fine-class-%d-bn' % i))
+      self._fine_class_bn.append(norm_activation(name='fine-class-%d-bn' % i))

   def __call__(self, prior_conditioned_features, class_probs, is_training=None):
     """Generate instance masks from FPN features and detection priors.
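As the comments in RetinanetHead note, convolution weights are shared across pyramid levels while each (conv index, level) pair keeps its own normalization layer. A standalone sketch of that bookkeeping, with a plain BatchNormalization layer standing in for norm_activation:

import tensorflow as tf

num_convs, min_level, max_level = 2, 3, 5
norm_layers = {}
for i in range(num_convs):
  for level in range(min_level, max_level + 1):
    # The key mirrors _class_net_batch_norm_name above.
    name = 'class-%d-%d' % (i, level)
    norm_layers[name] = tf.keras.layers.BatchNormalization(name=name)
print(sorted(norm_layers))  # six independent layers: 2 convs x 3 levels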
official/vision/detection/modeling/architecture/nn_ops.py

@@ -18,20 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import functools
+
 from absl import logging
 import tensorflow.compat.v2 as tf
 from tensorflow.python.keras import backend


-class BatchNormRelu(tf.keras.layers.Layer):
-  """Combined Batch Normalization and ReLU layers."""
+class NormActivation(tf.keras.layers.Layer):
+  """Combined Normalization and Activation layers."""

   def __init__(self,
                momentum=0.997,
                epsilon=1e-4,
                trainable=True,
-               relu=True,
                init_zero=False,
+               use_activation=True,
+               activation='relu',
                fused=True,
                name=None):
     """A class to construct layers for a batch normalization followed by a ReLU.

@@ -39,22 +40,24 @@ class BatchNormRelu(tf.keras.layers.Layer):
     Args:
       momentum: momentum for the moving average.
       epsilon: small float added to variance to avoid dividing by zero.
-      trainable: `boolean`, if True also add variables to the graph collection
+      trainable: `bool`, if True also add variables to the graph collection
         GraphKeys.TRAINABLE_VARIABLES. If False, freeze batch normalization
         layer.
-      relu: `bool` if False, omits the ReLU operation.
       init_zero: `bool` if True, initializes scale parameter of batch
         normalization with 0. If False, initialize it with 1.
       fused: `bool` fused option in batch normalization.
+      use_activation: `bool`, whether to add the optional activation layer after
+        the batch normalization layer.
+      activation: 'string', the type of the activation layer. Currently support
+        `relu` and `swish`.
       name: `str` name for the operation.
     """
-    super(BatchNormRelu, self).__init__(trainable=trainable)
-    self._use_relu = relu
+    super(NormActivation, self).__init__(trainable=trainable)
     if init_zero:
       gamma_initializer = tf.keras.initializers.Zeros()
     else:
       gamma_initializer = tf.keras.initializers.Ones()
-    self._batch_norm_op = tf.keras.layers.BatchNormalization(
+    self._normalization_op = tf.keras.layers.BatchNormalization(
         momentum=momentum,
         epsilon=epsilon,
         center=True,

@@ -63,9 +66,16 @@ class BatchNormRelu(tf.keras.layers.Layer):
         fused=fused,
         gamma_initializer=gamma_initializer,
         name=name)
+    self._use_activation = use_activation
+    if activation == 'relu':
+      self._activation_op = tf.nn.relu
+    elif activation == 'swish':
+      self._activation_op = tf.nn.swish
+    else:
+      raise ValueError('Unsupported activation `{}`.'.format(activation))

   def __call__(self, inputs, is_training=None):
-    """Builds layers for a batch normalization followed by a ReLU.
+    """Builds the normalization layer followed by an optional activation layer.

     Args:
       inputs: `Tensor` of shape `[batch, channels, ...]`.

@@ -78,9 +88,22 @@ class BatchNormRelu(tf.keras.layers.Layer):
     # from keras.Model.training
     if is_training and self.trainable:
       is_training = True
-    inputs = self._batch_norm_op(inputs, training=is_training)
-    if self._use_relu:
-      inputs = tf.nn.relu(inputs)
+    inputs = self._normalization_op(inputs, training=is_training)
+    if self._use_activation:
+      inputs = self._activation_op(inputs)
     return inputs
+
+
+def norm_activation_builder(momentum=0.997,
+                            epsilon=1e-4,
+                            trainable=True,
+                            activation='relu',
+                            **kwargs):
+  return functools.partial(
+      NormActivation,
+      momentum=momentum,
+      epsilon=epsilon,
+      trainable=trainable,
+      activation='relu',
+      **kwargs)
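norm_activation_builder returns a functools.partial over NormActivation, so call sites bind the shared hyperparameters once and still pass per-use kwargs (name=, init_zero=, use_activation=) later. The same pattern in isolation, with a Keras layer standing in for NormActivation:

import functools
import tensorflow as tf

builder = functools.partial(
    tf.keras.layers.BatchNormalization, momentum=0.997, epsilon=1e-4)
bn_p3 = builder(name='p3-bn')  # per-call kwargs layer on top of the shared ones
bn_p4 = builder(name='p4-bn')
print(bn_p3.momentum, bn_p4.name)  # 0.997 p4-bn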
official/vision/detection/modeling/architecture/resnet.py

@@ -34,21 +34,27 @@ class Resnet(object):
   def __init__(self,
                resnet_depth,
-               batch_norm_relu=nn_ops.BatchNormRelu,
+               activation='relu',
+               norm_activation=nn_ops.norm_activation_builder(
+                   activation='relu'),
                data_format='channels_last'):
     """ResNet initialization function.

     Args:
       resnet_depth: `int` depth of ResNet backbone model.
-      batch_norm_relu: an operation that includes a batch normalization layer
-        followed by a relu layer(optional).
+      norm_activation: an operation that includes a normalization layer
+        followed by an optional activation layer.
       data_format: `str` either "channels_first" for `[batch, channels, height,
         width]` or "channels_last" for `[batch, height, width, channels]`.
     """
     self._resnet_depth = resnet_depth
-    self._batch_norm_relu = batch_norm_relu
+    if activation == 'relu':
+      self._activation_op = tf.nn.relu
+    elif activation == 'swish':
+      self._activation_op = tf.nn.swish
+    else:
+      raise ValueError('Unsupported activation `{}`.'.format(activation))
+    self._norm_activation = norm_activation
     self._data_format = data_format

     model_params = {

@@ -170,19 +176,19 @@ class Resnet(object):
       # Projection shortcut in first layer to match filters and strides
       shortcut = self.conv2d_fixed_padding(
           inputs=inputs, filters=filters, kernel_size=1, strides=strides)
-      shortcut = self._batch_norm_relu(relu=False)(
+      shortcut = self._norm_activation(use_activation=False)(
           shortcut, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=3, strides=strides)
-    inputs = self._batch_norm_relu()(inputs, is_training=is_training)
+    inputs = self._norm_activation()(inputs, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=3, strides=1)
-    inputs = self._batch_norm_relu(relu=False, init_zero=True)(
-        inputs, is_training=is_training)
+    inputs = self._norm_activation(use_activation=False, init_zero=True)(
+        inputs, is_training=is_training)

-    return tf.nn.relu(inputs + shortcut)
+    return self._activation_op(inputs + shortcut)

@@ -214,24 +220,23 @@ class Resnet(object):
       filters_out = 4 * filters
       shortcut = self.conv2d_fixed_padding(
           inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
-      shortcut = self._batch_norm_relu(relu=False)(
+      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=1, strides=1)
-    inputs = self._batch_norm_relu()(inputs, is_training=is_training)
+    inputs = self._norm_activation()(inputs, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=3, strides=strides)
-    inputs = self._batch_norm_relu()(inputs, is_training=is_training)
+    inputs = self._norm_activation()(inputs, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
-    inputs = self._batch_norm_relu(relu=False, init_zero=True)(
+    inputs = self._norm_activation(use_activation=False, init_zero=True)(
         inputs, is_training=is_training)

-    return tf.nn.relu(inputs + shortcut)
+    return self._activation_op(inputs + shortcut)

@@ -279,7 +284,7 @@ class Resnet(object):
     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=64, kernel_size=7, strides=2)
     inputs = tf.identity(inputs, 'initial_conv')
-    inputs = self._batch_norm_relu()(inputs, is_training=is_training)
+    inputs = self._norm_activation()(inputs, is_training=is_training)

     inputs = tf.keras.layers.MaxPool2D(
         pool_size=3, strides=2, padding='SAME',
official/vision/detection/modeling/base_model.py

@@ -24,37 +24,7 @@ import re
 import tensorflow.compat.v2 as tf

 from official.vision.detection.modeling import checkpoint_utils
 from official.vision.detection.modeling import learning_rates
-
-
-class OptimizerFactory(object):
-  """Class to generate optimizer function."""
-
-  def __init__(self, params):
-    """Creates optimizer based on the specified flags."""
-    if params.type == 'momentum':
-      nesterov = False
-      try:
-        nesterov = params.nesterov
-      except AttributeError:
-        pass
-      self._optimizer = functools.partial(
-          tf.keras.optimizers.SGD,
-          momentum=params.momentum,
-          nesterov=nesterov)
-    elif params.type == 'adam':
-      self._optimizer = tf.keras.optimizers.Adam
-    elif params.type == 'adadelta':
-      self._optimizer = tf.keras.optimizers.Adadelta
-    elif params.type == 'adagrad':
-      self._optimizer = tf.keras.optimizers.Adagrad
-    elif params.type == 'rmsprop':
-      self._optimizer = functools.partial(
-          tf.keras.optimizers.RMSprop, momentum=params.momentum)
-    else:
-      raise ValueError('Unsupported optimizer type %s.' % self._optimizer)
-
-  def __call__(self, learning_rate):
-    return self._optimizer(learning_rate=learning_rate)
+from official.vision.detection.modeling import optimizers


 def _make_filter_trainable_variables_fn(frozen_variable_prefix):

@@ -73,7 +43,8 @@ def _make_filter_trainable_variables_fn(frozen_variable_prefix):
   # the frozen variables' names.
   filtered_variables = [
-      v for v in variables if not re.match(frozen_variable_prefix, v.name)
+      v for v in variables if not frozen_variable_prefix or
+      not re.match(frozen_variable_prefix, v.name)
   ]
   return filtered_variables

@@ -94,9 +65,9 @@ class Model(object):
       tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

     # Optimization.
-    self._optimizer_fn = OptimizerFactory(params.train.optimizer)
+    self._optimizer_fn = optimizers.OptimizerFactory(params.train.optimizer)
     self._learning_rate = learning_rates.learning_rate_generator(
-        params.train.learning_rate)
+        params.train.total_steps, params.train.learning_rate)

     self._frozen_variable_prefix = params.train.frozen_variable_prefix
     self._regularization_var_regex = params.train.regularization_variable_regex
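The rewritten filter short-circuits when frozen_variable_prefix is empty or None, so such models train all variables; previously re.match('') matched every name and froze everything. A standalone check against the prefix regex defined in retinanet_config.py before this change (variable names are illustrative):

import re

RESNET_FROZEN_VAR_PREFIX = r'(resnet\d+)\/(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'

names = ['resnet50/conv2d/kernel:0', 'resnet50/conv2d_24/kernel:0']
trainable = [
    n for n in names
    if not RESNET_FROZEN_VAR_PREFIX or not re.match(RESNET_FROZEN_VAR_PREFIX, n)
]
print(trainable)  # only conv2d_24 survives; the early conv layers stay frozen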
official/vision/detection/modeling/learning_rates.py

@@ -28,9 +28,10 @@ from official.modeling.hyperparams import params_dict
 class StepLearningRateWithLinearWarmup(
     tf.keras.optimizers.schedules.LearningRateSchedule):
   """Class to generate learning rate tensor."""

-  def __init__(self, params):
+  def __init__(self, total_steps, params):
     """Creates the step learning rate tensor with linear warmup."""
     super(StepLearningRateWithLinearWarmup, self).__init__()
+    self._total_steps = total_steps
     assert isinstance(params, (dict, params_dict.ParamsDict))
     if isinstance(params, dict):
       params = params_dict.ParamsDict(params)

@@ -59,9 +60,10 @@ class StepLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningRat
 class CosineLearningRateWithLinearWarmup(
     tf.keras.optimizers.schedules.LearningRateSchedule):
   """Class to generate learning rate tensor."""

-  def __init__(self, params):
+  def __init__(self, total_steps, params):
     """Creates the cosine learning rate tensor with linear warmup."""
     super(CosineLearningRateWithLinearWarmup, self).__init__()
+    self._total_steps = total_steps
     assert isinstance(params, (dict, params_dict.ParamsDict))
     if isinstance(params, dict):
       params = params_dict.ParamsDict(params)

@@ -72,7 +74,7 @@ class CosineLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningR
     warmup_lr = self._params.warmup_learning_rate
     warmup_steps = self._params.warmup_steps
     init_lr = self._params.init_learning_rate
-    total_steps = self._params.total_steps
+    total_steps = self._total_steps
     linear_warmup = (
         warmup_lr + global_step / warmup_steps * (init_lr - warmup_lr))
     cosine_learning_rate = (

@@ -86,11 +88,11 @@ class CosineLearningRateWithLinearWarmup(tf.keras.optimizers.schedules.LearningR
     return {'_params': self._params.as_dict()}


-def learning_rate_generator(params):
+def learning_rate_generator(total_steps, params):
   """The learning rate function generator."""
   if params.type == 'step':
-    return StepLearningRateWithLinearWarmup(params)
+    return StepLearningRateWithLinearWarmup(total_steps, params)
   elif params.type == 'cosine':
-    return CosineLearningRateWithLinearWarmup(params)
+    return CosineLearningRateWithLinearWarmup(total_steps, params)
   else:
     raise ValueError('Unsupported learning rate type: {}.'.format(params.type))
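For intuition, a standalone sketch of the schedule shape (linear warmup, then decay); the constants mirror the retinanet defaults, and the cosine term is one common form since the exact decay expression is truncated in this diff:

import math

warmup_lr, warmup_steps = 0.0067, 500
init_lr, total_steps = 0.08, 22500

def learning_rate(step):
  if step < warmup_steps:
    return warmup_lr + step / warmup_steps * (init_lr - warmup_lr)
  # Assumed cosine form: decays from init_lr toward 0 over the remaining steps.
  frac = (step - warmup_steps) / (total_steps - warmup_steps)
  return 0.5 * init_lr * (1 + math.cos(math.pi * frac))

print(learning_rate(0), learning_rate(500), learning_rate(22500))  # 0.0067 0.08 ~0.0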
official/vision/detection/modeling/losses.py

@@ -371,8 +371,8 @@ class MaskrcnnLoss(object):
 class RetinanetClassLoss(object):
   """RetinaNet class loss."""

-  def __init__(self, params):
-    self._num_classes = params.num_classes
+  def __init__(self, params, num_classes):
+    self._num_classes = num_classes
     self._focal_loss_alpha = params.focal_loss_alpha
     self._focal_loss_gamma = params.focal_loss_gamma
official/vision/detection/modeling/maskrcnn_model.py

@@ -49,14 +49,16 @@ class MaskrcnnModel(base_model.Model):
     # Architecture generators.
     self._backbone_fn = factory.backbone_generator(params)
     self._fpn_fn = factory.multilevel_features_generator(params)
-    self._rpn_head_fn = factory.rpn_head_generator(params.rpn_head)
+    self._rpn_head_fn = factory.rpn_head_generator(params)
     self._generate_rois_fn = roi_ops.ROIGenerator(params.roi_proposal)
     self._sample_rois_fn = sampling_ops.ROISampler(params.roi_sampling)
-    self._sample_masks_fn = sampling_ops.MaskSampler(params.mask_sampling)
+    self._sample_masks_fn = sampling_ops.MaskSampler(
+        params.architecture.mask_target_size,
+        params.mask_sampling.num_mask_samples_per_image)

-    self._frcnn_head_fn = factory.fast_rcnn_head_generator(params.frcnn_head)
+    self._frcnn_head_fn = factory.fast_rcnn_head_generator(params)
     if self._include_mask:
-      self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params.mrcnn_head)
+      self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params)

     # Loss function.
     self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)

@@ -91,8 +93,8 @@ class MaskrcnnModel(base_model.Model):
         tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                               rpn_box_outputs),
     })
-    input_anchor = anchor.Anchor(self._params.anchor.min_level,
-                                 self._params.anchor.max_level,
+    input_anchor = anchor.Anchor(self._params.architecture.min_level,
+                                 self._params.architecture.max_level,
                                  self._params.anchor.num_scales,
                                  self._params.anchor.aspect_ratios,
                                  self._params.anchor.anchor_size,
official/vision/detection/modeling/optimizers.py (new file, mode 100644)

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

import numpy as np
import tensorflow.compat.v2 as tf


class OptimizerFactory(object):
  """Class to generate optimizer function."""

  def __init__(self, params):
    """Creates optimizer based on the specified flags."""
    if params.type == 'momentum':
      self._optimizer = functools.partial(
          tf.keras.optimizers.SGD,
          momentum=params.momentum,
          nesterov=params.nesterov)
    elif params.type == 'adam':
      self._optimizer = tf.keras.optimizers.Adam
    elif params.type == 'adadelta':
      self._optimizer = tf.keras.optimizers.Adadelta
    elif params.type == 'adagrad':
      self._optimizer = tf.keras.optimizers.Adagrad
    elif params.type == 'rmsprop':
      self._optimizer = functools.partial(
          tf.keras.optimizers.RMSprop, momentum=params.momentum)
    else:
      raise ValueError('Unsupported optimizer type `{}`.'.format(params.type))

  def __call__(self, learning_rate):
    return self._optimizer(learning_rate=learning_rate)
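A hedged usage sketch for the factory above; types.SimpleNamespace stands in for the ParamsDict that base_model.py passes as params.train.optimizer. Note that nesterov is now read unconditionally (the old try/except fallback in base_model.py was dropped), so momentum configs must define it:

from types import SimpleNamespace

opt_params = SimpleNamespace(type='momentum', momentum=0.9, nesterov=True)
factory = OptimizerFactory(opt_params)  # the class defined above
sgd = factory(learning_rate=0.08)
print(type(sgd).__name__)  # SGD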
official/vision/detection/modeling/retinanet_model.py

@@ -44,16 +44,19 @@ class RetinanetModel(base_model.Model):
     # Architecture generators.
     self._backbone_fn = factory.backbone_generator(params)
     self._fpn_fn = factory.multilevel_features_generator(params)
-    self._head_fn = factory.retinanet_head_generator(params.retinanet_head)
+    self._head_fn = factory.retinanet_head_generator(params)

     # Loss function.
-    self._cls_loss_fn = losses.RetinanetClassLoss(params.retinanet_loss)
+    self._cls_loss_fn = losses.RetinanetClassLoss(
+        params.retinanet_loss, params.architecture.num_classes)
     self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
     self._box_loss_weight = params.retinanet_loss.box_loss_weight
     self._keras_model = None

     # Predict function.
     self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
+        params.architecture.min_level,
+        params.architecture.max_level,
         params.postprocess)

     self._transpose_input = params.train.transpose_input
official/vision/detection/ops/__init__.py (new file, mode 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
official/vision/detection/ops/postprocess_ops.py
View file @
965cc3ee
...
...
@@ -294,10 +294,10 @@ def _generate_detections_batched(boxes,

 class MultilevelDetectionGenerator(object):
   """Generates detected boxes with scores and classes for one-stage detector."""

-  def __init__(self, params):
+  def __init__(self, min_level, max_level, params):
+    self._min_level = min_level
+    self._max_level = max_level
     self._generate_detections = generate_detections_factory(params)
-    self._min_level = params.min_level
-    self._max_level = params.max_level

   def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape):
     # Collects outputs from all levels into a list.
...
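The two hunks above move `min_level` and `max_level` out of the postprocess params and into explicit constructor arguments. A minimal sketch of the new call site; the level values and the `postprocess_params` name are illustrative, not from this page:

```
from official.vision.detection.ops import postprocess_ops

# Levels 3-7 are common RetinaNet defaults; in the model these come from
# params.architecture. `postprocess_params` stands in for params.postprocess.
generator = postprocess_ops.MultilevelDetectionGenerator(
    min_level=3, max_level=7, params=postprocess_params)
```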
official/vision/detection/ops/sampling_ops.py
View file @ 965cc3ee
...
...
@@ -346,9 +346,9 @@ class ROISampler(object):

 class MaskSampler(object):
   """Samples and creates mask training targets."""

-  def __init__(self, params):
-    self._num_mask_samples_per_image = params.num_mask_samples_per_image
-    self._mask_target_size = params.mask_target_size
+  def __init__(self, mask_target_size, num_mask_samples_per_image):
+    self._mask_target_size = mask_target_size
+    self._num_mask_samples_per_image = num_mask_samples_per_image

   def __call__(self,
                candidate_rois,
...
...
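A sketch of the new `MaskSampler` construction; 28 and 128 are typical Mask R-CNN values, used here purely as an illustration:

```
from official.vision.detection.ops import sampling_ops

mask_sampler = sampling_ops.MaskSampler(
    mask_target_size=28, num_mask_samples_per_image=128)
```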
official/vision/detection/ops/spatial_transform_ops.py
View file @ 965cc3ee
...
...
@@ -48,12 +48,143 @@ def nearest_upsampling(data, scale):
     return tf.reshape(data, [bs, h * scale, w * scale, c])


def feature_bilinear_interpolation(features, kernel_y, kernel_x):
  """Feature bilinear interpolation.

  The RoIAlign feature f can be computed by bilinear interpolation
  of four neighboring feature points f0, f1, f2, and f3.

  f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
                        [f10, f11]]
  f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
  f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
  kernel_y = [hy, ly]
  kernel_x = [hx, lx]

  Args:
    features: The features are in shape of [batch_size, num_boxes, output_size *
      2, output_size * 2, num_filters].
    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].

  Returns:
    A 5-D tensor representing feature crop of shape
    [batch_size, num_boxes, output_size, output_size, num_filters].
  """
  (batch_size, num_boxes, output_size, _,
   num_filters) = features.get_shape().as_list()
  output_size = output_size // 2
  kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1])
  kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2])
  # Use implicit broadcast to generate the interpolation kernel. The
  # multiplier `4` is for avg pooling.
  interpolation_kernel = kernel_y * kernel_x * 4

  # Interpolate the gathered features with computed interpolation kernels.
  features *= tf.cast(
      tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype)
  features = tf.reshape(
      features,
      [batch_size * num_boxes, output_size * 2, output_size * 2, num_filters])
  features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
  features = tf.reshape(
      features, [batch_size, num_boxes, output_size, output_size, num_filters])
  return features
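As a sanity check of the kernel construction above, a scalar example (not from the source): the `kernel_y * kernel_x * 4` weighting followed by 2x2 average pooling reproduces plain bilinear interpolation, which is why the multiplier `4` appears.

```
# Scalar bilinear check: f(y, x) = hy*hx*f00 + hy*lx*f01 + ly*hx*f10 + ly*lx*f11.
ly, lx = 0.3, 0.25          # subpixel offsets below/right of the integer grid
hy, hx = 1.0 - ly, 1.0 - lx
f00, f01, f10, f11 = 1.0, 2.0, 3.0, 4.0

# The op multiplies each corner by its kernel weight (times 4), then avg-pools.
weighted = [hy * hx * f00 * 4, hy * lx * f01 * 4,
            ly * hx * f10 * 4, ly * lx * f11 * 4]
avg_pooled = sum(weighted) / 4.0
direct = hy * hx * f00 + hy * lx * f01 + ly * hx * f10 + ly * lx * f11
assert abs(avg_pooled - direct) < 1e-9
```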
def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
  """Compute the grid position w.r.t. the corresponding feature map.

  Args:
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
      information of each box w.r.t. the corresponding feature map.
      boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
      corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
      in terms of the number of pixels of the corresponding feature map size.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
      the boundary (in (y, x)) of the corresponding feature map for each box.
      Any resampled grid points that go beyond the boundary will be clipped.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float number in [0, 1] indicates the subpixel sample offset
      from grid point.

  Returns:
    kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
    kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
    box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
    box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
  """
  batch_size, num_boxes, _ = boxes.get_shape().as_list()
  box_grid_x = []
  box_grid_y = []
  for i in range(output_size):
    box_grid_x.append(boxes[:, :, 1] +
                      (i + sample_offset) * boxes[:, :, 3] / output_size)
    box_grid_y.append(boxes[:, :, 0] +
                      (i + sample_offset) * boxes[:, :, 2] / output_size)
  box_grid_x = tf.stack(box_grid_x, axis=2)
  box_grid_y = tf.stack(box_grid_y, axis=2)

  box_grid_y0 = tf.floor(box_grid_y)
  box_grid_x0 = tf.floor(box_grid_x)
  box_grid_x0 = tf.maximum(0., box_grid_x0)
  box_grid_y0 = tf.maximum(0., box_grid_y0)

  box_grid_x0 = tf.minimum(box_grid_x0,
                           tf.expand_dims(boundaries[:, :, 1], -1))
  box_grid_x1 = tf.minimum(box_grid_x0 + 1,
                           tf.expand_dims(boundaries[:, :, 1], -1))
  box_grid_y0 = tf.minimum(box_grid_y0,
                           tf.expand_dims(boundaries[:, :, 0], -1))
  box_grid_y1 = tf.minimum(box_grid_y0 + 1,
                           tf.expand_dims(boundaries[:, :, 0], -1))

  box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1)
  box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1)

  # The RoIAlign feature f can be computed by bilinear interpolation of four
  # neighboring feature points f0, f1, f2, and f3.
  # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
  #                       [f10, f11]]
  # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
  # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
  ly = box_grid_y - box_grid_y0
  lx = box_grid_x - box_grid_x0
  hy = 1.0 - ly
  hx = 1.0 - lx
  kernel_y = tf.reshape(
      tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1])
  kernel_x = tf.reshape(
      tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1])
  return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
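A tiny smoke test of the helper (shapes and values are illustrative): one 4x4 box at the origin of an 8x8 feature map, cropped to a 2x2 grid with center sampling.

```
import tensorflow.compat.v2 as tf

boxes = tf.constant([[[0.0, 0.0, 4.0, 4.0]]])   # [batch=1, num_boxes=1, 4]
boundaries = tf.constant([[[7.0, 7.0]]])        # last valid (y, x) per box
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
    boxes, boundaries, output_size=2, sample_offset=0.5)
print(box_gridy0y1.shape)  # (1, 1, 2, 2): y0 and y1 for each of 2 output rows
```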
def get_grid_one_hot(box_gridy0y1, box_gridx0x1, feature_height, feature_width):
  """Get grid_one_hot from indices and feature_size."""
  (batch_size, num_boxes, output_size, _) = box_gridx0x1.get_shape().as_list()
  y_indices = tf.cast(
      tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size, 2]),
      dtype=tf.int32)
  x_indices = tf.cast(
      tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size, 2]),
      dtype=tf.int32)

  # shape is [batch_size, num_boxes, output_size, 2, height]
  grid_y_one_hot = tf.one_hot(tf.cast(y_indices, tf.int32), feature_height)
  # shape is [batch_size, num_boxes, output_size, 2, width]
  grid_x_one_hot = tf.one_hot(tf.cast(x_indices, tf.int32), feature_width)

  return grid_y_one_hot, grid_x_one_hot
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
-                             sample_offset=0.5):
+                             sample_offset=0.5,
+                             use_einsum_gather=False):
  """Crop and resize boxes on a set of feature maps.

  Given multiple features maps indexed by different levels, and a set of boxes
...
...
@@ -67,7 +198,7 @@ def selective_crop_and_resize(features,
   pixel.

   For performance, we perform the gather and interpolation on all layers as a
-  single operation. This is op the multi-level features are first stacked and
+  single operation. In this op the multi-level features are first stacked and
   gathered into [2*output_size, 2*output_size] feature points. Then bilinear
   interpolation is performed on the gathered feature points to generate
   [output_size, output_size] RoIAlign feature map.
...
...
@@ -86,14 +217,13 @@ def selective_crop_and_resize(features,
     output_size.

   Args:
-    features: a 5-D tensor of shape
-      [batch_size, num_levels, max_height, max_width, num_filters] where
-      cropping and resizing are based.
+    features: a 5-D tensor of shape [batch_size, num_levels, max_height,
+      max_width, num_filters] where cropping and resizing are based.
     boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
       information of each box w.r.t. the corresponding feature map.
       boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
       corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
       in terms of the number of pixels of the corresponding feature map size.
     box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
       the 0-based corresponding feature level index of each box.
     boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
...
...
@@ -102,6 +232,10 @@ def selective_crop_and_resize(features,
     output_size: a scalar indicating the output crop size.
     sample_offset: a float number in [0, 1] indicates the subpixel sample offset
      from grid point.
+    use_einsum_gather: whether to use einsum instead of gather. Replacing gather
+      with einsum can improve performance when the feature size is not large;
+      einsum is also friendly to model partition. Gather's performance is better
+      when the feature size is very large and there are multiple box levels.

   Returns:
     features_per_box: a 5-D tensor of shape
...
...
@@ -112,93 +246,77 @@ def selective_crop_and_resize(features,
  (batch_size, num_levels, max_feature_height, max_feature_width,
   num_filters) = features.get_shape().as_list()
  _, num_boxes, _ = boxes.get_shape().as_list()

  kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
      boxes, boundaries, output_size, sample_offset)

  x_indices = tf.cast(
      tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
      dtype=tf.int32)
  y_indices = tf.cast(
      tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
      dtype=tf.int32)

  if use_einsum_gather:
    # Bilinear interpolation is done during the last two gathers:
    #   f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
    #                         [f10, f11]]
    #   [[f00, f01],
    #    [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
    # where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.

    # shape is [batch_size, boxes, output_size, 2, 1]
    grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(
        box_gridy0y1, box_gridx0x1, max_feature_height, max_feature_width)

    # shape is [batch_size, num_boxes, output_size, height]
    grid_y_weight = tf.reduce_sum(
        tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
    # shape is [batch_size, num_boxes, output_size, width]
    grid_x_weight = tf.reduce_sum(
        tf.multiply(grid_x_one_hot, kernel_x), axis=-2)

    # Gather for y_axis.
    # shape is [batch_size, num_boxes, output_size, width, features]
    features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features,
                                 tf.cast(grid_y_weight, features.dtype))
    # Gather for x_axis.
    # shape is [batch_size, num_boxes, output_size, output_size, features]
    features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box,
                                 tf.cast(grid_x_weight, features.dtype))
  else:
    height_dim_offset = max_feature_width
    level_dim_offset = max_feature_height * height_dim_offset
    batch_dim_offset = num_levels * level_dim_offset

    batch_size_offset = tf.tile(
        tf.reshape(
            tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
        [1, num_boxes, output_size * 2, output_size * 2])
    box_levels_offset = tf.tile(
        tf.reshape(box_levels * level_dim_offset,
                   [batch_size, num_boxes, 1, 1]),
        [1, 1, output_size * 2, output_size * 2])
    y_indices_offset = tf.tile(
        tf.reshape(y_indices * height_dim_offset,
                   [batch_size, num_boxes, output_size * 2, 1]),
        [1, 1, 1, output_size * 2])
    x_indices_offset = tf.tile(
        tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
        [1, 1, output_size * 2, 1])
    indices = tf.reshape(
        batch_size_offset + box_levels_offset + y_indices_offset +
        x_indices_offset, [-1])

    features = tf.reshape(features, [-1, num_filters])
    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
    # performance.
    features_per_box = tf.reshape(
        tf.gather(features, indices),
        [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
    features_per_box = feature_bilinear_interpolation(features_per_box,
                                                      kernel_y, kernel_x)

  return features_per_box
...
...
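The einsum branch above works because multiplying by a one-hot matrix is a gather: summing `features[..., h, ...] * one_hot[..., h]` over `h` selects a single height row. A minimal standalone check (plain TF, not repo code):

```
import tensorflow.compat.v2 as tf

# [batch, num_boxes, height, width, filters] toy tensor.
feats = tf.reshape(tf.range(12, dtype=tf.float32), [1, 1, 3, 2, 2])
one_hot = tf.constant([[[[0., 1., 0.]]]])  # 'bmoh': select height row 1
picked = tf.einsum('bmhwf,bmoh->bmowf', feats, one_hot)
gathered = tf.gather(feats, [1], axis=2)
tf.debugging.assert_near(picked, gathered)  # identical results
```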
@@ -211,29 +329,52 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
   and resizing it using the corresponding feature map of that level.

   Args:
     features: A dictionary with key as pyramid level and value as features. The
       features are in shape of [batch_size, height_l, width_l, num_filters].
     boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
       a box with [y1, x1, y2, x2] in un-normalized coordinates.
     output_size: A scalar to indicate the output crop size.

   Returns:
     A 5-D tensor representing feature crop of shape
     [batch_size, num_boxes, output_size, output_size, num_filters].
   """
   with tf.name_scope('multilevel_crop_and_resize'):
-    levels = features.keys()
+    levels = list(features.keys())
     min_level = min(levels)
     max_level = max(levels)
-    _, max_feature_height, max_feature_width, _ = (
+    batch_size, max_feature_height, max_feature_width, num_filters = (
         features[min_level].get_shape().as_list())
-    # Stacks feature pyramid into a features_all of shape
-    # [batch_size, levels, height, width, num_filters].
-    features_all = []
-    for level in range(min_level, max_level + 1):
-      features_all.append(
-          tf.image.pad_to_bounding_box(features[level], 0, 0,
-                                       max_feature_height, max_feature_width))
-    features_all = tf.stack(features_all, axis=1)
+    _, num_boxes, _ = boxes.get_shape().as_list()
+
+    # Stack feature pyramid into a features_all of shape
+    # [batch_size, levels, height, width, num_filters].
+    features_all = []
+    feature_heights = []
+    feature_widths = []
+    for level in range(min_level, max_level + 1):
+      shape = features[level].get_shape().as_list()
+      feature_heights.append(shape[1])
+      feature_widths.append(shape[2])
+      # Concat tensor of [batch_size, height_l * width_l, num_filters] for each
+      # levels.
+      features_all.append(
+          tf.reshape(features[level], [batch_size, -1, num_filters]))
+    features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])
+
+    # Calculate height_l * width_l for each level.
+    level_dim_sizes = [
+        feature_widths[i] * feature_heights[i]
+        for i in range(len(feature_widths))
+    ]
+    # level_dim_offsets is accumulated sum of level_dim_size.
+    level_dim_offsets = [0]
+    for i in range(len(feature_widths) - 1):
+      level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
+    batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
+    level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
+    height_dim_sizes = tf.constant(feature_widths, tf.int32)

     # Assigns boxes to the right level.
     box_width = boxes[:, :, 3] - boxes[:, :, 1]
...
...
@@ -241,8 +382,8 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
     areas_sqrt = tf.sqrt(box_height * box_width)
     levels = tf.cast(
         tf.math.floordiv(
             tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) + 4.0,
         dtype=tf.int32)
     # Maps levels between [min_level, max_level].
     levels = tf.minimum(max_level, tf.maximum(levels, min_level))
...
...
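The cast above implements the FPN box-to-level heuristic, level = floor(log2(sqrt(box area) / 224)) + 4, clamped to the available levels. A quick numeric check (plain Python; the min/max levels here are illustrative):

```
import math

def assign_level(box_h, box_w, min_level=2, max_level=5):
  level = math.floor(math.log2(math.sqrt(box_h * box_w) / 224.0)) + 4
  return min(max_level, max(level, min_level))

print(assign_level(224, 224))  # 4: a 224x224 box maps to the canonical level
print(assign_level(112, 112))  # 3: half the size, one level down
print(assign_level(896, 896))  # 5: would be 6, clamped to max_level
```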
@@ -263,17 +404,58 @@ def multilevel_crop_and_resize(features, boxes, output_size=7):
     level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
     boundary = tf.cast(
         tf.concat([
             tf.expand_dims(
                 [[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1,
                 axis=-1),
             tf.expand_dims(
                 [[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1,
                 axis=-1),
         ],
                   axis=-1), boxes.dtype)

-    return selective_crop_and_resize(features_all, boxes, levels, boundary,
-                                     output_size)
+    # Compute grid positions.
+    kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
+        boxes, boundary, output_size, sample_offset=0.5)
+
+    x_indices = tf.cast(
+        tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
+        dtype=tf.int32)
+    y_indices = tf.cast(
+        tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
+        dtype=tf.int32)
+
+    batch_size_offset = tf.tile(
+        tf.reshape(tf.range(batch_size) * batch_dim_size,
+                   [batch_size, 1, 1, 1]),
+        [1, num_boxes, output_size * 2, output_size * 2])
+    # Get level offset for each box. Each box belongs to one level.
+    levels_offset = tf.tile(
+        tf.reshape(
+            tf.gather(level_dim_offsets, levels),
+            [batch_size, num_boxes, 1, 1]),
+        [1, 1, output_size * 2, output_size * 2])
+    y_indices_offset = tf.tile(
+        tf.reshape(
+            y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
+            [batch_size, num_boxes, output_size * 2, 1]),
+        [1, 1, 1, output_size * 2])
+    x_indices_offset = tf.tile(
+        tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
+        [1, 1, output_size * 2, 1])
+    indices = tf.reshape(
+        batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
+        [-1])
+
+    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
+    # performance.
+    features_per_box = tf.reshape(
+        tf.gather(features_r2, indices),
+        [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
+
+    # Bilinear interpolation.
+    features_per_box = feature_bilinear_interpolation(features_per_box,
+                                                      kernel_y, kernel_x)
+
+    return features_per_box
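An illustrative smoke test of the full crop (shapes assumed, not from the repo's tests): a five-level pyramid with 16 filters and 8 boxes per image.

```
import tensorflow.compat.v2 as tf

features = {
    level: tf.random.normal(
        [2, 256 // 2**(level - 2), 256 // 2**(level - 2), 16])
    for level in range(2, 7)
}
y1x1 = tf.random.uniform([2, 8, 2], maxval=32.0)
hw = tf.random.uniform([2, 8, 2], minval=8.0, maxval=32.0)
boxes = tf.concat([y1x1, y1x1 + hw], axis=-1)  # [y1, x1, y2, x2], un-normalized

rois = multilevel_crop_and_resize(features, boxes, output_size=7)
print(rois.shape)  # (2, 8, 7, 7, 16)
```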
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
...
...
@@ -355,7 +537,8 @@ def crop_mask_in_target_box(masks,
                             boxes,
                             target_boxes,
                             output_size,
-                            sample_offset=0):
+                            sample_offset=0,
+                            use_einsum=True):
  """Crop masks in target boxes.

  Args:
...
...
@@ -370,6 +553,7 @@ def crop_mask_in_target_box(masks,
       supports to output a square shape outputs.
     sample_offset: a float number in [0, 1] indicates the subpixel sample offset
       from grid point.
+    use_einsum: Use einsum to replace gather in selective_crop_and_resize.

   Returns:
     A 4-D tensor representing feature crop of shape
...
...
@@ -417,7 +601,8 @@ def crop_mask_in_target_box(masks,
         levels,
         boundaries,
         output_size,
-        sample_offset=sample_offset)
+        sample_offset=sample_offset,
+        use_einsum_gather=use_einsum)
     cropped_masks = tf.squeeze(cropped_masks, axis=-1)

   return cropped_masks
official/vision/image_classification/README.md
View file @ 965cc3ee
...
...
@@ -19,21 +19,49 @@ installed and

### ImageNet preparation

#### Using TFDS

`classifier_trainer.py` supports ImageNet with
[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).

Please see the following
[example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
for more information on how to use TFDS to download and prepare datasets, and
specifically the
[TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
for manual download instructions.

#### Legacy TFRecords

Download the ImageNet dataset and convert it to TFRecord format.
The following
[script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and
[README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.

Note that the legacy ResNet runners, e.g.
[resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py),
require TFRecords, whereas `classifier_trainer.py` can use both by setting the
builder to 'records' or 'tfds' in the configurations.

### Running on Cloud TPUs

Note: These models will **not** work with TPUs on Colab.

You can train image classification models on Cloud TPUs using
[`tf.distribute.experimental.TPUStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/TPUStrategy?version=nightly).
If you are not familiar with Cloud TPUs, it is strongly recommended that you go
through the [quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn
how to create a TPU and GCE VM.

### Running on multiple GPU hosts

You can also train these models on multiple hosts, each with GPUs, using
[`tf.distribute.experimental.MultiWorkerMirroredStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).

The easiest way to run multi-host benchmarks is to set the
[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
environment variable appropriately at each host, e.g., to run using
`MultiWorkerMirroredStrategy` on 2 hosts, the `cluster` in `TF_CONFIG` should
have 2 `host:port` entries, and host `i` should have the `task` in `TF_CONFIG`
set to `{"type": "worker", "index": i}`. `MultiWorkerMirroredStrategy` will
automatically use all the available GPUs at each host.
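For example, host 0 of a two-host setup could configure itself as follows (host names are placeholders):

```
import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["host0.example.com:2222", "host1.example.com:2222"]},
    "task": {"type": "worker", "index": 0}  # host 1 sets "index": 1
})
```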
## MNIST

To download the data and run the MNIST sample model locally for the first time,
...
...
@@ -100,7 +128,7 @@ python3 classifier_trainer.py \
  --tpu=$TPU_NAME \
  --model_dir=$MODEL_DIR \
  --data_dir=$DATA_DIR \
-  --config_file=config/examples/resnet/imagenet/tpu.yaml
+  --config_file=configs/examples/resnet/imagenet/tpu.yaml
```

### EfficientNet
...
...
@@ -127,7 +155,7 @@ python3 classifier_trainer.py \
  --tpu=$TPU_NAME \
  --model_dir=$MODEL_DIR \
  --data_dir=$DATA_DIR \
-  --config_file=config/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
+  --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
```

Note that the number of GPU devices can be overridden in the command line using
...
...