ModelZoo / ResNet50_tensorflow · Commits

Commit 78c43ef1, authored Jul 26, 2021 by Gunho Park

Merge branch 'master' of https://github.com/tensorflow/models

Parents: 67cfc95b, e3c7e300
Changes: 227. Showing 20 changed files with 1983 additions and 8 deletions (+1983, −8).
official/vision/beta/projects/movinet/tools/convert_3d_2plus1d.py (+104, −0)
official/vision/beta/projects/movinet/tools/convert_3d_2plus1d_test.py (+60, −0)
official/vision/beta/projects/movinet/train.py (+1, −0)
official/vision/beta/projects/panoptic_maskrcnn/README.md (+20, −0)
official/vision/beta/projects/panoptic_maskrcnn/__init__.py (+27, −0)
official/vision/beta/projects/panoptic_maskrcnn/configs/panoptic_maskrcnn.py (+47, −0)
official/vision/beta/projects/panoptic_maskrcnn/dataloaders/panoptic_maskrcnn_input.py (+288, −0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory.py (+117, −0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory_test.py (+66, −0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_maskrcnn_model.py (+182, −0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_maskrcnn_model_test.py (+489, −0)
official/vision/beta/projects/simclr/heads/simclr_head.py (+0, −1)
official/vision/beta/projects/simclr/modeling/simclr_model.py (+7, −6)
official/vision/beta/projects/simclr/tasks/simclr.py (+2, −1)
official/vision/beta/projects/vit/README.md (+12, −0)
official/vision/beta/projects/vit/configs/__init__.py (+18, −0)
official/vision/beta/projects/vit/configs/backbones.py (+56, −0)
official/vision/beta/projects/vit/configs/image_classification.py (+195, −0)
official/vision/beta/projects/vit/modeling/vit.py (+249, −0)
official/vision/beta/projects/vit/modeling/vit_test.py (+43, −0)
official/vision/beta/projects/movinet/tools/convert_3d_2plus1d.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converts '3d_2plus1d' checkpoints into '2plus1d'."""
from absl import app
from absl import flags

import tensorflow as tf

from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model

flags.DEFINE_string(
    'input_checkpoint_path', None, 'Checkpoint path to load.')
flags.DEFINE_string(
    'output_checkpoint_path', None,
    'Export path to save the saved_model file.')
flags.DEFINE_string(
    'model_id', 'a0', 'MoViNet model name.')
flags.DEFINE_bool(
    'causal', False, 'Run the model in causal mode.')
flags.DEFINE_bool(
    'use_positional_encoding', False,
    'Whether to use positional encoding (only applied when causal=True).')
flags.DEFINE_integer(
    'num_classes', 600, 'The number of classes for prediction.')
flags.DEFINE_bool(
    'verify_output', False, 'Verify the output matches between the models.')

FLAGS = flags.FLAGS


def main(_) -> None:
  backbone_2plus1d = movinet.Movinet(
      model_id=FLAGS.model_id,
      causal=FLAGS.causal,
      conv_type='2plus1d',
      use_positional_encoding=FLAGS.use_positional_encoding)
  model_2plus1d = movinet_model.MovinetClassifier(
      backbone=backbone_2plus1d,
      num_classes=FLAGS.num_classes)
  model_2plus1d.build([1, 1, 1, 1, 3])

  backbone_3d_2plus1d = movinet.Movinet(
      model_id=FLAGS.model_id,
      causal=FLAGS.causal,
      conv_type='3d_2plus1d',
      use_positional_encoding=FLAGS.use_positional_encoding)
  model_3d_2plus1d = movinet_model.MovinetClassifier(
      backbone=backbone_3d_2plus1d,
      num_classes=FLAGS.num_classes)
  model_3d_2plus1d.build([1, 1, 1, 1, 3])

  checkpoint = tf.train.Checkpoint(model=model_3d_2plus1d)
  status = checkpoint.restore(FLAGS.input_checkpoint_path)
  status.assert_existing_objects_matched()

  # Ensure both models have the same weights
  weights = []
  for var_2plus1d, var_3d_2plus1d in zip(
      model_2plus1d.get_weights(), model_3d_2plus1d.get_weights()):
    if var_2plus1d.shape == var_3d_2plus1d.shape:
      weights.append(var_3d_2plus1d)
    else:
      if var_3d_2plus1d.shape[0] == 1:
        weight = var_3d_2plus1d[0]
      else:
        weight = var_3d_2plus1d[:, 0]
      if weight.shape[-1] != var_2plus1d.shape[-1]:
        # Transpose any depthwise kernels (conv3d --> depthwise_conv2d)
        weight = tf.transpose(weight, perm=(0, 1, 3, 2))
      weights.append(weight)

  model_2plus1d.set_weights(weights)

  if FLAGS.verify_output:
    inputs = tf.random.uniform([1, 6, 64, 64, 3], dtype=tf.float32)

    logits_2plus1d = model_2plus1d(inputs)
    logits_3d_2plus1d = model_3d_2plus1d(inputs)

    if tf.reduce_mean(logits_2plus1d - logits_3d_2plus1d) > 1e-5:
      raise ValueError('Bad conversion, model outputs do not match.')

  save_checkpoint = tf.train.Checkpoint(
      model=model_2plus1d, backbone=backbone_2plus1d)
  save_checkpoint.save(FLAGS.output_checkpoint_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('input_checkpoint_path')
  flags.mark_flag_as_required('output_checkpoint_path')
  app.run(main)
official/vision/beta/projects/movinet/tools/convert_3d_2plus1d_test.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for convert_3d_2plus1d."""
import os

from absl import flags
import tensorflow as tf

from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model
from official.vision.beta.projects.movinet.tools import convert_3d_2plus1d

FLAGS = flags.FLAGS


class Convert3d2plus1dTest(tf.test.TestCase):

  def test_convert_model(self):
    saved_model_path = self.get_temp_dir()
    input_checkpoint_path = os.path.join(saved_model_path, 'ckpt-input')
    output_checkpoint_path = os.path.join(saved_model_path, 'ckpt')

    model_3d_2plus1d = movinet_model.MovinetClassifier(
        backbone=movinet.Movinet(
            model_id='a0',
            conv_type='3d_2plus1d'),
        num_classes=600)
    model_3d_2plus1d.build([1, 1, 1, 1, 3])

    save_checkpoint = tf.train.Checkpoint(model=model_3d_2plus1d)
    save_checkpoint.save(input_checkpoint_path)

    FLAGS.input_checkpoint_path = f'{input_checkpoint_path}-1'
    FLAGS.output_checkpoint_path = output_checkpoint_path
    FLAGS.model_id = 'a0'
    FLAGS.use_positional_encoding = False
    FLAGS.num_classes = 600
    FLAGS.verify_output = True

    convert_3d_2plus1d.main('unused_args')

    print(os.listdir(saved_model_path))

    self.assertTrue(
        tf.io.gfile.exists(f'{output_checkpoint_path}-1.index'))


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/movinet/train.py
...
@@ -46,6 +46,7 @@ from official.modeling import performance
 # Import movinet libraries to register the backbone and model into tf.vision
 # model garden factory.
 # pylint: disable=unused-import
+# The following are the necessary imports.
 from official.vision.beta.projects.movinet.modeling import movinet
 from official.vision.beta.projects.movinet.modeling import movinet_model
 # pylint: enable=unused-import
...
official/vision/beta/projects/panoptic_maskrcnn/README.md (new file)
# Panoptic Segmentation

## Description

Panoptic Segmentation combines two distinct vision tasks: semantic
segmentation and instance segmentation. These tasks are unified such that each
pixel in the image is assigned both the label of the class it belongs to and
the instance identifier of the object it is a part of.

## Environment setup

The code can be run on multiple GPUs or TPUs with different distribution
strategies. See the TensorFlow distributed training
[guide](https://www.tensorflow.org/guide/distributed_training) for an overview
of `tf.distribute`.

The code is compatible with TensorFlow 2.4+. See requirements.txt for all
prerequisites; you can also install them using the following command.

`pip install -r ./official/requirements.txt`

**DISCLAIMER**: Panoptic MaskRCNN is still under active development, stay tuned!
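Usage note: the distribution-strategy setup the README points to is commonly wired up as in the minimal sketch below; `build_model()` is a placeholder for the actual model-building call, not an API from this commit.

import tensorflow as tf

# MirroredStrategy replicates variables across all local GPUs; swap in
# tf.distribute.TPUStrategy or MultiWorkerMirroredStrategy for other setups.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = build_model()  # placeholder: construct the Keras model here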
official/vision/beta/projects/panoptic_maskrcnn/__init__.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/vision/beta/projects/panoptic_maskrcnn/configs/panoptic_maskrcnn.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Panoptic Mask R-CNN configuration definition."""
from typing import List

import dataclasses

from official.vision.beta.configs import maskrcnn
from official.vision.beta.configs import semantic_segmentation


@dataclasses.dataclass
class Parser(maskrcnn.Parser):
  """Panoptic Segmentation parser config."""
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_segmentation_groundtruth: bool = True
  segmentation_groundtruth_padded_size: List[int] = dataclasses.field(
      default_factory=list)
  segmentation_ignore_label: int = 255


@dataclasses.dataclass
class DataConfig(maskrcnn.DataConfig):
  """Input config for training."""
  parser: Parser = Parser()


@dataclasses.dataclass
class PanopticMaskRCNN(maskrcnn.MaskRCNN):
  """Panoptic Mask R-CNN model config."""
  segmentation_model: semantic_segmentation.SemanticSegmentationModel = (
      semantic_segmentation.SemanticSegmentationModel(num_classes=2))
  shared_backbone: bool = True
  shared_decoder: bool = True
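Usage note: a brief sketch of how these dataclasses compose; the field values are illustrative, not defaults taken from this commit.

# Standalone segmentation backbone/decoder instead of sharing with Mask R-CNN.
model_config = PanopticMaskRCNN(
    num_classes=2, shared_backbone=False, shared_decoder=False)
# The parser config plugs into the input pipeline via DataConfig.
data_config = DataConfig(parser=Parser(segmentation_ignore_label=255))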
official/vision/beta/projects/panoptic_maskrcnn/dataloaders/panoptic_maskrcnn_input.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Panoptic Mask R-CNN."""
import tensorflow as tf

from official.vision.beta.dataloaders import maskrcnn_input
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import preprocess_ops


class TfExampleDecoder(tf_example_decoder.TfExampleDecoder):
  """Tensorflow Example proto decoder."""

  def __init__(self, regenerate_source_id, mask_binarize_threshold):
    super(TfExampleDecoder, self).__init__(
        include_mask=True,
        regenerate_source_id=regenerate_source_id,
        mask_binarize_threshold=None)
    self._segmentation_keys_to_features = {
        'image/segmentation/class/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value='')
    }

  def decode(self, serialized_example):
    decoded_tensors = super(TfExampleDecoder, self).decode(serialized_example)
    segmentation_parsed_tensors = tf.io.parse_single_example(
        serialized_example, self._segmentation_keys_to_features)
    segmentation_mask = tf.io.decode_image(
        segmentation_parsed_tensors['image/segmentation/class/encoded'],
        channels=1)
    segmentation_mask.set_shape([None, None, 1])
    decoded_tensors.update(
        {'groundtruth_segmentation_mask': segmentation_mask})
    return decoded_tensors


class Parser(maskrcnn_input.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               mask_crop_size=112,
               segmentation_resize_eval_groundtruth=True,
               segmentation_groundtruth_padded_size=None,
               segmentation_ignore_label=255,
               dtype='float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divisible by the largest feature stride
        2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float`, match threshold for anchors in RPN.
      rpn_unmatched_threshold: `float`, unmatched threshold for anchors in
        RPN.
      rpn_batch_size_per_im: `int` for batch size per image in RPN.
      rpn_fg_fraction: `float` for foreground fraction per batch in RPN.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flips.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled
        with `is_crowd` equal to 1.
      max_num_instances: `int`, the maximum number of instances in an image.
        The groundtruth data will be padded to `max_num_instances`.
      mask_crop_size: the size to which each groundtruth mask is cropped.
      segmentation_resize_eval_groundtruth: `bool`, if True, eval groundtruth
        masks are resized to output_size.
      segmentation_groundtruth_padded_size: `Tensor` or `list` for [height,
        width]. When resize_eval_groundtruth is set to False, the groundtruth
        masks are padded to this size.
      segmentation_ignore_label: `int`, pixels with the ignore label are not
        used for training and evaluation.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    super(Parser, self).__init__(
        output_size=output_size,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size,
        rpn_match_threshold=rpn_match_threshold,
        rpn_unmatched_threshold=rpn_unmatched_threshold,
        rpn_batch_size_per_im=rpn_batch_size_per_im,
        rpn_fg_fraction=rpn_fg_fraction,
        aug_rand_hflip=False,
        aug_scale_min=aug_scale_min,
        aug_scale_max=aug_scale_max,
        skip_crowd_during_training=skip_crowd_during_training,
        max_num_instances=max_num_instances,
        include_mask=True,
        mask_crop_size=mask_crop_size,
        dtype=dtype)

    self.aug_rand_hflip = aug_rand_hflip
    self._segmentation_resize_eval_groundtruth = (
        segmentation_resize_eval_groundtruth)
    if (not segmentation_resize_eval_groundtruth) and (
        segmentation_groundtruth_padded_size is None):
      raise ValueError(
          'segmentation_groundtruth_padded_size ([height, width]) needs to '
          'be specified when resize_eval_segmentation_groundtruth is False.')
    self._segmentation_groundtruth_padded_size = (
        segmentation_groundtruth_padded_size)
    self._segmentation_ignore_label = segmentation_ignore_label

  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors
          with shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors
          with shape [height_l, width_l, anchors_per_location]. The height_l
          and width_l represent the dimension of class logits at the l-th
          level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors
          with shape [height_l, width_l, anchors_per_location * 4]. The
          height_l and width_l represent the dimension of bounding box
          regression output at the l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
          in [y1, x1, y2, x2] format. The coordinates are w.r.t. the scaled
          image that is fed to the network. The tensor is padded with -1 to
          the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth class annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size.
        gt_segmentation_mask: Groundtruth mask for the segmentation head,
          resized to a fixed size determined by output_size.
        gt_segmentation_valid_mask: Binary mask that marks the pixels that
          are supposed to be used in computing the segmentation loss during
          training.
    """
    segmentation_mask = data['groundtruth_segmentation_mask']

    # Flips image randomly during training.
    if self.aug_rand_hflip:
      masks = data['groundtruth_instance_masks']
      image_mask = tf.concat([data['image'], segmentation_mask], axis=2)

      image_mask, boxes, masks = preprocess_ops.random_horizontal_flip(
          image_mask, data['groundtruth_boxes'], masks)

      segmentation_mask = image_mask[:, :, -1:]
      image = image_mask[:, :, :-1]

      data['image'] = image
      data['boxes'] = boxes
      data['masks'] = masks

    image, labels = super(Parser, self)._parse_train_data(data)

    image_info = labels['image_info']
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    segmentation_mask = tf.reshape(
        segmentation_mask, shape=[1, data['height'], data['width']])
    segmentation_mask = tf.cast(segmentation_mask, tf.float32)

    # Pad label and make sure the padded region is assigned the ignore label.
    # The label is first offset by +1 and then padded with 0.
    segmentation_mask += 1
    segmentation_mask = tf.expand_dims(segmentation_mask, axis=3)
    segmentation_mask = preprocess_ops.resize_and_crop_masks(
        segmentation_mask, image_scale, self._output_size, offset)
    segmentation_mask -= 1
    segmentation_mask = tf.where(
        tf.equal(segmentation_mask, -1),
        self._segmentation_ignore_label * tf.ones_like(segmentation_mask),
        segmentation_mask)
    segmentation_mask = tf.squeeze(segmentation_mask, axis=0)
    segmentation_valid_mask = tf.not_equal(
        segmentation_mask, self._segmentation_ignore_label)

    labels.update({
        'gt_segmentation_mask': segmentation_mask,
        'gt_segmentation_valid_mask': segmentation_valid_mask
    })

    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        image: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for training. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height,
            scaled_width]].
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensors
            with shape [height_l, width_l, 4] representing anchor boxes at
            each level.
    """
    segmentation_mask = tf.cast(
        data['groundtruth_segmentation_mask'], tf.float32)
    segmentation_mask = tf.reshape(
        segmentation_mask, shape=[1, data['height'], data['width'], 1])
    segmentation_mask += 1

    image, labels = super(Parser, self)._parse_eval_data(data)

    if self._segmentation_resize_eval_groundtruth:
      # Resizes eval masks to match input image sizes. In that case, mean IoU
      # is computed on output_size, not the original size of the images.
      image_info = labels['image_info']
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      segmentation_mask = preprocess_ops.resize_and_crop_masks(
          segmentation_mask, image_scale, self._output_size, offset)
    else:
      segmentation_mask = tf.image.pad_to_bounding_box(
          segmentation_mask, 0, 0,
          self._segmentation_groundtruth_padded_size[0],
          self._segmentation_groundtruth_padded_size[1])

    segmentation_mask -= 1
    # Assign the ignore label to the padded region.
    segmentation_mask = tf.where(
        tf.equal(segmentation_mask, -1),
        self._segmentation_ignore_label * tf.ones_like(segmentation_mask),
        segmentation_mask)
    segmentation_mask = tf.squeeze(segmentation_mask, axis=0)
    segmentation_valid_mask = tf.not_equal(
        segmentation_mask, self._segmentation_ignore_label)

    labels['groundtruths'].update({
        'gt_segmentation_mask': segmentation_mask,
        'gt_segmentation_valid_mask': segmentation_valid_mask
    })

    return image, labels
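Usage note: the +1/-1 offsets around `resize_and_crop_masks` exist because padding fills with zeros; shifting labels up by one lets padded pixels come back as -1, which is then rewritten to the ignore label. A self-contained sketch of the same trick on a toy tensor:

import tensorflow as tf

ignore_label = 255
mask = tf.constant([[0., 1.], [2., 0.]])     # class ids; 0 is a real class
padded = tf.pad(mask + 1, [[0, 1], [0, 1]])  # pad after the +1 offset
restored = padded - 1                        # padded zeros become -1
restored = tf.where(
    tf.equal(restored, -1), ignore_label * tf.ones_like(restored), restored)
# Class 0 survives; only the padded pixels carry the ignore label.
print(restored.numpy())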
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory method to build panoptic segmentation model."""
import tensorflow as tf

from official.vision.beta.modeling import backbones
from official.vision.beta.modeling import factory as models_factory
from official.vision.beta.modeling.decoders import factory as decoder_factory
from official.vision.beta.modeling.heads import segmentation_heads
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as panoptic_maskrcnn_cfg
from official.vision.beta.projects.panoptic_maskrcnn.modeling import panoptic_maskrcnn_model


def build_panoptic_maskrcnn(
    input_specs: tf.keras.layers.InputSpec,
    model_config: panoptic_maskrcnn_cfg.PanopticMaskRCNN,
    l2_regularizer: tf.keras.regularizers.Regularizer = None
) -> tf.keras.Model:
  """Builds Panoptic Mask R-CNN model.

  This factory function builds the mask rcnn first, builds the non-shared
  semantic segmentation layers, and finally combines the two models to form
  the panoptic segmentation model.

  Args:
    input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
    model_config: Config instance for the panoptic maskrcnn model.
    l2_regularizer: Optional `tf.keras.regularizers.Regularizer`, if
      specified, the model is built with the provided regularization layer.

  Returns:
    tf.keras.Model for the panoptic segmentation model.
  """
  norm_activation_config = model_config.norm_activation
  segmentation_config = model_config.segmentation_model

  # Builds the maskrcnn model.
  maskrcnn_model = models_factory.build_maskrcnn(
      input_specs=input_specs,
      model_config=model_config,
      l2_regularizer=l2_regularizer)

  # Builds the semantic segmentation branch.
  if not model_config.shared_backbone:
    segmentation_backbone = backbones.factory.build_backbone(
        input_specs=input_specs,
        backbone_config=segmentation_config.backbone,
        norm_activation_config=norm_activation_config,
        l2_regularizer=l2_regularizer)
    segmentation_decoder_input_specs = segmentation_backbone.output_specs
  else:
    segmentation_backbone = None
    segmentation_decoder_input_specs = maskrcnn_model.backbone.output_specs

  if not model_config.shared_decoder:
    segmentation_decoder = decoder_factory.build_decoder(
        input_specs=segmentation_decoder_input_specs,
        model_config=segmentation_config,
        l2_regularizer=l2_regularizer)
  else:
    segmentation_decoder = None

  segmentation_head_config = segmentation_config.head
  detection_head_config = model_config.detection_head

  segmentation_head = segmentation_heads.SegmentationHead(
      num_classes=segmentation_config.num_classes,
      level=segmentation_head_config.level,
      num_convs=segmentation_head_config.num_convs,
      prediction_kernel_size=segmentation_head_config.prediction_kernel_size,
      num_filters=segmentation_head_config.num_filters,
      upsample_factor=segmentation_head_config.upsample_factor,
      feature_fusion=segmentation_head_config.feature_fusion,
      low_level=segmentation_head_config.low_level,
      low_level_num_filters=segmentation_head_config.low_level_num_filters,
      activation=norm_activation_config.activation,
      use_sync_bn=norm_activation_config.use_sync_bn,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)

  # Combines the maskrcnn and segmentation models to build the panoptic
  # segmentation model.
  model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
      backbone=maskrcnn_model.backbone,
      decoder=maskrcnn_model.decoder,
      rpn_head=maskrcnn_model.rpn_head,
      detection_head=maskrcnn_model.detection_head,
      roi_generator=maskrcnn_model.roi_generator,
      roi_sampler=maskrcnn_model.roi_sampler,
      roi_aligner=maskrcnn_model.roi_aligner,
      detection_generator=maskrcnn_model.detection_generator,
      mask_head=maskrcnn_model.mask_head,
      mask_sampler=maskrcnn_model.mask_sampler,
      mask_roi_aligner=maskrcnn_model.mask_roi_aligner,
      segmentation_backbone=segmentation_backbone,
      segmentation_decoder=segmentation_decoder,
      segmentation_head=segmentation_head,
      class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
      cascade_class_ensemble=detection_head_config.cascade_class_ensemble,
      min_level=model_config.min_level,
      max_level=model_config.max_level,
      num_scales=model_config.anchor.num_scales,
      aspect_ratios=model_config.anchor.aspect_ratios,
      anchor_size=model_config.anchor.anchor_size)
  return model
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory_test.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for factory.py."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.beta.configs import backbones
from official.vision.beta.configs import decoders
from official.vision.beta.configs import semantic_segmentation
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as panoptic_maskrcnn_cfg
from official.vision.beta.projects.panoptic_maskrcnn.modeling import factory


class PanopticMaskRCNNBuilderTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      ('resnet', (640, 640), 'dilated_resnet', 'fpn'),
      ('resnet', (640, 640), 'dilated_resnet', 'aspp'),
      ('resnet', (640, 640), None, 'fpn'),
      ('resnet', (640, 640), None, 'aspp'),
      ('resnet', (640, 640), None, None),
      ('resnet', (None, None), 'dilated_resnet', 'fpn'),
      ('resnet', (None, None), 'dilated_resnet', 'aspp'),
      ('resnet', (None, None), None, 'fpn'),
      ('resnet', (None, None), None, 'aspp'),
      ('resnet', (None, None), None, None))
  def test_builder(self, backbone_type, input_size,
                   segmentation_backbone_type, segmentation_decoder_type):
    num_classes = 2
    input_specs = tf.keras.layers.InputSpec(
        shape=[None, input_size[0], input_size[1], 3])
    segmentation_output_stride = 16
    level = int(np.math.log2(segmentation_output_stride))
    segmentation_model = semantic_segmentation.SemanticSegmentationModel(
        num_classes=2,
        backbone=backbones.Backbone(type=segmentation_backbone_type),
        decoder=decoders.Decoder(type=segmentation_decoder_type),
        head=semantic_segmentation.SegmentationHead(level=level))
    model_config = panoptic_maskrcnn_cfg.PanopticMaskRCNN(
        num_classes=num_classes,
        segmentation_model=segmentation_model,
        backbone=backbones.Backbone(type=backbone_type),
        shared_backbone=segmentation_backbone_type is None,
        shared_decoder=segmentation_decoder_type is None)
    l2_regularizer = tf.keras.regularizers.l2(5e-5)
    _ = factory.build_panoptic_maskrcnn(
        input_specs=input_specs,
        model_config=model_config,
        l2_regularizer=l2_regularizer)


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_maskrcnn_model.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Panoptic Segmentation model."""
from typing import List, Mapping, Optional, Union

import tensorflow as tf

from official.vision.beta.modeling import maskrcnn_model


@tf.keras.utils.register_keras_serializable(package='Vision')
class PanopticMaskRCNNModel(maskrcnn_model.MaskRCNNModel):
  """The Panoptic Segmentation model."""

  def __init__(self,
               backbone: tf.keras.Model,
               decoder: tf.keras.Model,
               rpn_head: tf.keras.layers.Layer,
               detection_head: Union[tf.keras.layers.Layer,
                                     List[tf.keras.layers.Layer]],
               roi_generator: tf.keras.layers.Layer,
               roi_sampler: Union[tf.keras.layers.Layer,
                                  List[tf.keras.layers.Layer]],
               roi_aligner: tf.keras.layers.Layer,
               detection_generator: tf.keras.layers.Layer,
               mask_head: Optional[tf.keras.layers.Layer] = None,
               mask_sampler: Optional[tf.keras.layers.Layer] = None,
               mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
               segmentation_backbone: Optional[tf.keras.Model] = None,
               segmentation_decoder: Optional[tf.keras.Model] = None,
               segmentation_head: tf.keras.layers.Layer = None,
               class_agnostic_bbox_pred: bool = False,
               cascade_class_ensemble: bool = False,
               min_level: Optional[int] = None,
               max_level: Optional[int] = None,
               num_scales: Optional[int] = None,
               aspect_ratios: Optional[List[float]] = None,
               anchor_size: Optional[float] = None,
               **kwargs):
    """Initializes the Panoptic Mask R-CNN model.

    Args:
      backbone: `tf.keras.Model`, the backbone network.
      decoder: `tf.keras.Model`, the decoder network.
      rpn_head: the RPN head.
      detection_head: the detection head or a list of heads.
      roi_generator: the ROI generator.
      roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
        detection heads.
      roi_aligner: the ROI aligner.
      detection_generator: the detection generator.
      mask_head: the mask head.
      mask_sampler: the mask sampler.
      mask_roi_aligner: the ROI aligner for mask prediction.
      segmentation_backbone: `tf.keras.Model`, the backbone network for the
        segmentation head for the panoptic task. Providing
        `segmentation_backbone` allows the segmentation head to use a
        standalone backbone. Setting `segmentation_backbone=None` enables
        backbone sharing between the MaskRCNN model and the segmentation
        head.
      segmentation_decoder: `tf.keras.Model`, the decoder network for the
        segmentation head for the panoptic task. Providing
        `segmentation_decoder` allows the segmentation head to use a
        standalone decoder. Setting `segmentation_decoder=None` enables
        decoder sharing between the MaskRCNN model and the segmentation head.
        Decoders can only be shared when `segmentation_backbone` is shared as
        well.
      segmentation_head: segmentation head for the panoptic task.
      class_agnostic_bbox_pred: if True, perform class-agnostic bounding box
        prediction. Needs to be `True` for Cascade RCNN models.
      cascade_class_ensemble: if True, ensemble classification scores over
        all detection heads.
      min_level: Minimum level in output feature maps.
      max_level: Maximum level in output feature maps.
      num_scales: A number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: A list representing the aspect ratio anchors added on
        each level. The number indicates the ratio of width to height. For
        instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
        scale level.
      anchor_size: A number representing the scale of size of the base anchor
        to the feature stride 2^level.
      **kwargs: keyword arguments to be passed.
    """
    super(PanopticMaskRCNNModel, self).__init__(
        backbone=backbone,
        decoder=decoder,
        rpn_head=rpn_head,
        detection_head=detection_head,
        roi_generator=roi_generator,
        roi_sampler=roi_sampler,
        roi_aligner=roi_aligner,
        detection_generator=detection_generator,
        mask_head=mask_head,
        mask_sampler=mask_sampler,
        mask_roi_aligner=mask_roi_aligner,
        class_agnostic_bbox_pred=class_agnostic_bbox_pred,
        cascade_class_ensemble=cascade_class_ensemble,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size,
        **kwargs)
    self._config_dict.update({
        'segmentation_backbone': segmentation_backbone,
        'segmentation_decoder': segmentation_decoder,
        'segmentation_head': segmentation_head
    })

    if not self._include_mask:
      raise ValueError(
          '`mask_head` needs to be provided for Panoptic Mask R-CNN.')
    if segmentation_backbone is not None and segmentation_decoder is None:
      raise ValueError(
          '`segmentation_decoder` needs to be provided for Panoptic Mask '
          'R-CNN if `backbone` is not shared.')

    self.segmentation_backbone = segmentation_backbone
    self.segmentation_decoder = segmentation_decoder
    self.segmentation_head = segmentation_head

  def call(self,
           images: tf.Tensor,
           image_shape: tf.Tensor,
           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
           gt_boxes: Optional[tf.Tensor] = None,
           gt_classes: Optional[tf.Tensor] = None,
           gt_masks: Optional[tf.Tensor] = None,
           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    model_outputs = super(PanopticMaskRCNNModel, self).call(
        images=images,
        image_shape=image_shape,
        anchor_boxes=anchor_boxes,
        gt_boxes=gt_boxes,
        gt_classes=gt_classes,
        gt_masks=gt_masks,
        training=training)

    if self.segmentation_backbone is not None:
      backbone_features = self.segmentation_backbone(
          images, training=training)
    else:
      backbone_features = model_outputs['backbone_features']

    if self.segmentation_decoder is not None:
      decoder_features = self.segmentation_decoder(
          backbone_features, training=training)
    else:
      decoder_features = model_outputs['decoder_features']

    segmentation_outputs = self.segmentation_head(
        backbone_features, decoder_features, training=training)

    model_outputs.update({
        'segmentation_outputs': segmentation_outputs,
    })
    return model_outputs

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = super(PanopticMaskRCNNModel, self).checkpoint_items
    if self.segmentation_backbone is not None:
      items.update(segmentation_backbone=self.segmentation_backbone)
    if self.segmentation_decoder is not None:
      items.update(segmentation_decoder=self.segmentation_decoder)
    items.update(segmentation_head=self.segmentation_head)
    return items
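Usage note: `checkpoint_items` is designed to feed `tf.train.Checkpoint` directly, as in this minimal sketch (the directory is a placeholder; the test file below exercises the full round trip):

ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
ckpt.save('/tmp/panoptic/ckpt')  # placeholder path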
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_maskrcnn_model_test.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic_maskrcnn_model.py."""
import
os
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.distribute
import
strategy_combinations
from
official.vision.beta.modeling.backbones
import
resnet
from
official.vision.beta.modeling.decoders
import
aspp
from
official.vision.beta.modeling.decoders
import
fpn
from
official.vision.beta.modeling.heads
import
dense_prediction_heads
from
official.vision.beta.modeling.heads
import
instance_heads
from
official.vision.beta.modeling.heads
import
segmentation_heads
from
official.vision.beta.modeling.layers
import
detection_generator
from
official.vision.beta.modeling.layers
import
mask_sampler
from
official.vision.beta.modeling.layers
import
roi_aligner
from
official.vision.beta.modeling.layers
import
roi_generator
from
official.vision.beta.modeling.layers
import
roi_sampler
from
official.vision.beta.ops
import
anchor
from
official.vision.beta.projects.panoptic_maskrcnn.modeling
import
panoptic_maskrcnn_model
class
PanopticMaskRCNNModelTest
(
parameterized
.
TestCase
,
tf
.
test
.
TestCase
):
@
combinations
.
generate
(
combinations
.
combine
(
use_separable_conv
=
[
True
,
False
],
build_anchor_boxes
=
[
True
,
False
],
shared_backbone
=
[
True
,
False
],
shared_decoder
=
[
True
,
False
],
is_training
=
[
True
,
False
]))
def
test_build_model
(
self
,
use_separable_conv
,
build_anchor_boxes
,
shared_backbone
,
shared_decoder
,
is_training
=
True
):
num_classes
=
3
min_level
=
3
max_level
=
7
num_scales
=
3
aspect_ratios
=
[
1.0
]
anchor_size
=
3
resnet_model_id
=
50
segmentation_resnet_model_id
=
50
segmentation_output_stride
=
16
aspp_dilation_rates
=
[
6
,
12
,
18
]
aspp_decoder_level
=
int
(
np
.
math
.
log2
(
segmentation_output_stride
))
fpn_decoder_level
=
3
num_anchors_per_location
=
num_scales
*
len
(
aspect_ratios
)
image_size
=
128
images
=
np
.
random
.
rand
(
2
,
image_size
,
image_size
,
3
)
image_shape
=
np
.
array
([[
image_size
,
image_size
],
[
image_size
,
image_size
]])
shared_decoder
=
shared_decoder
and
shared_backbone
if
build_anchor_boxes
:
anchor_boxes
=
anchor
.
Anchor
(
min_level
=
min_level
,
max_level
=
max_level
,
num_scales
=
num_scales
,
aspect_ratios
=
aspect_ratios
,
anchor_size
=
3
,
image_size
=
(
image_size
,
image_size
)).
multilevel_boxes
for
l
in
anchor_boxes
:
anchor_boxes
[
l
]
=
tf
.
tile
(
tf
.
expand_dims
(
anchor_boxes
[
l
],
axis
=
0
),
[
2
,
1
,
1
,
1
])
else
:
anchor_boxes
=
None
backbone
=
resnet
.
ResNet
(
model_id
=
resnet_model_id
)
decoder
=
fpn
.
FPN
(
input_specs
=
backbone
.
output_specs
,
min_level
=
min_level
,
max_level
=
max_level
,
use_separable_conv
=
use_separable_conv
)
rpn_head
=
dense_prediction_heads
.
RPNHead
(
min_level
=
min_level
,
max_level
=
max_level
,
num_anchors_per_location
=
num_anchors_per_location
,
num_convs
=
1
)
detection_head
=
instance_heads
.
DetectionHead
(
num_classes
=
num_classes
)
roi_generator_obj
=
roi_generator
.
MultilevelROIGenerator
()
roi_sampler_obj
=
roi_sampler
.
ROISampler
()
roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
()
detection_generator_obj
=
detection_generator
.
DetectionGenerator
()
mask_head
=
instance_heads
.
MaskHead
(
num_classes
=
num_classes
,
upsample_factor
=
2
)
mask_sampler_obj
=
mask_sampler
.
MaskSampler
(
mask_target_size
=
28
,
num_sampled_masks
=
1
)
mask_roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
(
crop_size
=
14
)
if
shared_backbone
:
segmentation_backbone
=
None
else
:
segmentation_backbone
=
resnet
.
ResNet
(
model_id
=
segmentation_resnet_model_id
)
if
not
shared_decoder
:
level
=
aspp_decoder_level
segmentation_decoder
=
aspp
.
ASPP
(
level
=
level
,
dilation_rates
=
aspp_dilation_rates
)
else
:
level
=
fpn_decoder_level
segmentation_decoder
=
None
segmentation_head
=
segmentation_heads
.
SegmentationHead
(
num_classes
=
2
,
# stuff and common class for things,
level
=
level
,
num_convs
=
2
)
model
=
panoptic_maskrcnn_model
.
PanopticMaskRCNNModel
(
backbone
,
decoder
,
rpn_head
,
detection_head
,
roi_generator_obj
,
roi_sampler_obj
,
roi_aligner_obj
,
detection_generator_obj
,
mask_head
,
mask_sampler_obj
,
mask_roi_aligner_obj
,
segmentation_backbone
=
segmentation_backbone
,
segmentation_decoder
=
segmentation_decoder
,
segmentation_head
=
segmentation_head
,
min_level
=
min_level
,
max_level
=
max_level
,
num_scales
=
num_scales
,
aspect_ratios
=
aspect_ratios
,
anchor_size
=
anchor_size
)
gt_boxes
=
np
.
array
(
[[[
10
,
10
,
15
,
15
],
[
2.5
,
2.5
,
7.5
,
7.5
],
[
-
1
,
-
1
,
-
1
,
-
1
]],
[[
100
,
100
,
150
,
150
],
[
-
1
,
-
1
,
-
1
,
-
1
],
[
-
1
,
-
1
,
-
1
,
-
1
]]],
dtype
=
np
.
float32
)
gt_classes
=
np
.
array
([[
2
,
1
,
-
1
],
[
1
,
-
1
,
-
1
]],
dtype
=
np
.
int32
)
gt_masks
=
np
.
ones
((
2
,
3
,
100
,
100
))
# Results will be checked in test_forward.
_
=
model
(
images
,
image_shape
,
anchor_boxes
,
gt_boxes
,
gt_classes
,
gt_masks
,
training
=
is_training
)
@
combinations
.
generate
(
combinations
.
combine
(
strategy
=
[
strategy_combinations
.
cloud_tpu_strategy
,
strategy_combinations
.
one_device_strategy_gpu
,
],
shared_backbone
=
[
True
,
False
],
shared_decoder
=
[
True
,
False
],
training
=
[
True
,
False
],
))
def
test_forward
(
self
,
strategy
,
training
,
shared_backbone
,
shared_decoder
):
num_classes
=
3
min_level
=
3
max_level
=
4
num_scales
=
3
aspect_ratios
=
[
1.0
]
anchor_size
=
3
segmentation_resnet_model_id
=
101
segmentation_output_stride
=
16
aspp_dilation_rates
=
[
6
,
12
,
18
]
aspp_decoder_level
=
int
(
np
.
math
.
log2
(
segmentation_output_stride
))
fpn_decoder_level
=
3
class_agnostic_bbox_pred
=
False
cascade_class_ensemble
=
False
image_size
=
(
256
,
256
)
images
=
np
.
random
.
rand
(
2
,
image_size
[
0
],
image_size
[
1
],
3
)
image_shape
=
np
.
array
([[
224
,
100
],
[
100
,
224
]])
shared_decoder
=
shared_decoder
and
shared_backbone
with
strategy
.
scope
():
anchor_boxes
=
anchor
.
Anchor
(
min_level
=
min_level
,
max_level
=
max_level
,
num_scales
=
num_scales
,
aspect_ratios
=
aspect_ratios
,
anchor_size
=
anchor_size
,
image_size
=
image_size
).
multilevel_boxes
num_anchors_per_location
=
len
(
aspect_ratios
)
*
num_scales
input_specs
=
tf
.
keras
.
layers
.
InputSpec
(
shape
=
[
None
,
None
,
None
,
3
])
backbone
=
resnet
.
ResNet
(
model_id
=
50
,
input_specs
=
input_specs
)
decoder
=
fpn
.
FPN
(
min_level
=
min_level
,
max_level
=
max_level
,
input_specs
=
backbone
.
output_specs
)
rpn_head
=
dense_prediction_heads
.
RPNHead
(
min_level
=
min_level
,
max_level
=
max_level
,
num_anchors_per_location
=
num_anchors_per_location
)
detection_head
=
instance_heads
.
DetectionHead
(
num_classes
=
num_classes
,
class_agnostic_bbox_pred
=
class_agnostic_bbox_pred
)
roi_generator_obj
=
roi_generator
.
MultilevelROIGenerator
()
roi_sampler_cascade
=
[]
roi_sampler_obj
=
roi_sampler
.
ROISampler
()
roi_sampler_cascade
.
append
(
roi_sampler_obj
)
roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
()
detection_generator_obj
=
detection_generator
.
DetectionGenerator
()
mask_head
=
instance_heads
.
MaskHead
(
num_classes
=
num_classes
,
upsample_factor
=
2
)
mask_sampler_obj
=
mask_sampler
.
MaskSampler
(
mask_target_size
=
28
,
num_sampled_masks
=
1
)
mask_roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
(
crop_size
=
14
)
if
shared_backbone
:
segmentation_backbone
=
None
else
:
segmentation_backbone
=
resnet
.
ResNet
(
model_id
=
segmentation_resnet_model_id
)
if
not
shared_decoder
:
level
=
aspp_decoder_level
segmentation_decoder
=
aspp
.
ASPP
(
level
=
level
,
dilation_rates
=
aspp_dilation_rates
)
else
:
level
=
fpn_decoder_level
segmentation_decoder
=
None
segmentation_head
=
segmentation_heads
.
SegmentationHead
(
num_classes
=
2
,
# stuff and common class for things,
level
=
level
,
num_convs
=
2
)
model
=
panoptic_maskrcnn_model
.
PanopticMaskRCNNModel
(
backbone
,
decoder
,
rpn_head
,
detection_head
,
roi_generator_obj
,
roi_sampler_obj
,
roi_aligner_obj
,
detection_generator_obj
,
mask_head
,
mask_sampler_obj
,
mask_roi_aligner_obj
,
segmentation_backbone
=
segmentation_backbone
,
segmentation_decoder
=
segmentation_decoder
,
segmentation_head
=
segmentation_head
,
class_agnostic_bbox_pred
=
class_agnostic_bbox_pred
,
cascade_class_ensemble
=
cascade_class_ensemble
,
min_level
=
min_level
,
max_level
=
max_level
,
num_scales
=
num_scales
,
aspect_ratios
=
aspect_ratios
,
anchor_size
=
anchor_size
)
gt_boxes
=
np
.
array
(
[[[
10
,
10
,
15
,
15
],
[
2.5
,
2.5
,
7.5
,
7.5
],
[
-
1
,
-
1
,
-
1
,
-
1
]],
[[
100
,
100
,
150
,
150
],
[
-
1
,
-
1
,
-
1
,
-
1
],
[
-
1
,
-
1
,
-
1
,
-
1
]]],
dtype
=
np
.
float32
)
gt_classes
=
np
.
array
([[
2
,
1
,
-
1
],
[
1
,
-
1
,
-
1
]],
dtype
=
np
.
int32
)
gt_masks
=
np
.
ones
((
2
,
3
,
100
,
100
))
results
=
model
(
images
,
image_shape
,
anchor_boxes
,
gt_boxes
,
gt_classes
,
gt_masks
,
training
=
training
)
self
.
assertIn
(
'rpn_boxes'
,
results
)
self
.
assertIn
(
'rpn_scores'
,
results
)
if
training
:
self
.
assertIn
(
'class_targets'
,
results
)
self
.
assertIn
(
'box_targets'
,
results
)
self
.
assertIn
(
'class_outputs'
,
results
)
self
.
assertIn
(
'box_outputs'
,
results
)
self
.
assertIn
(
'mask_outputs'
,
results
)
else
:
self
.
assertIn
(
'detection_boxes'
,
results
)
self
.
assertIn
(
'detection_scores'
,
results
)
self
.
assertIn
(
'detection_classes'
,
results
)
self
.
assertIn
(
'num_detections'
,
results
)
self
.
assertIn
(
'detection_masks'
,
results
)
self
.
assertIn
(
'segmentation_outputs'
,
results
)
self
.
assertAllEqual
(
[
2
,
image_size
[
0
]
//
(
2
**
level
),
image_size
[
1
]
//
(
2
**
level
),
2
],
results
[
'segmentation_outputs'
].
numpy
().
shape
)
@
combinations
.
generate
(
combinations
.
combine
(
shared_backbone
=
[
True
,
False
],
shared_decoder
=
[
True
,
False
]))
def
test_serialize_deserialize
(
self
,
shared_backbone
,
shared_decoder
):
input_specs
=
tf
.
keras
.
layers
.
InputSpec
(
shape
=
[
None
,
None
,
None
,
3
])
backbone
=
resnet
.
ResNet
(
model_id
=
50
,
input_specs
=
input_specs
)
decoder
=
fpn
.
FPN
(
min_level
=
3
,
max_level
=
7
,
input_specs
=
backbone
.
output_specs
)
rpn_head
=
dense_prediction_heads
.
RPNHead
(
min_level
=
3
,
max_level
=
7
,
num_anchors_per_location
=
3
)
detection_head
=
instance_heads
.
DetectionHead
(
num_classes
=
2
)
roi_generator_obj
=
roi_generator
.
MultilevelROIGenerator
()
roi_sampler_obj
=
roi_sampler
.
ROISampler
()
roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
()
detection_generator_obj
=
detection_generator
.
DetectionGenerator
()
segmentation_resnet_model_id
=
101
segmentation_output_stride
=
16
aspp_dilation_rates
=
[
6
,
12
,
18
]
aspp_decoder_level
=
int
(
np
.
math
.
log2
(
segmentation_output_stride
))
fpn_decoder_level
=
3
shared_decoder
=
shared_decoder
and
shared_backbone
mask_head
=
instance_heads
.
MaskHead
(
num_classes
=
2
,
upsample_factor
=
2
)
mask_sampler_obj
=
mask_sampler
.
MaskSampler
(
mask_target_size
=
28
,
num_sampled_masks
=
1
)
mask_roi_aligner_obj
=
roi_aligner
.
MultilevelROIAligner
(
crop_size
=
14
)
if
shared_backbone
:
segmentation_backbone
=
None
else
:
segmentation_backbone
=
resnet
.
ResNet
(
model_id
=
segmentation_resnet_model_id
)
if
not
shared_decoder
:
level
=
aspp_decoder_level
segmentation_decoder
=
aspp
.
ASPP
(
level
=
level
,
dilation_rates
=
aspp_dilation_rates
)
else
:
level
=
fpn_decoder_level
segmentation_decoder
=
None
segmentation_head
=
segmentation_heads
.
SegmentationHead
(
num_classes
=
2
,
# stuff and common class for things,
level
=
level
,
num_convs
=
2
)
model
=
panoptic_maskrcnn_model
.
PanopticMaskRCNNModel
(
backbone
,
decoder
,
rpn_head
,
detection_head
,
roi_generator_obj
,
roi_sampler_obj
,
roi_aligner_obj
,
detection_generator_obj
,
mask_head
,
mask_sampler_obj
,
mask_roi_aligner_obj
,
segmentation_backbone
=
segmentation_backbone
,
segmentation_decoder
=
segmentation_decoder
,
segmentation_head
=
segmentation_head
,
min_level
=
3
,
max_level
=
7
,
num_scales
=
3
,
aspect_ratios
=
[
1.0
],
anchor_size
=
3
)
config
=
model
.
get_config
()
new_model
=
panoptic_maskrcnn_model
.
PanopticMaskRCNNModel
.
from_config
(
config
)
# Validate that the config can be forced to JSON.
_
=
new_model
.
to_json
()
# If the serialization was successful, the new config should match the old.
self
.
assertAllEqual
(
model
.
get_config
(),
new_model
.
get_config
())
  @combinations.generate(
      combinations.combine(
          shared_backbone=[True, False],
          shared_decoder=[True, False]))
  def test_checkpoint(self, shared_backbone, shared_decoder):
    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=3,
        max_level=7,
        input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=3,
        max_level=7,
        num_anchors_per_location=3)
    detection_head = instance_heads.DetectionHead(num_classes=2)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()

    segmentation_resnet_model_id = 101
    segmentation_output_stride = 16
    aspp_dilation_rates = [6, 12, 18]
    aspp_decoder_level = int(np.math.log2(segmentation_output_stride))
    fpn_decoder_level = 3
    shared_decoder = shared_decoder and shared_backbone

    mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
    mask_sampler_obj = mask_sampler.MaskSampler(
        mask_target_size=28, num_sampled_masks=1)
    mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)

    if shared_backbone:
      segmentation_backbone = None
    else:
      segmentation_backbone = resnet.ResNet(
          model_id=segmentation_resnet_model_id)
    if not shared_decoder:
      level = aspp_decoder_level
      segmentation_decoder = aspp.ASPP(
          level=level, dilation_rates=aspp_dilation_rates)
    else:
      level = fpn_decoder_level
      segmentation_decoder = None
    segmentation_head = segmentation_heads.SegmentationHead(
        num_classes=2,  # stuff and common class for things
        level=level,
        num_convs=2)

    model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        segmentation_backbone=segmentation_backbone,
        segmentation_decoder=segmentation_decoder,
        segmentation_head=segmentation_head,
        min_level=3,
        max_level=7,
        num_scales=3,
        aspect_ratios=[1.0],
        anchor_size=3)

    expect_checkpoint_items = dict(
        backbone=backbone,
        decoder=decoder,
        rpn_head=rpn_head,
        detection_head=[detection_head])
    expect_checkpoint_items['mask_head'] = mask_head
    if not shared_backbone:
      expect_checkpoint_items['segmentation_backbone'] = segmentation_backbone
    if not shared_decoder:
      expect_checkpoint_items['segmentation_decoder'] = segmentation_decoder
    expect_checkpoint_items['segmentation_head'] = segmentation_head
    self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)

    # Test save and load checkpoints.
    ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
    save_dir = self.create_tempdir().full_path
    ckpt.save(os.path.join(save_dir, 'ckpt'))

    partial_ckpt = tf.train.Checkpoint(backbone=backbone)
    partial_ckpt.restore(tf.train.latest_checkpoint(
        save_dir)).expect_partial().assert_existing_objects_matched()

    partial_ckpt_mask = tf.train.Checkpoint(
        backbone=backbone, mask_head=mask_head)
    partial_ckpt_mask.restore(tf.train.latest_checkpoint(
        save_dir)).expect_partial().assert_existing_objects_matched()

    if not shared_backbone:
      partial_ckpt_segmentation = tf.train.Checkpoint(
          segmentation_backbone=segmentation_backbone,
          segmentation_decoder=segmentation_decoder,
          segmentation_head=segmentation_head)
    elif not shared_decoder:
      partial_ckpt_segmentation = tf.train.Checkpoint(
          segmentation_decoder=segmentation_decoder,
          segmentation_head=segmentation_head)
    else:
      partial_ckpt_segmentation = tf.train.Checkpoint(
          segmentation_head=segmentation_head)

    partial_ckpt_segmentation.restore(tf.train.latest_checkpoint(
        save_dir)).expect_partial().assert_existing_objects_matched()


if __name__ == '__main__':
  tf.test.main()
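The partial-restore pattern exercised above is the intended way to reuse pieces of a trained model. A minimal sketch, assuming `model` is a constructed PanopticMaskRCNNModel as in the tests; the checkpoint path is a placeholder:

import tensorflow as tf

# Save everything, exposing each component under its own name.
ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
ckpt.save('/tmp/panoptic_ckpt/ckpt')  # placeholder path

# Later, attach only the pieces you need, e.g. to warm-start another task.
backbone_ckpt = tf.train.Checkpoint(
    backbone=model.checkpoint_items['backbone'])
status = backbone_ckpt.restore(
    tf.train.latest_checkpoint('/tmp/panoptic_ckpt'))
status.expect_partial().assert_existing_objects_matched()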
official/vision/beta/projects/simclr/heads/simclr_head.py
View file @ 78c43ef1
...
@@ -97,7 +97,6 @@ class ProjectionHead(tf.keras.layers.Layer):
         'kernel_initializer': self._kernel_initializer,
         'kernel_regularizer': self._kernel_regularizer,
         'bias_regularizer': self._bias_regularizer,
-        'use_normalization': self._use_normalization,
         'norm_momentum': self._norm_momentum,
         'norm_epsilon': self._norm_epsilon
     }
...
official/vision/beta/projects/simclr/modeling/simclr_model.py
View file @ 78c43ef1
...
@@ -90,14 +90,15 @@ class SimCLRModel(tf.keras.Model):
     if training and self._mode == PRETRAIN:
       num_transforms = 2
+      # Split channels, and optionally apply extra batched augmentation.
+      # (bsz, h, w, c*num_transforms) -> [(bsz, h, w, c), ....]
+      features_list = tf.split(
+          inputs, num_or_size_splits=num_transforms, axis=-1)
+      # (num_transforms * bsz, h, w, c)
+      features = tf.concat(features_list, 0)
     else:
       num_transforms = 1
+      features = inputs
-    # Split channels, and optionally apply extra batched augmentation.
-    # (bsz, h, w, c*num_transforms) -> [(bsz, h, w, c), ....]
-    features_list = tf.split(
-        inputs, num_or_size_splits=num_transforms, axis=-1)
-    # (num_transforms * bsz, h, w, c)
-    features = tf.concat(features_list, 0)

     # Base network forward pass.
     endpoints = self._backbone(features, training=training)
...
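For intuition, this hunk moves the channel split into the pretraining branch, so evaluation now feeds the raw inputs straight to the backbone. A small standalone sketch of the tensor reshuffle (shapes are illustrative, not from the source):

import tensorflow as tf

# Two augmented views arrive stacked along channels: (bsz, h, w, c * 2).
inputs = tf.zeros([8, 32, 32, 6])
# Split back into per-view tensors, then stack along the batch dimension.
views = tf.split(inputs, num_or_size_splits=2, axis=-1)  # 2 x (8, 32, 32, 3)
features = tf.concat(views, 0)                           # (16, 32, 32, 3)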
official/vision/beta/projects/simclr/tasks/simclr.py
View file @ 78c43ef1
...
@@ -415,7 +415,8 @@ class SimCLRFinetuneTask(base_task.Task):
     backbone = backbones.factory.build_backbone(
         input_specs=input_specs,
-        model_config=model_config,
+        backbone_config=model_config.backbone,
+        norm_activation_config=model_config.norm_activation,
         l2_regularizer=l2_regularizer)
     norm_activation_config = model_config.norm_activation
...
official/vision/beta/projects/vit/README.md
0 → 100644
View file @ 78c43ef1
# Vision Transformer (ViT)

**DISCLAIMER**: This implementation is still under development. No support will
be provided during the development phase.

[![Paper]()](https://arxiv.org/abs/2010.11929)

This repository contains an implementation of the Vision Transformer (ViT) in
TensorFlow 2.

* Paper title: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf).
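A minimal usage sketch, assuming the modules introduced in this commit are on the Python path (the arguments shown are the defaults, which correspond to ViT-B/16):

import tensorflow as tf
from official.vision.beta.projects.vit.modeling import vit

# Build a ViT-B/16-sized backbone directly from the Keras model class.
network = vit.VisionTransformer(
    input_specs=tf.keras.layers.InputSpec(shape=[None, 224, 224, 3]))
outputs = network(tf.zeros([1, 224, 224, 3]))
print(outputs['pre_logits'].shape)  # (1, 1, 1, 768)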
official/vision/beta/projects/vit/configs/__init__.py
0 → 100644
View file @ 78c43ef1
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Configs package definition."""
from official.vision.beta.projects.vit.configs import image_classification
official/vision/beta/projects/vit/configs/backbones.py
0 → 100644
View file @ 78c43ef1
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Backbones configurations."""
from typing import Optional

import dataclasses

from official.modeling import hyperparams


@dataclasses.dataclass
class Transformer(hyperparams.Config):
  """Transformer config."""
  mlp_dim: int = 1
  num_heads: int = 1
  num_layers: int = 1
  attention_dropout_rate: float = 0.0
  dropout_rate: float = 0.1


@dataclasses.dataclass
class VisionTransformer(hyperparams.Config):
  """VisionTransformer config."""
  model_name: str = 'vit-b16'
  # pylint: disable=line-too-long
  classifier: str = 'token'  # 'token' or 'gap'. If set to 'token', an extra classification token is added to the sequence.
  # pylint: enable=line-too-long
  representation_size: int = 0
  hidden_size: int = 1
  patch_size: int = 16
  transformer: Transformer = Transformer()


@dataclasses.dataclass
class Backbone(hyperparams.OneOfConfig):
  """Configuration for backbones.

  Attributes:
    type: 'str', type of backbone to be used, one of the fields below.
    vit: vit backbone config.
  """
  type: Optional[str] = None
  vit: VisionTransformer = VisionTransformer()
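As a quick illustration of how this OneOfConfig is consumed downstream, a sketch (the model name shown is a placeholder choice):

from official.vision.beta.projects.vit.configs import backbones

backbone = backbones.Backbone(
    type='vit', vit=backbones.VisionTransformer(model_name='vit-l16'))
# OneOfConfig.get() returns the config instance named by `type`.
vit_cfg = backbone.get()
print(vit_cfg.model_name, vit_cfg.patch_size)  # vit-l16 16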
official/vision/beta/projects/vit/configs/image_classification.py
0 → 100644
View file @ 78c43ef1
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Image classification configuration definition."""
import os
from typing import List, Optional

import dataclasses

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.core import task_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.beta.configs import common
from official.vision.beta.configs import image_classification as img_cls_cfg
from official.vision.beta.projects.vit.configs import backbones
from official.vision.beta.tasks import image_classification

DataConfig = img_cls_cfg.DataConfig


@dataclasses.dataclass
class ImageClassificationModel(hyperparams.Config):
  """The model config."""
  num_classes: int = 0
  input_size: List[int] = dataclasses.field(default_factory=list)
  backbone: backbones.Backbone = backbones.Backbone(
      type='vit', vit=backbones.VisionTransformer())
  dropout_rate: float = 0.0
  norm_activation: common.NormActivation = common.NormActivation(
      use_sync_bn=False)
  # Adds a BatchNormalization layer pre-GlobalAveragePooling in classification.
  add_head_batch_norm: bool = False


@dataclasses.dataclass
class Losses(hyperparams.Config):
  one_hot: bool = True
  label_smoothing: float = 0.0
  l2_weight_decay: float = 0.0


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  top_k: int = 5


@dataclasses.dataclass
class ImageClassificationTask(cfg.TaskConfig):
  """The task config. Same as the classification task for convnets."""
  model: ImageClassificationModel = ImageClassificationModel()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(is_training=False)
  losses: Losses = Losses()
  evaluation: Evaluation = Evaluation()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: str = 'all'  # all or backbone


IMAGENET_TRAIN_EXAMPLES = 1281167
IMAGENET_VAL_EXAMPLES = 50000
IMAGENET_INPUT_PATH_BASE = 'imagenet-2012-tfrecord'

# TODO(b/177942984): integrate the experiments to TF-vision.
task_factory.register_task_cls(ImageClassificationTask)(
    image_classification.ImageClassificationTask)


@exp_factory.register_config_factory('vit_imagenet_pretrain')
def image_classification_imagenet_vit_pretrain() -> cfg.ExperimentConfig:
  """Image classification on imagenet with vision transformer."""
  train_batch_size = 4096
  eval_batch_size = 4096
  steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size
  config = cfg.ExperimentConfig(
      task=ImageClassificationTask(
          model=ImageClassificationModel(
              num_classes=1001,
              input_size=[224, 224, 3],
              backbone=backbones.Backbone(
                  type='vit',
                  vit=backbones.VisionTransformer(
                      model_name='vit-b16', representation_size=768))),
          losses=Losses(l2_weight_decay=0.0),
          train_data=DataConfig(
              input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size),
          validation_data=DataConfig(
              input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'),
              is_training=False,
              global_batch_size=eval_batch_size)),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=300 * steps_per_epoch,
          validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'adamw',
                  'adamw': {
                      'weight_decay_rate': 0.3,
                      'include_in_weight_decay': r'.*(kernel|weight):0$',
                  }
              },
              'learning_rate': {
                  'type': 'cosine',
                  'cosine': {
                      'initial_learning_rate': 0.003,
                      'decay_steps': 300 * steps_per_epoch,
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 10000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config


@exp_factory.register_config_factory('vit_imagenet_finetune')
def image_classification_imagenet_vit_finetune() -> cfg.ExperimentConfig:
  """Image classification on imagenet with vision transformer."""
  train_batch_size = 512
  eval_batch_size = 512
  steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size
  config = cfg.ExperimentConfig(
      task=ImageClassificationTask(
          model=ImageClassificationModel(
              num_classes=1001,
              input_size=[384, 384, 3],
              backbone=backbones.Backbone(
                  type='vit',
                  vit=backbones.VisionTransformer(model_name='vit-b16'))),
          losses=Losses(l2_weight_decay=0.0),
          train_data=DataConfig(
              input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size),
          validation_data=DataConfig(
              input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'),
              is_training=False,
              global_batch_size=eval_batch_size)),
      trainer=cfg.TrainerConfig(
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          train_steps=20000,
          validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size,
          validation_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'sgd',
                  'sgd': {
                      'momentum': 0.9,
                      'global_clipnorm': 1.0,
                  }
              },
              'learning_rate': {
                  'type': 'cosine',
                  'cosine': {
                      'initial_learning_rate': 0.003,
                      'decay_steps': 20000,
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
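A sketch of materializing one of the registered experiments; it assumes this configs package has been imported so the factory registrations above have run:

from official.core import exp_factory
# Importing the module triggers the register_config_factory calls above.
from official.vision.beta.projects.vit.configs import image_classification  # pylint: disable=unused-import

exp_config = exp_factory.get_exp_config('vit_imagenet_pretrain')
print(exp_config.task.model.backbone.vit.model_name)  # vit-b16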
official/vision/beta/projects/vit/modeling/vit.py
0 → 100644
View file @ 78c43ef1
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""VisionTransformer models."""
import tensorflow as tf

from official.modeling import activations
from official.nlp import keras_nlp
from official.vision.beta.modeling.backbones import factory

layers = tf.keras.layers

VIT_SPECS = {
    'vit-testing':
        dict(
            hidden_size=1,
            patch_size=16,
            transformer=dict(mlp_dim=1, num_heads=1, num_layers=1),
        ),
    'vit-b16':
        dict(
            hidden_size=768,
            patch_size=16,
            transformer=dict(mlp_dim=3072, num_heads=12, num_layers=12),
        ),
    'vit-b32':
        dict(
            hidden_size=768,
            patch_size=32,
            transformer=dict(mlp_dim=3072, num_heads=12, num_layers=12),
        ),
    'vit-l16':
        dict(
            hidden_size=1024,
            patch_size=16,
            transformer=dict(mlp_dim=4096, num_heads=16, num_layers=24),
        ),
    'vit-l32':
        dict(
            hidden_size=1024,
            patch_size=32,
            transformer=dict(mlp_dim=4096, num_heads=16, num_layers=24),
        ),
    'vit-h14':
        dict(
            hidden_size=1280,
            patch_size=14,
            transformer=dict(mlp_dim=5120, num_heads=16, num_layers=32),
        ),
}


class AddPositionEmbs(tf.keras.layers.Layer):
  """Adds (optionally learned) positional embeddings to the inputs."""

  def __init__(self, posemb_init=None, **kwargs):
    super().__init__(**kwargs)
    self.posemb_init = posemb_init

  def build(self, inputs_shape):
    pos_emb_shape = (1, inputs_shape[1], inputs_shape[2])
    self.pos_embedding = self.add_weight(
        'pos_embedding', pos_emb_shape, initializer=self.posemb_init)

  def call(self, inputs, inputs_positions=None):
    # inputs.shape is (batch_size, seq_len, emb_dim).
    pos_embedding = tf.cast(self.pos_embedding, inputs.dtype)
    return inputs + pos_embedding


class TokenLayer(tf.keras.layers.Layer):
  """A simple layer to wrap token parameters."""

  def build(self, inputs_shape):
    self.cls = self.add_weight(
        'cls', (1, 1, inputs_shape[-1]), initializer='zeros')

  def call(self, inputs):
    cls = tf.cast(self.cls, inputs.dtype)
    cls = cls + tf.zeros_like(inputs[:, 0:1])  # A hacky way to tile.
    x = tf.concat([cls, inputs], axis=1)
    return x


class Encoder(tf.keras.layers.Layer):
  """Transformer Encoder."""

  def __init__(self,
               num_layers,
               mlp_dim,
               num_heads,
               dropout_rate=0.1,
               attention_dropout_rate=0.1,
               kernel_regularizer=None,
               inputs_positions=None,
               **kwargs):
    super().__init__(**kwargs)
    self._num_layers = num_layers
    self._mlp_dim = mlp_dim
    self._num_heads = num_heads
    self._dropout_rate = dropout_rate
    self._attention_dropout_rate = attention_dropout_rate
    self._kernel_regularizer = kernel_regularizer
    self._inputs_positions = inputs_positions

  def build(self, input_shape):
    self._pos_embed = AddPositionEmbs(
        posemb_init=tf.keras.initializers.RandomNormal(stddev=0.02),
        name='posembed_input')
    self._dropout = layers.Dropout(rate=self._dropout_rate)

    self._encoder_layers = []
    # Set layer norm epsilons to 1e-6 to be consistent with JAX implementation.
    # https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.LayerNorm.html
    for _ in range(self._num_layers):
      encoder_layer = keras_nlp.layers.TransformerEncoderBlock(
          inner_activation=activations.gelu,
          num_attention_heads=self._num_heads,
          inner_dim=self._mlp_dim,
          output_dropout=self._dropout_rate,
          attention_dropout=self._attention_dropout_rate,
          kernel_regularizer=self._kernel_regularizer,
          norm_first=True,
          norm_epsilon=1e-6)
      self._encoder_layers.append(encoder_layer)
    self._norm = layers.LayerNormalization(epsilon=1e-6)
    super().build(input_shape)

  def call(self, inputs, training=None):
    x = self._pos_embed(inputs, inputs_positions=self._inputs_positions)
    x = self._dropout(x, training=training)
    for encoder_layer in self._encoder_layers:
      x = encoder_layer(x, training=training)
    x = self._norm(x)
    return x


class VisionTransformer(tf.keras.Model):
  """Class to build VisionTransformer family model."""

  def __init__(self,
               mlp_dim=3072,
               num_heads=12,
               num_layers=12,
               attention_dropout_rate=0.0,
               dropout_rate=0.1,
               input_specs=layers.InputSpec(shape=[None, None, None, 3]),
               patch_size=16,
               hidden_size=768,
               representation_size=0,
               classifier='token',
               kernel_regularizer=None):
    """VisionTransformer initialization function."""
    inputs = tf.keras.Input(shape=input_specs.shape[1:])

    x = layers.Conv2D(
        filters=hidden_size,
        kernel_size=patch_size,
        strides=patch_size,
        padding='valid',
        kernel_regularizer=kernel_regularizer)(
            inputs)
    if tf.keras.backend.image_data_format() == 'channels_last':
      rows_axis, cols_axis = (1, 2)
    else:
      rows_axis, cols_axis = (2, 3)
      # The reshape below assumes the data_format is 'channels_last,' so
      # transpose to that. Once the data is flattened by the reshape, the
      # data_format is irrelevant, so no need to update
      # tf.keras.backend.image_data_format.
      x = tf.transpose(x, perm=[0, 2, 3, 1])
    seq_len = (input_specs.shape[rows_axis] // patch_size) * (
        input_specs.shape[cols_axis] // patch_size)
    x = tf.reshape(x, [-1, seq_len, hidden_size])

    # If we want to add a class token, add it here.
    if classifier == 'token':
      x = TokenLayer(name='cls')(x)

    x = Encoder(
        num_layers=num_layers,
        mlp_dim=mlp_dim,
        num_heads=num_heads,
        dropout_rate=dropout_rate,
        attention_dropout_rate=attention_dropout_rate,
        kernel_regularizer=kernel_regularizer)(
            x)

    if classifier == 'token':
      x = x[:, 0]
    elif classifier == 'gap':
      x = tf.reduce_mean(x, axis=1)

    if representation_size:
      x = tf.keras.layers.Dense(
          representation_size,
          kernel_regularizer=kernel_regularizer,
          name='pre_logits')(
              x)
      x = tf.nn.tanh(x)
    else:
      x = tf.identity(x, name='pre_logits')
    endpoints = {
        'pre_logits':
            tf.reshape(x, [-1, 1, 1, representation_size or hidden_size])
    }

    super(VisionTransformer, self).__init__(inputs=inputs, outputs=endpoints)


@factory.register_backbone_builder('vit')
def build_vit(input_specs,
              backbone_config,
              norm_activation_config,
              l2_regularizer=None):
  """Build ViT model."""
  del norm_activation_config
  backbone_type = backbone_config.type
  backbone_cfg = backbone_config.get()
  assert backbone_type == 'vit', (f'Inconsistent backbone type '
                                  f'{backbone_type}')
  backbone_cfg.override(VIT_SPECS[backbone_cfg.model_name])

  return VisionTransformer(
      mlp_dim=backbone_cfg.transformer.mlp_dim,
      num_heads=backbone_cfg.transformer.num_heads,
      num_layers=backbone_cfg.transformer.num_layers,
      attention_dropout_rate=backbone_cfg.transformer.attention_dropout_rate,
      dropout_rate=backbone_cfg.transformer.dropout_rate,
      input_specs=input_specs,
      patch_size=backbone_cfg.patch_size,
      hidden_size=backbone_cfg.hidden_size,
      representation_size=backbone_cfg.representation_size,
      classifier=backbone_cfg.classifier,
      kernel_regularizer=l2_regularizer)
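A sketch of driving the registered builder directly, mirroring how the backbone factory would invoke it (ViT ignores norm_activation_config, so None is passed here):

import tensorflow as tf
from official.vision.beta.projects.vit.configs import backbones
from official.vision.beta.projects.vit.modeling import vit

model = vit.build_vit(
    input_specs=tf.keras.layers.InputSpec(shape=[None, 224, 224, 3]),
    backbone_config=backbones.Backbone(
        type='vit', vit=backbones.VisionTransformer(model_name='vit-b16')),
    norm_activation_config=None)
print(model.count_params())  # 85798656, matching the test below.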
official/vision/beta/projects/vit/modeling/vit_test.py
0 → 100644
View file @ 78c43ef1
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for VIT."""
from absl.testing import parameterized
import tensorflow as tf

from official.vision.beta.projects.vit.modeling import vit


class VisionTransformerTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      (224, 85798656),
      (256, 85844736),
  )
  def test_network_creation(self, input_size, params_count):
    """Test creation of VisionTransformer family models."""
    tf.keras.backend.set_image_data_format('channels_last')
    input_specs = tf.keras.layers.InputSpec(
        shape=[2, input_size, input_size, 3])
    network = vit.VisionTransformer(input_specs=input_specs)

    inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
    _ = network(inputs)
    self.assertEqual(network.count_params(), params_count)


if __name__ == '__main__':
  tf.test.main()
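For reference, the expected parameter counts decompose as follows for the default ViT-B/16 configuration (hidden size 768, 12 encoder blocks). Patch embedding: 16*16*3*768 + 768 = 590,592. Class token: 768. Position embeddings at 224x224 (196 patches + 1 token): 197*768 = 151,296. Each encoder block: 2,362,368 (attention) + 4,722,432 (MLP) + 3,072 (two layer norms) = 7,087,872, so 85,054,464 across 12 blocks. Final layer norm: 1,536. Total: 85,798,656. At 256x256 only the position-embedding table grows (257*768 = 197,376, i.e. +46,080 parameters), giving 85,844,736, which matches the second parameterized case.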