Commit cf80ed4e authored by anivegesana

Merge branch 'purdue-yolo' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 394cefcc 461b3587
......@@ -12,80 +12,124 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Contains the factory method to create decoders."""
"""Decoder registers and factory method.
from typing import Mapping, Optional
One can register a new decoder model in two steps:
1. Import the factory and register the builder in the decoder file.
2. Import the decoder class in decoders/__init__.py.
```
# my_decoder.py
from modeling.decoders import factory
class MyDecoder():
...
@factory.register_decoder_builder('my_decoder')
def build_my_decoder():
return MyDecoder()
# decoders/__init__.py adds import
from modeling.decoders.my_decoder import MyDecoder
```
If the MyDecoder class should be used only by a specific binary, do not
import the decoder module in decoders/__init__.py; instead, import it in
the place that uses it.
"""
from typing import Any, Callable, Mapping, Optional, Union
# Import libraries
import tensorflow as tf
from official.core import registry
from official.modeling import hyperparams
from official.vision.beta.modeling import decoders
_REGISTERED_DECODER_CLS = {}
def register_decoder_builder(key: str) -> Callable[..., Any]:
"""Decorates a builder of decoder class.
The builder should be a Callable (a class or a function).
This decorator supports registration of decoder builder as follows:
```
class MyDecoder(tf.keras.Model):
pass
@register_decoder_builder('mydecoder')
def builder(input_specs, config, l2_reg):
return MyDecoder(...)
# Builds a MyDecoder object.
my_decoder = build_decoder(input_specs, config, l2_reg)
```
Args:
key: A `str` of key to look up the builder.
Returns:
A callable for use as a class decorator that registers the decorated
builder for lookup via `build_decoder`.
"""
return registry.register(_REGISTERED_DECODER_CLS, key)
@register_decoder_builder('identity')
def build_identity(
input_specs: Optional[Mapping[str, tf.TensorShape]] = None,
model_config: Optional[hyperparams.Config] = None,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None) -> None:
"""Builds identity decoder from a config.
The input arguments are not used by the identity decoder; they are kept to
ensure the builder interface stays consistent.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to
None.
Returns:
None, so that features from the backbone are passed through unchanged.
"""
del input_specs, model_config, l2_regularizer # Unused by identity decoder.
def build_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
**kwargs) -> Union[None, tf.keras.Model, tf.keras.layers.Layer]:
"""Builds decoder from a config.
A decoder can be a keras.Model, a keras.layers.Layer, or None. If it is not
None, the decoder will take features from the backbone as input and generate
decoded feature maps. If it is None, such as an identity decoder, the decoder
is skipped and features from the backbone are regarded as model output.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A OneOfConfig. Model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to
None.
**kwargs: Additional keyword args to be passed to decoder builder.
Returns:
A `tf.keras.Model` instance of the decoder.
An instance of the decoder.
"""
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
norm_activation_config = model_config.norm_activation
if decoder_type == 'identity':
decoder = None
elif decoder_type == 'fpn':
decoder = decoders.FPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif decoder_type == 'nasfpn':
decoder = decoders.NASFPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
num_repeats=decoder_cfg.num_repeats,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif decoder_type == 'aspp':
decoder = decoders.ASPP(
level=decoder_cfg.level,
dilation_rates=decoder_cfg.dilation_rates,
num_filters=decoder_cfg.num_filters,
pool_kernel_size=decoder_cfg.pool_kernel_size,
dropout_rate=decoder_cfg.dropout_rate,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
activation=norm_activation_config.activation,
kernel_regularizer=l2_regularizer)
else:
raise ValueError('Decoder {!r} not implement'.format(decoder_type))
return decoder
decoder_builder = registry.lookup(_REGISTERED_DECODER_CLS,
model_config.decoder.type)
return decoder_builder(
input_specs=input_specs,
model_config=model_config,
l2_regularizer=l2_regularizer,
**kwargs)
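As a quick usage sketch (mirroring the tests below), a decoder is built purely from `model_config.decoder.type`; since the result may be `None` (identity), a `tf.keras.layers.Layer`, or a `tf.keras.Model`, callers should handle the `None` case:

```python
import tensorflow as tf

from official.vision.beta import configs
from official.vision.beta.configs import decoders as decoders_cfg
from official.vision.beta.modeling.decoders import factory

# {level: TensorShape} specs as produced by a backbone.
input_specs = {
    str(level): tf.TensorShape([1, 128 // 2**level, 128 // 2**level, 3])
    for level in range(3, 7)
}

model_config = configs.retinanet.RetinaNet()
model_config.min_level = 3
model_config.max_level = 7
model_config.decoder = decoders_cfg.Decoder(
    type='fpn', fpn=decoders_cfg.FPN(num_filters=256))

decoder = factory.build_decoder(
    input_specs=input_specs, model_config=model_config)
if decoder is None:
  # Identity decoder: backbone features are used as the model output.
  pass
```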
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for decoder factory functions."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from official.vision.beta import configs
from official.vision.beta.configs import decoders as decoders_cfg
from official.vision.beta.modeling import decoders
from official.vision.beta.modeling.decoders import factory
class FactoryTest(tf.test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(
num_filters=[128, 256], use_separable_conv=[True, False]))
def test_fpn_decoder_creation(self, num_filters, use_separable_conv):
"""Test creation of FPN decoder."""
min_level = 3
max_level = 7
input_specs = {}
for level in range(min_level, max_level):
input_specs[str(level)] = tf.TensorShape(
[1, 128 // (2**level), 128 // (2**level), 3])
network = decoders.FPN(
input_specs=input_specs,
num_filters=num_filters,
use_separable_conv=use_separable_conv,
use_sync_bn=True)
model_config = configs.retinanet.RetinaNet()
model_config.min_level = min_level
model_config.max_level = max_level
model_config.num_classes = 10
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='fpn',
fpn=decoders_cfg.FPN(
num_filters=num_filters, use_separable_conv=use_separable_conv))
factory_network = factory.build_decoder(
input_specs=input_specs, model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(
num_filters=[128, 256],
num_repeats=[3, 5],
use_separable_conv=[True, False]))
def test_nasfpn_decoder_creation(self, num_filters, num_repeats,
use_separable_conv):
"""Test creation of NASFPN decoder."""
min_level = 3
max_level = 7
input_specs = {}
for level in range(min_level, max_level):
input_specs[str(level)] = tf.TensorShape(
[1, 128 // (2**level), 128 // (2**level), 3])
network = decoders.NASFPN(
input_specs=input_specs,
num_filters=num_filters,
num_repeats=num_repeats,
use_separable_conv=use_separable_conv,
use_sync_bn=True)
model_config = configs.retinanet.RetinaNet()
model_config.min_level = min_level
model_config.max_level = max_level
model_config.num_classes = 10
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='nasfpn',
nasfpn=decoders_cfg.NASFPN(
num_filters=num_filters,
num_repeats=num_repeats,
use_separable_conv=use_separable_conv))
factory_network = factory.build_decoder(
input_specs=input_specs, model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(
level=[3, 4],
dilation_rates=[[6, 12, 18], [6, 12]],
num_filters=[128, 256]))
def test_aspp_decoder_creation(self, level, dilation_rates, num_filters):
"""Test creation of ASPP decoder."""
input_specs = {'1': tf.TensorShape([1, 128, 128, 3])}
network = decoders.ASPP(
level=level,
dilation_rates=dilation_rates,
num_filters=num_filters,
use_sync_bn=True)
model_config = configs.semantic_segmentation.SemanticSegmentationModel()
model_config.num_classes = 10
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='aspp',
aspp=decoders_cfg.ASPP(
level=level, dilation_rates=dilation_rates,
num_filters=num_filters))
factory_network = factory.build_decoder(
input_specs=input_specs, model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
def test_identity_decoder_creation(self):
"""Test creation of identity decoder."""
model_config = configs.retinanet.RetinaNet()
model_config.num_classes = 2
model_config.input_size = [None, None, 3]
model_config.decoder = decoders_cfg.Decoder(
type='identity', identity=decoders_cfg.Identity())
factory_network = factory.build_decoder(
input_specs=None, model_config=model_config)
self.assertIsNone(factory_network)
if __name__ == '__main__':
tf.test.main()
......@@ -16,9 +16,12 @@
from typing import Any, Mapping, Optional
# Import libraries
import tensorflow as tf
from official.modeling import hyperparams
from official.modeling import tf_utils
from official.vision.beta.modeling.decoders import factory
from official.vision.beta.ops import spatial_transform_ops
......@@ -187,3 +190,43 @@ class FPN(tf.keras.Model):
def output_specs(self) -> Mapping[str, tf.TensorShape]:
"""A dict of {level: TensorShape} pairs for the model output."""
return self._output_specs
@factory.register_decoder_builder('fpn')
def build_fpn_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds FPN decoder from a config.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
None.
Returns:
A `tf.keras.Model` instance of the FPN decoder.
Raises:
ValueError: If the model_config.decoder.type is not `fpn`.
"""
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
if decoder_type != 'fpn':
raise ValueError(f'Inconsistent decoder type {decoder_type}. '
'Need to be `fpn`.')
norm_activation_config = model_config.norm_activation
return FPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
......@@ -19,6 +19,7 @@
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.modeling.backbones import mobilenet
from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import fpn
......@@ -52,6 +53,33 @@ class FPNTest(parameterized.TestCase, tf.test.TestCase):
[1, input_size // 2**level, input_size // 2**level, 256],
feats[str(level)].shape.as_list())
@parameterized.parameters(
(256, 3, 7, False),
(256, 3, 7, True),
)
def test_network_creation_with_mobilenet(self, input_size, min_level,
max_level, use_separable_conv):
"""Test creation of FPN with mobilenet backbone."""
tf.keras.backend.set_image_data_format('channels_last')
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
backbone = mobilenet.MobileNet(model_id='MobileNetV2')
network = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
endpoints = backbone(inputs)
feats = network(endpoints)
for level in range(min_level, max_level + 1):
self.assertIn(str(level), feats)
self.assertAllEqual(
[1, input_size // 2**level, input_size // 2**level, 256],
feats[str(level)].shape.as_list())
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
......
......@@ -13,12 +13,16 @@
# limitations under the License.
"""Contains definitions of NAS-FPN."""
from typing import Any, Mapping, List, Tuple, Optional
from typing import Any, List, Mapping, Optional, Tuple
# Import libraries
from absl import logging
import tensorflow as tf
from official.modeling import hyperparams
from official.vision.beta.modeling.decoders import factory
from official.vision.beta.ops import spatial_transform_ops
......@@ -316,3 +320,45 @@ class NASFPN(tf.keras.Model):
def output_specs(self) -> Mapping[str, tf.TensorShape]:
"""A dict of {level: TensorShape} pairs for the model output."""
return self._output_specs
@factory.register_decoder_builder('nasfpn')
def build_nasfpn_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds NASFPN decoder from a config.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone.
model_config: A `OneOfConfig` of model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
None.
Returns:
A `tf.keras.Model` instance of the NASFPN decoder.
Raises:
ValueError: If the model_config.decoder.type is not `nasfpn`.
"""
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
if decoder_type != 'nasfpn':
raise ValueError(f'Inconsistent decoder type {decoder_type}. '
'Need to be `nasfpn`.')
norm_activation_config = model_config.norm_activation
return NASFPN(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_filters=decoder_cfg.num_filters,
num_repeats=decoder_cfg.num_repeats,
use_separable_conv=decoder_cfg.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
......@@ -24,10 +24,10 @@ from official.vision.beta.configs import retinanet as retinanet_cfg
from official.vision.beta.configs import semantic_segmentation as segmentation_cfg
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling import classification_model
from official.vision.beta.modeling import decoders
from official.vision.beta.modeling import maskrcnn_model
from official.vision.beta.modeling import retinanet_model
from official.vision.beta.modeling import segmentation_model
from official.vision.beta.modeling.decoders import factory as decoder_factory
from official.vision.beta.modeling.heads import dense_prediction_heads
from official.vision.beta.modeling.heads import instance_heads
from official.vision.beta.modeling.heads import segmentation_heads
......@@ -78,7 +78,7 @@ def build_maskrcnn(
l2_regularizer=l2_regularizer)
backbone(tf.keras.Input(input_specs.shape[1:]))
decoder = decoder_factory.build_decoder(
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
......@@ -253,7 +253,7 @@ def build_retinanet(
l2_regularizer=l2_regularizer)
backbone(tf.keras.Input(input_specs.shape[1:]))
decoder = decoder_factory.build_decoder(
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
......@@ -313,7 +313,7 @@ def build_segmentation_model(
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
decoder = decoder_factory.build_decoder(
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
......
......@@ -26,10 +26,6 @@ from official.modeling import tf_utils
States = Dict[str, tf.Tensor]
Activation = Union[str, Callable]
# TODO(dankondratyuk): keep legacy padding until new checkpoints are trained.
# Otherwise, accuracy will be affected.
LEGACY_PADDING = True
def make_divisible(value: float,
divisor: int,
......@@ -89,6 +85,22 @@ def hard_swish(x: tf.Tensor) -> tf.Tensor:
tf.keras.utils.get_custom_objects().update({'hard_swish': hard_swish})
def simple_swish(x: tf.Tensor) -> tf.Tensor:
"""A swish/silu activation function without custom gradients.
Useful for exporting to SavedModel to avoid custom gradient warnings.
Args:
x: the input tensor.
Returns:
The activation output.
"""
return x * tf.math.sigmoid(x)
tf.keras.utils.get_custom_objects().update({'simple_swish': simple_swish})
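Because `simple_swish` is registered with the Keras custom objects above, it can be referenced by its string name, just like `hard_swish`; a small sketch:

```python
import tensorflow as tf

# Importing nn_layers registers 'simple_swish' with Keras custom objects.
from official.vision.beta.modeling.layers import nn_layers  # noqa: F401

activation = tf.keras.layers.Activation('simple_swish')
outputs = activation(tf.constant([-1.0, 0.0, 1.0]))  # computes x * sigmoid(x)
```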
@tf.keras.utils.register_keras_serializable(package='Vision')
class SqueezeExcitation(tf.keras.layers.Layer):
"""Creates a squeeze and excitation layer."""
......@@ -752,14 +764,10 @@ class CausalConvMixin:
(self.kernel_size[i] - 1) * (self.dilation_rate[i] - 1))
for i in range(self.rank)
]
if LEGACY_PADDING:
# Apply legacy padding that does not take into account spatial strides
pad_total = [kernel_size_effective[i] - 1 for i in range(self.rank)]
else:
pad_total = [kernel_size_effective[0] - 1]
for i in range(1, self.rank):
overlap = (input_shape[i] - 1) % self.strides[i] + 1
pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))
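# Pad the leading (causal) dimension by the full effective kernel size minus
# one; for the spatial dimensions, compute SAME-style padding that accounts
# for how much of the effective kernel already overlaps the input given the
# stride.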
pad_total = [kernel_size_effective[0] - 1]
for i in range(1, self.rank):
overlap = (input_shape[i] - 1) % self.strides[i] + 1
pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0))
pad_beg = [pad_total[i] // 2 for i in range(self.rank)]
pad_end = [pad_total[i] - pad_beg[i] for i in range(self.rank)]
padding = [[pad_beg[i], pad_end[i]] for i in range(self.rank)]
......
......@@ -24,10 +24,6 @@ from official.vision.beta.modeling.layers import nn_layers
class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
def setUp(self):
super().setUp()
nn_layers.LEGACY_PADDING = False
def test_hard_swish(self):
activation = tf.keras.layers.Activation('hard_swish')
output = activation(tf.constant([-3, -1.5, 0, 3]))
......
......@@ -50,6 +50,60 @@ def yxyx_to_xywh(boxes):
return new_boxes
def yxyx_to_cycxhw(boxes):
"""Converts box corner coordinates to center plus height and width terms.
Args:
boxes: a `Tensor` with last dimension of 4, representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a `Tensor` with the same shape as the input boxes, in the format
of cy, cx, height, width.
Raises:
ValueError: if the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('Last dimension of boxes must be 4 but is {:d}'.format(
boxes.shape[-1]))
boxes_ycenter = (boxes[..., 0] + boxes[..., 2]) / 2
boxes_xcenter = (boxes[..., 1] + boxes[..., 3]) / 2
boxes_height = boxes[..., 2] - boxes[..., 0]
boxes_width = boxes[..., 3] - boxes[..., 1]
new_boxes = tf.stack(
[boxes_ycenter, boxes_xcenter, boxes_height, boxes_width], axis=-1)
return new_boxes
def cycxhw_to_yxyx(boxes):
"""Converts box center coordinates plus height and width terms to corner.
Args:
boxes: a `Tensor` whose last dimension is 4, representing the coordinates
of boxes in cy, cx, height, width order.
Returns:
boxes: a `Tensor` with the same shape as the input, in ymin, xmin, ymax, xmax order.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
boxes_ymin = boxes[..., 0] - boxes[..., 2] / 2
boxes_xmin = boxes[..., 1] - boxes[..., 3] / 2
boxes_ymax = boxes[..., 0] + boxes[..., 2] / 2
boxes_xmax = boxes[..., 1] + boxes[..., 3] / 2
new_boxes = tf.stack([
boxes_ymin, boxes_xmin, boxes_ymax, boxes_xmax], axis=-1)
return new_boxes
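For illustration, a minimal round trip between the two formats (a sketch with arbitrary values, assuming this module is importable as `official.vision.beta.ops.box_ops`):

```python
import tensorflow as tf

from official.vision.beta.ops import box_ops

# One box in ymin, xmin, ymax, xmax order.
boxes = tf.constant([[10.0, 20.0, 50.0, 100.0]])

centers = box_ops.yxyx_to_cycxhw(boxes)   # [[30., 60., 40., 80.]] (cy, cx, h, w)
corners = box_ops.cycxhw_to_yxyx(centers)  # back to [[10., 20., 50., 100.]]
```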
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
......
......@@ -8,6 +8,8 @@ This repository is the official implementation of
[MoViNets: Mobile Video Networks for Efficient Video
Recognition](https://arxiv.org/abs/2103.11511).
**[UPDATE 2021-07-12] Mobile Models Available via [TF Lite](#tf-lite-streaming-models)**
<p align="center">
<img src="https://storage.googleapis.com/tf_model_garden/vision/movinet/artifacts/hoverboard_stream.gif" height=500>
</p>
......@@ -53,6 +55,8 @@ approach that performs redundant computation and limits temporal scope.
## History
- **2021-07-12** Add TF Lite support and replace 3D stream models with
mobile-friendly (2+1)D stream.
- **2021-05-30** Add streaming MoViNet checkpoints and examples.
- **2021-05-11** Initial Commit.
......@@ -68,6 +72,7 @@ approach that performs redundant computation and limits temporal scope.
- [Results and Pretrained Weights](#results-and-pretrained-weights)
- [Kinetics 600](#kinetics-600)
- [Prediction Examples](#prediction-examples)
- [TF Lite Example](#tf-lite-example)
- [Training and Evaluation](#training-and-evaluation)
- [References](#references)
- [License](#license)
......@@ -108,10 +113,14 @@ MoViNet-A5.
#### Base Models
Base models implement standard 3D convolutions without stream buffers.
Base models implement standard 3D convolutions without stream buffers. Base
models are not recommended for fast inference on CPU or mobile due to
limited support for
[`tf.nn.conv3d`](https://www.tensorflow.org/api_docs/python/tf/nn/conv3d).
Instead, see the [streaming models section](#streaming-models).
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape | GFLOPs\* | Chekpoint | TF Hub SavedModel |
|------------|----------------|----------------|-------------|----------|-----------|-------------------|
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape | GFLOPs\* | Checkpoint | TF Hub SavedModel |
|------------|----------------|----------------|-------------|----------|------------|-------------------|
| MoViNet-A0-Base | 72.28 | 90.92 | 50 x 172 x 172 | 2.7 | [checkpoint (12 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_base.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a0/base/kinetics-600/classification/) |
| MoViNet-A1-Base | 76.69 | 93.40 | 50 x 172 x 172 | 6.0 | [checkpoint (18 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a1_base.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a1/base/kinetics-600/classification/) |
| MoViNet-A2-Base | 78.62 | 94.17 | 50 x 224 x 224 | 10 | [checkpoint (20 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a2_base.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a2/base/kinetics-600/classification/) |
......@@ -123,10 +132,19 @@ Base models implement standard 3D convolutions without stream buffers.
#### Streaming Models
Streaming models implement causal 3D convolutions with stream buffers.
Streaming models implement causal (2+1)D convolutions with stream buffers.
Streaming models use (2+1)D convolution instead of 3D to utilize optimized
[`tf.nn.conv2d`](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d)
operations, which offer fast inference on CPU. Streaming models can be run on
individual frames or on larger video clips like base models.
Note: A3, A4, and A5 models use a positional encoding in the squeeze-excitation
blocks, while A0, A1, and A2 do not. Without positional encoding, accuracy is
unaffected for the smaller models but significantly worse for the larger ones.
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape\* | GFLOPs\*\* | Chekpoint | TF Hub SavedModel |
|------------|----------------|----------------|---------------|------------|-----------|-------------------|
| Model Name | Top-1 Accuracy | Top-5 Accuracy | Input Shape\* | GFLOPs\*\* | Checkpoint | TF Hub SavedModel |
|------------|----------------|----------------|---------------|------------|------------|-------------------|
| MoViNet-A0-Stream | 72.05 | 90.63 | 50 x 172 x 172 | 2.7 | [checkpoint (12 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_stream.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a0/stream/kinetics-600/classification/) |
| MoViNet-A1-Stream | 76.45 | 93.25 | 50 x 172 x 172 | 6.0 | [checkpoint (18 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a1_stream.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a1/stream/kinetics-600/classification/) |
| MoViNet-A2-Stream | 78.40 | 94.05 | 50 x 224 x 224 | 10 | [checkpoint (20 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a2_stream.tar.gz) | [tfhub](https://tfhub.dev/tensorflow/movinet/a2/stream/kinetics-600/classification/) |
......@@ -139,6 +157,35 @@ duration of the 10-second clip.
\*\*GFLOPs per video on Kinetics 600.
Note: current streaming model checkpoints have been updated with a slightly
different architecture. To download the old checkpoints, insert `_legacy` before
`.tar.gz` in the URL. E.g., `movinet_a0_stream_legacy.tar.gz`.
##### TF Lite Streaming Models
For convenience, we provide converted TF Lite models for inference on mobile
devices. See the [TF Lite Example](#tf-lite-example) to export and run your own
models.
For reference, MoViNet-A0-Stream runs with a similar latency to
[MobileNetV3-Large](https://tfhub.dev/google/imagenet/mobilenet_v3_large_100_224/classification/)
with +5% accuracy on Kinetics 600.
| Model Name | Input Shape | Pixel 4 Latency\* | x86 Latency\* | TF Lite Binary |
|------------|-------------|-------------------|---------------|----------------|
| MoViNet-A0-Stream | 1 x 1 x 172 x 172 | 22 ms | 16 ms | [TF Lite (13 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_stream.tflite) |
| MoViNet-A1-Stream | 1 x 1 x 172 x 172 | 42 ms | 33 ms | [TF Lite (45 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a1_stream.tflite) |
| MoViNet-A2-Stream | 1 x 1 x 224 x 224 | 200 ms | 66 ms | [TF Lite (53 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a2_stream.tflite) |
| MoViNet-A3-Stream | 1 x 1 x 256 x 256 | - | 120 ms | [TF Lite (73 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a3_stream.tflite) |
| MoViNet-A4-Stream | 1 x 1 x 290 x 290 | - | 300 ms | [TF Lite (101 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a4_stream.tflite) |
| MoViNet-A5-Stream | 1 x 1 x 320 x 320 | - | 450 ms | [TF Lite (153 MB)](https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a5_stream.tflite) |
\*Single-frame latency measured with unaltered float32 operations on a
single CPU core. Observed latency may differ depending on hardware
configuration. Measured on a stock Pixel 4 (Android 11) and x86 Intel Xeon
W-2135 CPU.
## Prediction Examples
Please check out our [Colab Notebook](https://colab.research.google.com/github/tensorflow/models/tree/master/official/vision/beta/projects/movinet/movinet_tutorial.ipynb)
......@@ -146,7 +193,7 @@ to get started with MoViNets.
This section provides examples on how to run prediction.
For base models, run the following:
For **base models**, run the following:
```python
import tensorflow as tf
......@@ -181,7 +228,7 @@ output = model(inputs)
prediction = tf.argmax(output, -1)
```
For streaming models, run the following:
For **streaming models**, run the following:
```python
import tensorflow as tf
......@@ -189,20 +236,31 @@ import tensorflow as tf
from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model
model_id = 'a0'
use_positional_encoding = model_id in {'a3', 'a4', 'a5'}
# Create backbone and model.
backbone = movinet.Movinet(
model_id='a0',
model_id=model_id,
causal=True,
conv_type='2plus1d',
se_type='2plus3d',
activation='hard_swish',
gating_activation='hard_sigmoid',
use_positional_encoding=use_positional_encoding,
use_external_states=True,
)
model = movinet_model.MovinetClassifier(
backbone, num_classes=600, output_states=True)
backbone,
num_classes=600,
output_states=True)
# Create your example input here.
# Refer to the paper for recommended input shapes.
inputs = tf.ones([1, 8, 172, 172, 3])
# [Optional] Build the model and load a pretrained checkpoint
# [Optional] Build the model and load a pretrained checkpoint.
model.build(inputs.shape)
checkpoint_dir = '/path/to/checkpoint'
......@@ -237,23 +295,89 @@ non_streaming_output, _ = model({**init_states, 'image': inputs})
non_streaming_prediction = tf.argmax(non_streaming_output, -1)
```
## TF Lite Example
This section outlines an example of how to export a model to run on mobile
devices with [TF Lite](https://www.tensorflow.org/lite).
First, convert to [TF SavedModel](https://www.tensorflow.org/guide/saved_model)
by running `export_saved_model.py`. For example, for `MoViNet-A0-Stream`, run:
```shell
python3 export_saved_model.py \
--model_id=a0 \
--causal=True \
--conv_type=2plus1d \
--se_type=2plus3d \
--activation=hard_swish \
--gating_activation=hard_sigmoid \
--use_positional_encoding=False \
--num_classes=600 \
--batch_size=1 \
--num_frames=1 \
--image_size=172 \
--bundle_input_init_states_fn=False \
--checkpoint_path=/path/to/checkpoint \
--export_path=/tmp/movinet_a0_stream
```
Then the SavedModel can be converted to TF Lite using the [`TFLiteConverter`](https://www.tensorflow.org/lite/convert):
```python
saved_model_dir = '/tmp/movinet_a0_stream'
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
tflite_model = converter.convert()
with open('/tmp/movinet_a0_stream.tflite', 'wb') as f:
f.write(tflite_model)
```
To run the model with TF Lite in Python, use
[tf.lite.Interpreter](https://www.tensorflow.org/lite/guide/inference#load_and_run_a_model_in_python):
```python
# Create the interpreter and signature runner
interpreter = tf.lite.Interpreter('/tmp/movinet_a0_stream.tflite')
signature = interpreter.get_signature_runner()
# Extract state names and create the initial (zero) states
def state_name(name: str) -> str:
return name[len('serving_default_'):-len(':0')]
init_states = {
state_name(x['name']): tf.zeros(x['shape'], dtype=x['dtype'])
for x in interpreter.get_input_details()
}
del init_states['image']
# Insert your video clip here
video = tf.ones([1, 8, 172, 172, 3])
clips = tf.split(video, video.shape[1], axis=1)
# To run on a video, pass in one frame at a time
states = init_states
for clip in clips:
# Input shape: [1, 1, 172, 172, 3]
outputs = signature(**states, image=clip)
logits = outputs.pop('logits')
states = outputs
```
Follow the [official guide](https://www.tensorflow.org/lite/guide) to run a
model with TF Lite on your mobile device.
## Training and Evaluation
Run this command line for continuous training and evaluation.
```shell
MODE=train_and_eval # Can also be 'train'
MODE=train_and_eval # Can also be 'train' if using a separate evaluator job
CONFIG_FILE=official/vision/beta/projects/movinet/configs/yaml/movinet_a0_k600_8x8.yaml
python3 official/vision/beta/projects/movinet/train.py \
--experiment=movinet_kinetics600 \
--mode=${MODE} \
--model_dir=/tmp/movinet/ \
--config_file=${CONFIG_FILE} \
--params_override="" \
--gin_file="" \
--gin_params="" \
--tpu="" \
--tf_data_service=""
--model_dir=/tmp/movinet_a0_base/ \
--config_file=${CONFIG_FILE}
```
Run this command line for evaluation.
......@@ -264,13 +388,8 @@ CONFIG_FILE=official/vision/beta/projects/movinet/configs/yaml/movinet_a0_k600_8
python3 official/vision/beta/projects/movinet/train.py \
--experiment=movinet_kinetics600 \
--mode=${MODE} \
--model_dir=/tmp/movinet/ \
--config_file=${CONFIG_FILE} \
--params_override="" \
--gin_file="" \
--gin_params="" \
--tpu="" \
--tf_data_service=""
--model_dir=/tmp/movinet_a0_base/ \
--config_file=${CONFIG_FILE}
```
## License
......
......@@ -130,6 +130,7 @@ class MovinetModel(video_classification.VideoClassificationModel):
norm_momentum=0.99,
norm_epsilon=1e-3,
use_sync_bn=True)
activation: str = 'swish'
output_states: bool = False
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a0'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a1'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
......
......@@ -15,10 +15,15 @@ task:
movinet:
model_id: 'a2'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.2
dropout_rate: 0.5
train_data:
name: kinetics600
variant_name: rgb
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a3'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a4'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 'a5'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
......@@ -42,7 +47,8 @@ task:
validation_data:
name: kinetics600
feature_shape: !!python/tuple
- 120
# Evaluate on 115 frames instead of 120; otherwise the model runs out of memory (OOM) on TPU
- 115
- 320
- 320
- 3
......
......@@ -15,6 +15,11 @@ task:
movinet:
model_id: 't0'
causal: true
# Note: we train with '3d_2plus1d', but convert to '2plus1d' for inference
conv_type: '3d_2plus1d'
se_type: '2plus3d'
activation: 'hard_swish'
gating_activation: 'hard_sigmoid'
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
......
......@@ -28,6 +28,26 @@ python3 export_saved_model.py \
--checkpoint_path=""
```
Export for TF Lite example:
```shell
python3 export_saved_model.py \
--model_id=a0 \
--causal=True \
--conv_type=2plus1d \
--se_type=2plus3d \
--activation=hard_swish \
--gating_activation=hard_sigmoid \
--use_positional_encoding=False \
--num_classes=600 \
--batch_size=1 \
--num_frames=1 \
--image_size=172 \
--bundle_input_init_states_fn=False \
--checkpoint_path=/path/to/checkpoint \
--export_path=/tmp/movinet_a0_stream
```

Use `--num_frames=1` to export the model for streaming mode (a single frame
per step) and `--image_size=172` to set the input resolution.
To use an exported saved_model, refer to export_saved_model_test.py.
"""
......@@ -79,6 +99,10 @@ flags.DEFINE_integer(
flags.DEFINE_integer(
'image_size', None,
'The resolution of the input. Set to None for dynamic input.')
flags.DEFINE_bool(
'bundle_input_init_states_fn', True,
'Add init_states as a function signature to the saved model. '
'This is not necessary if the input shape is static (e.g., for TF Lite).')
flags.DEFINE_string(
'checkpoint_path', '',
'Checkpoint path to load. Leave blank for default initialization.')
......@@ -97,24 +121,33 @@ def main(_) -> None:
# Use dimensions of 1 except the channels to export faster,
# since we only really need the last dimension to build and get the output
# states. These dimensions will be set to `None` once the model is built.
# states. These dimensions can be set to `None` once the model is built.
input_shape = [1 if s is None else s for s in input_specs.shape]
activation = FLAGS.activation
if activation == 'swish':
# Override swish activation implementation to remove custom gradients
activation = 'simple_swish'
backbone = movinet.Movinet(
FLAGS.model_id,
model_id=FLAGS.model_id,
causal=FLAGS.causal,
use_positional_encoding=FLAGS.use_positional_encoding,
conv_type=FLAGS.conv_type,
use_external_states=FLAGS.causal,
se_type=FLAGS.se_type,
input_specs=input_specs,
activation=FLAGS.activation,
activation=activation,
gating_activation=FLAGS.gating_activation,
se_type=FLAGS.se_type,
use_positional_encoding=FLAGS.use_positional_encoding)
use_sync_bn=False,
use_external_states=FLAGS.causal)
model = movinet_model.MovinetClassifier(
backbone,
num_classes=FLAGS.num_classes,
output_states=FLAGS.causal,
input_specs=dict(image=input_specs))
input_specs=dict(image=input_specs),
# TODO(dankondratyuk): currently set to swish, but will need to
# re-train to use other activations.
activation='simple_swish')
model.build(input_shape)
# Compile model to generate some internal Keras variables.
......@@ -131,7 +164,7 @@ def main(_) -> None:
# with the full output state shapes.
input_image = tf.ones(input_shape)
_, states = model({**model.init_states(input_shape), 'image': input_image})
_, states = model({**states, 'image': input_image})
_ = model({**states, 'image': input_image})
# Create a function to explicitly set the names of the outputs
def predict(inputs):
......@@ -153,7 +186,10 @@ def main(_) -> None:
init_states_fn = init_states_fn.get_concrete_function(
tf.TensorSpec([5], dtype=tf.int32))
signatures = {'call': predict_fn, 'init_states': init_states_fn}
if FLAGS.bundle_input_init_states_fn:
signatures = {'call': predict_fn, 'init_states': init_states_fn}
else:
signatures = predict_fn
tf.keras.models.save_model(
model, FLAGS.export_path, signatures=signatures)
......
......@@ -48,7 +48,7 @@ class ExportSavedModelTest(tf.test.TestCase):
example_input = tf.ones([1, 8, 172, 172, 3])
outputs = model(example_input)
self.assertEqual(outputs.shape, [1, 600])
self.assertAllEqual(outputs.shape, [1, 600])
def test_movinet_export_a0_stream_with_tfhub(self):
saved_model_path = self.get_temp_dir()
......@@ -94,9 +94,55 @@ class ExportSavedModelTest(tf.test.TestCase):
for frame in frames:
outputs, states = model({**states, 'image': frame})
self.assertEqual(outputs.shape, [1, 600])
self.assertAllEqual(outputs.shape, [1, 600])
self.assertNotEmpty(states)
self.assertAllClose(outputs, expected_outputs, 1e-5, 1e-5)
def test_movinet_export_a0_stream_with_tflite(self):
saved_model_path = self.get_temp_dir()
FLAGS.export_path = saved_model_path
FLAGS.model_id = 'a0'
FLAGS.causal = True
FLAGS.conv_type = '2plus1d'
FLAGS.se_type = '2plus3d'
FLAGS.activation = 'hard_swish'
FLAGS.gating_activation = 'hard_sigmoid'
FLAGS.use_positional_encoding = False
FLAGS.num_classes = 600
FLAGS.batch_size = 1
FLAGS.num_frames = 1
FLAGS.image_size = 172
FLAGS.bundle_input_init_states_fn = False
export_saved_model.main('unused_args')
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
tflite_model = converter.convert()
interpreter = tf.lite.Interpreter(model_content=tflite_model)
signature = interpreter.get_signature_runner()
def state_name(name: str) -> str:
return name[len('serving_default_'):-len(':0')]
init_states = {
state_name(x['name']): tf.zeros(x['shape'], dtype=x['dtype'])
for x in interpreter.get_input_details()
}
del init_states['image']
video = tf.ones([1, 8, 172, 172, 3])
clips = tf.split(video, video.shape[1], axis=1)
states = init_states
for clip in clips:
outputs = signature(**states, image=clip)
logits = outputs.pop('logits')
states = outputs
self.assertAllEqual(logits.shape, [1, 600])
self.assertNotEmpty(states)
if __name__ == '__main__':
tf.test.main()