Commit fd45760c authored by Dan Kondratyuk's avatar Dan Kondratyuk Committed by A. Unique TensorFlower
Browse files

Improve MoViNet-Base and MoViNet-Stream training, and add a4 stream.

PiperOrigin-RevId: 374236760
parent 102f267e
# Video classification on Kinetics-600 using MoViNet-A0 backbone.
# --experiment_type=movinet_kinetics600
# Achieves 71.65% Top-1 accuracy.
# http://mldash/experiments/4591693621833944103
# Achieves 72.28% Top-1 accuracy.
# http://mldash/experiments/2112621422911359474
runtime:
distribution_strategy: 'tpu'
......@@ -17,7 +17,7 @@ task:
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
dropout_rate: 0.2
train_data:
name: kinetics600
variant_name: rgb
......
# Video classification on Kinetics-600 using MoViNet-A0-Stream backbone.
# --experiment_type=movinet_kinetics600
# Achieves 69.56% Top-1 accuracy.
# http://mldash/experiments/6696393165423234453
# Achieves 72.03% Top-1 accuracy.
# http://mldash/experiments/7841061381580044300
runtime:
distribution_strategy: 'tpu'
......@@ -18,7 +18,7 @@ task:
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
dropout_rate: 0.2
train_data:
name: kinetics600
variant_name: rgb
......@@ -37,7 +37,6 @@ task:
aug_max_aspect_ratio: 2.0
aug_min_area_ratio: 0.08
aug_min_aspect_ratio: 0.5
aug_type: 'autoaug'
validation_data:
name: kinetics600
feature_shape: !!python/tuple
......
# Video classification on Kinetics-600 using MoViNet-A1 backbone.
# --experiment_type=movinet_kinetics600
# Achieves 76.63% Top-1 accuracy.
# Achieves 76.69% Top-1 accuracy.
# http://mldash/experiments/6004897086445740406
runtime:
......
# Video classification on Kinetics-600 using MoViNet-A1-Stream backbone.
# --experiment_type=movinet_kinetics600
# Achieves x% Top-1 accuracy.
# http://mldash/experiments/
# Achieves 76.45% Top-1 accuracy.
# http://mldash/experiments/2106053499367982379
runtime:
distribution_strategy: 'tpu'
......@@ -15,10 +15,10 @@ task:
movinet:
model_id: 'a1'
causal: true
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
stochastic_depth_rate: 0.2
dropout_rate: 0.2
train_data:
name: kinetics600
variant_name: rgb
......@@ -37,7 +37,6 @@ task:
aug_max_aspect_ratio: 2.0
aug_min_area_ratio: 0.08
aug_min_aspect_ratio: 0.5
aug_type: 'autoaug'
validation_data:
name: kinetics600
feature_shape: !!python/tuple
......
......@@ -15,10 +15,10 @@ task:
movinet:
model_id: 'a2'
causal: true
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
stochastic_depth_rate: 0.2
dropout_rate: 0.2
train_data:
name: kinetics600
variant_name: rgb
......@@ -37,7 +37,6 @@ task:
aug_max_aspect_ratio: 2.0
aug_min_area_ratio: 0.08
aug_min_aspect_ratio: 0.5
aug_type: 'autoaug'
validation_data:
name: kinetics600
feature_shape: !!python/tuple
......
# Video classification on Kinetics-600 using MoViNet-A3-Stream backbone.
# --experiment_type=movinet_kinetics600
# Achieves x% Top-1 accuracy.
# http://mldash/experiments/
# Achieves 80.09% Top-1 accuracy.
# http://mldash/experiments/8515953265355959123
runtime:
distribution_strategy: 'tpu'
......@@ -14,10 +14,12 @@ task:
backbone:
movinet:
model_id: 'a3'
causal: true
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
stochastic_depth_rate: 0.2
train_data:
name: kinetics600
variant_name: rgb
......@@ -36,7 +38,6 @@ task:
aug_max_aspect_ratio: 2.0
aug_min_area_ratio: 0.08
aug_min_aspect_ratio: 0.5
aug_type: 'autoaug'
validation_data:
name: kinetics600
feature_shape: !!python/tuple
......
# Video classification on Kinetics-600 using MoViNet-A4 backbone.
# --experiment_type=movinet_kinetics600
# Achieves 81.33% Top-1 accuracy.
# http://mldash/experiments/3621454183108305685
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
losses:
l2_weight_decay: 0.00003
label_smoothing: 0.1
model:
backbone:
movinet:
model_id: 'a4'
causal: true
use_positional_encoding: true
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
train_data:
name: kinetics600
variant_name: rgb
feature_shape: !!python/tuple
- 32
- 290
- 290
- 3
temporal_stride: 3
random_stride_range: 1
global_batch_size: 1024
dtype: 'bfloat16'
shuffle_buffer_size: 1024
min_image_size: 320
aug_max_area_ratio: 1.0
aug_max_aspect_ratio: 2.0
aug_min_area_ratio: 0.08
aug_min_aspect_ratio: 0.5
aug_type: 'autoaug'
validation_data:
name: kinetics600
feature_shape: !!python/tuple
- 80
- 290
- 290
- 3
temporal_stride: 3
num_test_clips: 1
num_test_crops: 1
global_batch_size: 64
min_image_size: 320
dtype: 'bfloat16'
drop_remainder: false
trainer:
optimizer_config:
learning_rate:
cosine:
initial_learning_rate: 1.8
decay_steps: 85785
warmup:
linear:
warmup_steps: 2145
optimizer:
type: 'rmsprop'
rmsprop:
rho: 0.9
momentum: 0.9
epsilon: 1.0
clipnorm: 1.0
train_steps: 85785
steps_per_loop: 500
summary_interval: 500
validation_interval: 500
......@@ -36,7 +36,7 @@ task:
aug_max_aspect_ratio: 2.0
aug_min_area_ratio: 0.08
aug_min_aspect_ratio: 0.5
aug_type: 'randaug'
aug_type: 'autoaug'
validation_data:
name: kinetics600
feature_shape: !!python/tuple
......
......@@ -17,7 +17,7 @@ task:
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
dropout_rate: 0.2
train_data:
name: kinetics600
variant_name: rgb
......
# Video classification on Kinetics-600 using MoViNet-T0-Stream backbone.
# --experiment_type=movinet_kinetics600
# Achieves x% Top-1 accuracy.
# http://mldash/experiments/
# Achieves 67.17% Top-1 accuracy.
# http://mldash/experiments/3540709722174127508
runtime:
distribution_strategy: 'tpu'
......@@ -14,10 +14,11 @@ task:
backbone:
movinet:
model_id: 't0'
causal: true
stochastic_depth_drop_rate: 0.2
norm_activation:
use_sync_bn: true
dropout_rate: 0.5
stochastic_depth_rate: 0.2
dropout_rate: 0.2
train_data:
name: kinetics600
variant_name: rgb
......
......@@ -294,8 +294,8 @@ class ConvBlock(tf.keras.layers.Layer):
own batch norm and activation. '3d_2plus1d' is like '2plus1d', but
uses two sequential 3D ops instead.
use_positional_encoding: add a positional encoding before the temporal
convolution. Assumes `use_2plus1d=True` and `kernel_size[0] > 1`.
Otherwise, this argument is ignored.
convolution. Assumes `kernel_size[0] > 1`. Otherwise, this argument
is ignored.
use_buffered_input: if True, the input is expected to be padded
beforehand. In effect, calling this layer will use 'valid' padding on
the temporal dimension to simulate 'causal' padding.
......@@ -426,7 +426,7 @@ class ConvBlock(tf.keras.layers.Layer):
use_buffered_input=self._use_buffered_input,
name='conv3d')
if self._use_positional_encoding and self._conv_temporal is not None:
if self._use_positional_encoding and self._kernel_size[0] > 1:
self._pos_encoding = nn_layers.PositionalEncoding()
else:
self._pos_encoding = None
......@@ -451,7 +451,7 @@ class ConvBlock(tf.keras.layers.Layer):
"""Calls the layer with the given inputs."""
x = inputs
if self._pos_encoding is not None:
if self._pos_encoding is not None and self._conv_temporal is None:
x = self._pos_encoding(x)
x = self._conv(x)
......
......@@ -162,5 +162,7 @@ def build_movinet_model(
num_classes=num_classes,
kernel_regularizer=l2_regularizer,
input_specs=input_specs_dict,
dropout_rate=model_config.dropout_rate)
dropout_rate=model_config.dropout_rate,
output_states=model_config.output_states)
return model
......@@ -15,7 +15,6 @@
# Lint as: python3
"""Tests for movinet_model.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
......
......@@ -15,7 +15,6 @@
# Lint as: python3
"""Tests for movinet.py."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment