Commit 28cbb02d authored by Abdullah Rashwan, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 328803102
parent 5a1bce51
...@@ -4,4 +4,5 @@
from official.modeling.optimization.configs.learning_rate_config import *
from official.modeling.optimization.configs.optimization_config import *
from official.modeling.optimization.configs.optimizer_config import *
from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage
from official.modeling.optimization.optimizer_factory import OptimizerFactory
...@@ -91,9 +91,12 @@ class OptimizationConfig(base_config.Config):
Attributes:
optimizer: optimizer oneof config.
ema: optional exponential moving average optimizer config; if specified, the
EMA optimizer will be used.
learning_rate: learning rate oneof config.
warmup: warmup oneof config.
"""
optimizer: OptimizerConfig = OptimizerConfig()
ema: Optional[opt_cfg.EMAConfig] = None
learning_rate: LrConfig = LrConfig()
warmup: WarmupConfig = WarmupConfig()
...@@ -136,3 +136,19 @@ class LAMBConfig(base_config.Config):
weight_decay_rate: float = 0.0
exclude_from_weight_decay: Optional[List[str]] = None
exclude_from_layer_adaptation: Optional[List[str]] = None
@dataclasses.dataclass
class EMAConfig(base_config.Config):
"""Exponential moving average optimizer config.
Attributes:
name: 'str', name of the optimizer.
average_decay: 'float', average decay value.
start_step: 'int', start step to apply moving average.
dynamic_decay: 'bool', whether to apply dynamic decay or not.
"""
name: str = "ExponentialMovingAverage"
average_decay: float = 0.99
start_step: int = 0
dynamic_decay: bool = True
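For intuition, these fields control the decay schedule implemented in `ExponentialMovingAverage.update_average` below: the decay is zero before `start_step`, and with `dynamic_decay=True` it ramps from roughly 0.1 toward `average_decay` as updates accumulate. A minimal pure-Python sketch of that schedule (illustration only, not part of the config):

```python
def ema_decay_at_step(step: int,
                      average_decay: float = 0.99,
                      start_step: int = 0,
                      dynamic_decay: bool = True) -> float:
  """Mirrors the decay rule in ExponentialMovingAverage.update_average."""
  if step < start_step:
    return 0.0  # No averaging before start_step.
  if not dynamic_decay:
    return average_decay
  k = step - start_step
  # Ramp (1 + k) / (10 + k), capped at average_decay.
  return min(average_decay, (1.0 + k) / (10.0 + k))

# With the defaults: step 0 -> 0.1, step 9 -> ~0.53, step 89 -> ~0.91,
# and the 0.99 cap is reached at step 890.
```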
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Exponential moving average optimizer."""
from typing import Text, List
import tensorflow as tf
# pylint: disable=protected-access
class ExponentialMovingAverage(tf.keras.optimizers.Optimizer):
"""Optimizer that computes an exponential moving average of the variables.
Empirically it has been found that using the moving average of the trained
parameters of a deep network is better than using its trained parameters
directly. This optimizer allows you to compute this moving average and swap
the variables at save time so that any code outside of the training loop
will use by default the average values instead of the original ones.
Example of usage for training:
```python
opt = tf.keras.optimizers.SGD(learning_rate)
opt = ExponentialMovingAverage(opt)
opt.shadow_copy(model)
```
At test time, swap the shadow variables to evaluate on the averaged weights:
```python
opt.swap_weights()
# Test eval the model here
opt.swap_weights()
```
"""
def __init__(self,
optimizer: tf.keras.optimizers.Optimizer,
average_decay: float = 0.99,
start_step: int = 0,
dynamic_decay: bool = True,
name: Text = 'ExponentialMovingAverage',
**kwargs):
"""Construct a new ExponentialMovingAverage optimizer.
Args:
optimizer: `tf.keras.optimizers.Optimizer` that will be
used to compute and apply gradients.
average_decay: float. Decay to use to maintain the moving averages
of trained variables.
start_step: int. What step to start the moving average.
dynamic_decay: bool. Whether to change the decay based on the number
of optimizer updates. Decay will start at 0.1 and gradually increase
up to `average_decay` after each optimizer update. This behavior is
similar to `tf.train.ExponentialMovingAverage` in TF 1.x.
name: Optional name for the operations created when applying
gradients. Defaults to "ExponentialMovingAverage".
**kwargs: keyword arguments. Allowed to be {`clipnorm`,
`clipvalue`, `lr`, `decay`}.
"""
super(ExponentialMovingAverage, self).__init__(name, **kwargs)
self._average_decay = average_decay
self._start_step = tf.constant(start_step, tf.float32)
self._dynamic_decay = dynamic_decay
self._optimizer = optimizer
self._track_trackable(self._optimizer, 'base_optimizer')
# Shadow copies are created lazily by `shadow_copy()`; start as None so
# `has_shadow_copy` can be checked before they exist.
self._average_weights = None
self._model_weights = None
def shadow_copy(self, model: tf.keras.Model):
"""Creates shadow variables for the given model weights."""
for var in model.weights:
self.add_slot(var, 'average', initializer='zeros')
self._average_weights = [
self.get_slot(var, 'average') for var in model.weights
]
self._model_weights = model.weights
@property
def has_shadow_copy(self):
"""Whether this optimizer has created shadow variables."""
return self._model_weights is not None
def _create_slots(self, var_list):
self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access
def apply_gradients(self, grads_and_vars, name: Text = None):
result = self._optimizer.apply_gradients(grads_and_vars, name)
self.update_average(self.iterations)
return result
@tf.function
def update_average(self, step: tf.Tensor):
step = tf.cast(step, tf.float32)
if step < self._start_step:
decay = tf.constant(0., tf.float32)
elif self._dynamic_decay:
decay = step - self._start_step
decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay))
else:
decay = self._average_decay
def _apply_moving(v_moving, v_normal):
diff = v_moving - v_normal
v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff)
return v_moving
def _update(strategy, v_moving_and_v_normal):
for v_moving, v_normal in v_moving_and_v_normal:
strategy.extended.update(v_moving, _apply_moving, args=(v_normal,))
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(_update, args=(zip(self._average_weights,
self._model_weights),))
def swap_weights(self):
"""Swap the average and moving weights.
This is a convenience method to allow one to evaluate the averaged weights
at test time. Loads the weights stored in `self._average_weights` into the model,
keeping a copy of the original model weights. Swapping twice will return
the original weights.
"""
if tf.distribute.in_cross_replica_context():
strategy = tf.distribute.get_strategy()
strategy.run(self._swap_weights, args=())
else:
raise ValueError('Swapping weights must occur under a '
'tf.distribute.Strategy')
@tf.function
def _swap_weights(self):
def fn_0(a, b):
a.assign_add(b)
return a
def fn_1(b, a):
b.assign(a - b)
return b
def fn_2(a, b):
a.assign_sub(b)
return a
def swap(strategy, a_and_b):
"""Swap `a` and `b` and mirror to all devices."""
for a, b in a_and_b:
strategy.extended.update(a, fn_0, args=(b,)) # a = a + b
strategy.extended.update(b, fn_1, args=(a,)) # b = a - b
strategy.extended.update(a, fn_2, args=(b,)) # a = a - b
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(
swap, args=(zip(self._average_weights, self._model_weights),))
def assign_average_vars(self, var_list: List[tf.Variable]):
"""Assign variables in var_list with their respective averages.
Args:
var_list: List of model variables to be assigned to their average.
Returns:
assign_op: The op corresponding to the assignment operation of
variables to their average.
"""
assign_op = tf.group([
var.assign(self.get_slot(var, 'average')) for var in var_list
if var.trainable
])
return assign_op
def _create_hypers(self):
self._optimizer._create_hypers() # pylint: disable=protected-access
def _prepare(self, var_list):
return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access
@property
def iterations(self):
return self._optimizer.iterations
@iterations.setter
def iterations(self, variable):
self._optimizer.iterations = variable
@property
def weights(self):
# return self._weights + self._optimizer.weights
return self._optimizer.weights
def variables(self):
return self._weights + [self.iterations]
@property
def lr(self):
return self._optimizer._get_hyper('learning_rate')
@lr.setter
def lr(self, lr):
self._optimizer._set_hyper('learning_rate', lr)
@property
def learning_rate(self):
return self._optimizer._get_hyper('learning_rate')
@learning_rate.setter
def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name
self._optimizer._set_hyper('learning_rate', learning_rate)
def _resource_apply_dense(self, grad, var):
return self._optimizer._resource_apply_dense(grad, var)
def _resource_apply_sparse(self, grad, var, indices):
return self._optimizer._resource_apply_sparse(grad, var, indices)
def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
return self._optimizer._resource_apply_sparse_duplicate_indices(
grad, var, indices)
def get_config(self):
config = {
'optimizer': tf.keras.optimizers.serialize(self._optimizer),
'average_decay': self._average_decay,
'start_step': self._start_step,
'dynamic_decay': self._dynamic_decay,
}
base_config = super(ExponentialMovingAverage, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@classmethod
def from_config(cls, config, custom_objects=None):
optimizer = tf.keras.optimizers.deserialize(
config.pop('optimizer'),
custom_objects=custom_objects,
)
return cls(optimizer, **config)
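Putting the pieces together, a minimal end-to-end sketch of training with this wrapper might look like the following. The model, data, and loss here are placeholders chosen for illustration; the optimizer calls (`shadow_copy`, `apply_gradients`, `swap_weights`, `assign_average_vars`) are the ones defined above.

```python
import tensorflow as tf
from official.modeling.optimization import ema_optimizer

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])  # placeholder model
model.build((None, 8))
optimizer = ema_optimizer.ExponentialMovingAverage(
    tf.keras.optimizers.SGD(0.1), average_decay=0.99)
optimizer.shadow_copy(model)  # creates the 'average' slot for every weight

@tf.function
def train_step(x, y):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
  grads = tape.gradient(loss, model.trainable_variables)
  # apply_gradients also refreshes the shadow averages via update_average().
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return loss

# ... run train_step over the training data ...

optimizer.swap_weights()   # evaluate on the averaged weights
# ... run evaluation ...
optimizer.swap_weights()   # restore the training weights

# Before exporting, copy the averages into the model variables permanently.
optimizer.assign_average_vars(model.variables)
```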
...@@ -16,10 +16,11 @@
"""Optimizer factory class."""
from typing import Union
import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg
from official.nlp import optimization as nlp_optimization
...@@ -89,7 +90,10 @@ class OptimizerFactory(object):
self._optimizer_config = config.optimizer.get()
self._optimizer_type = config.optimizer.type
self._use_ema = config.ema is not None
self._ema_config = config.ema
if self._optimizer_config is None:
raise ValueError('Optimizer type must be specified')
self._lr_config = config.learning_rate.get()
...@@ -143,4 +147,9 @@ class OptimizerFactory(object):
optimizer_dict['learning_rate'] = lr
optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
if self._use_ema:
optimizer = ema_optimizer.ExponentialMovingAverage(
optimizer, **self._ema_config.as_dict())
return optimizer
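With the factory wiring above, enabling EMA becomes a config-only change: when `config.ema` is set, the built optimizer is wrapped in `ExponentialMovingAverage`. A sketch of how this might be driven; the exact oneof keys (`'sgd'`, `'stepwise'`) and the `build_learning_rate`/`build_optimizer` method names are assumptions about this version of the API rather than facts from this diff:

```python
from official.modeling import optimization

config = optimization.OptimizationConfig({
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'ema': {'average_decay': 0.9999, 'start_step': 0},
    'learning_rate': {
        'type': 'stepwise',
        'stepwise': {'boundaries': [10000], 'values': [0.1, 0.01]}
    },
})
factory = optimization.OptimizerFactory(config)
lr = factory.build_learning_rate()
optimizer = factory.build_optimizer(lr)
# Because config.ema is set, `optimizer` is an ExponentialMovingAverage
# wrapping the underlying SGD optimizer.
```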
...@@ -25,8 +25,8 @@ from typing import Any, List, MutableMapping, Text
from absl import logging
import tensorflow as tf
from official.modeling import optimization
from official.utils.misc import keras_utils
from official.vision.image_classification import optimizer_factory
def get_callbacks(model_checkpoint: bool = True,
...@@ -165,7 +165,7 @@ class CustomTensorBoard(tf.keras.callbacks.TensorBoard):
class MovingAverageCallback(tf.keras.callbacks.Callback):
"""A Callback to be used with an `ExponentialMovingAverage` optimizer.
Applies moving average weights to the model during validation time to test
and predict on the averaged weights rather than the current model weights.
...@@ -184,7 +184,8 @@ class MovingAverageCallback(tf.keras.callbacks.Callback):
def set_model(self, model: tf.keras.Model):
super(MovingAverageCallback, self).set_model(model)
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
self.model.optimizer.shadow_copy(self.model)
def on_test_begin(self, logs: MutableMapping[Text, Any] = None):
...@@ -225,13 +226,14 @@ class AverageModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
save_weights_only, mode, save_freq, **kwargs)
def set_model(self, model):
if not isinstance(model.optimizer, optimization.ExponentialMovingAverage):
raise TypeError('AverageModelCheckpoint is only used when training '
'with MovingAverage')
return super().set_model(model)
def _save_model(self, epoch, logs):
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
if self.update_weights:
self.model.optimizer.assign_average_vars(self.model.variables)
...
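For context, these callbacks are attached to a Keras `fit()` loop so that validation and checkpointing see the averaged weights. The sketch below assumes `MovingAverageCallback` takes no required constructor arguments and that `AverageModelCheckpoint` accepts `update_weights` plus the usual `ModelCheckpoint` arguments; both signatures are assumptions, not shown in this diff:

```python
from official.vision.image_classification import callbacks as cb

# `model` is assumed to be a compiled tf.keras.Model whose optimizer is an
# optimization.ExponentialMovingAverage wrapper (see the set_model checks above).
fit_callbacks = [
    cb.MovingAverageCallback(),  # swaps in averaged weights around evaluation
    cb.AverageModelCheckpoint(
        update_weights=True,     # copy the EMA values into the model at save time
        filepath='/tmp/ema-{epoch:02d}.ckpt',
        save_weights_only=True),
]
# model.fit(train_ds, validation_data=val_ds, callbacks=fit_callbacks)
```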
...@@ -18,241 +18,19 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from typing import Any, Dict, Text
from absl import logging
import tensorflow as tf
import tensorflow_addons as tfa
from official.modeling import optimization
from official.vision.image_classification import learning_rate
from official.vision.image_classification.configs import base_configs
# pylint: disable=protected-access
class MovingAverage(tf.keras.optimizers.Optimizer):
"""Optimizer that computes a moving average of the variables.
Empirically it has been found that using the moving average of the trained
parameters of a deep network is better than using its trained parameters
directly. This optimizer allows you to compute this moving average and swap
the variables at save time so that any code outside of the training loop
will use by default the average values instead of the original ones.
Example of usage for training:
```python
opt = tf.keras.optimizers.SGD(learning_rate)
opt = MovingAverage(opt)
opt.shadow_copy(model)
```
At test time, swap the shadow variables to evaluate on the averaged weights:
```python
opt.swap_weights()
# Test eval the model here
opt.swap_weights()
```
"""
def __init__(self,
optimizer: tf.keras.optimizers.Optimizer,
average_decay: float = 0.99,
start_step: int = 0,
dynamic_decay: bool = True,
name: Text = 'moving_average',
**kwargs):
"""Construct a new MovingAverage optimizer.
Args:
optimizer: `tf.keras.optimizers.Optimizer` that will be used to compute
and apply gradients.
average_decay: float. Decay to use to maintain the moving averages of
trained variables.
start_step: int. What step to start the moving average.
dynamic_decay: bool. Whether to change the decay based on the number of
optimizer updates. Decay will start at 0.1 and gradually increase up to
`average_decay` after each optimizer update. This behavior is similar to
`tf.train.ExponentialMovingAverage` in TF 1.x.
name: Optional name for the operations created when applying gradients.
Defaults to "moving_average".
**kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
`decay`}.
"""
super(MovingAverage, self).__init__(name, **kwargs)
self._optimizer = optimizer
self._average_decay = average_decay
self._start_step = tf.constant(start_step, tf.float32)
self._dynamic_decay = dynamic_decay
def shadow_copy(self, model: tf.keras.Model):
"""Creates shadow variables for the given model weights."""
for var in model.weights:
self.add_slot(var, 'average', initializer='zeros')
self._average_weights = [
self.get_slot(var, 'average') for var in model.weights
]
self._model_weights = model.weights
@property
def has_shadow_copy(self):
"""Whether this optimizer has created shadow variables."""
return self._model_weights is not None
def _create_slots(self, var_list):
self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access
def apply_gradients(self, grads_and_vars, name: Text = None):
result = self._optimizer.apply_gradients(grads_and_vars, name)
self.update_average(self._optimizer.iterations)
return result
@tf.function
def update_average(self, step: tf.Tensor):
step = tf.cast(step, tf.float32)
if step < self._start_step:
decay = tf.constant(0., tf.float32)
elif self._dynamic_decay:
decay = step - self._start_step
decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay))
else:
decay = self._average_decay
def _apply_moving(v_moving, v_normal):
diff = v_moving - v_normal
v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff)
return v_moving
def _update(strategy, v_moving_and_v_normal):
for v_moving, v_normal in v_moving_and_v_normal:
strategy.extended.update(v_moving, _apply_moving, args=(v_normal,))
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(
_update, args=(zip(self._average_weights, self._model_weights),))
def swap_weights(self):
"""Swap the average and moving weights.
This is a convenience method to allow one to evaluate the averaged weights
at test time. Loads the weights stored in `self._average` into the model,
keeping a copy of the original model weights. Swapping twice will return
the original weights.
"""
if tf.distribute.in_cross_replica_context():
strategy = tf.distribute.get_strategy()
strategy.run(self._swap_weights, args=())
else:
raise ValueError('Swapping weights must occur under a '
'tf.distribute.Strategy')
@tf.function
def _swap_weights(self):
def fn_0(a, b):
a.assign_add(b)
return a
def fn_1(b, a):
b.assign(a - b)
return b
def fn_2(a, b):
a.assign_sub(b)
return a
def swap(strategy, a_and_b):
"""Swap `a` and `b` and mirror to all devices."""
for a, b in a_and_b:
strategy.extended.update(a, fn_0, args=(b,)) # a = a + b
strategy.extended.update(b, fn_1, args=(a,)) # b = a - b
strategy.extended.update(a, fn_2, args=(b,)) # a = a - b
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(
swap, args=(zip(self._average_weights, self._model_weights),))
def assign_average_vars(self, var_list: List[tf.Variable]):
"""Assign variables in var_list with their respective averages.
Args:
var_list: List of model variables to be assigned to their average.
Returns:
assign_op: The op corresponding to the assignment operation of
variables to their average.
"""
assign_op = tf.group([
var.assign(self.get_slot(var, 'average'))
for var in var_list
if var.trainable
])
return assign_op
def _create_hypers(self):
self._optimizer._create_hypers() # pylint: disable=protected-access
def _prepare(self, var_list):
return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access
@property
def iterations(self):
return self._optimizer.iterations
@iterations.setter
def iterations(self, variable):
self._optimizer.iterations = variable
@property
def weights(self):
# return self._weights + self._optimizer.weights
return self._optimizer.weights
@property
def lr(self):
return self._optimizer._get_hyper('learning_rate')
@lr.setter
def lr(self, lr):
self._optimizer._set_hyper('learning_rate', lr)
@property
def learning_rate(self):
return self._optimizer._get_hyper('learning_rate')
@learning_rate.setter
def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name
self._optimizer._set_hyper('learning_rate', learning_rate)
def _resource_apply_dense(self, grad, var):
return self._optimizer._resource_apply_dense(grad, var)
def _resource_apply_sparse(self, grad, var, indices):
return self._optimizer._resource_apply_sparse(grad, var, indices)
def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
return self._optimizer._resource_apply_sparse_duplicate_indices(
grad, var, indices)
def get_config(self):
config = {
'optimizer': tf.keras.optimizers.serialize(self._optimizer),
'average_decay': self._average_decay,
'start_step': self._start_step,
'dynamic_decay': self._dynamic_decay,
}
base_config = super(MovingAverage, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@classmethod
def from_config(cls, config, custom_objects=None):
optimizer = tf.keras.optimizers.deserialize(
config.pop('optimizer'),
custom_objects=custom_objects,
)
return cls(optimizer, **config)
def build_optimizer(
optimizer_name: Text,
base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
...@@ -269,7 +47,7 @@ def build_optimizer(
should contain optimizer specific parameters such as `base_learning_rate`,
`decay`, etc.
model: The `tf.keras.Model`. This is used for the shadow copy if using
`ExponentialMovingAverage`.
Returns:
A tf.keras.Optimizer.
...@@ -336,9 +114,10 @@ def build_optimizer(
moving_average_decay = params.get('moving_average_decay', 0.)
if moving_average_decay is not None and moving_average_decay > 0.:
if model is None:
raise ValueError(
'`model` must be provided if using `ExponentialMovingAverage`.')
logging.info('Including moving average decay.')
optimizer = optimization.ExponentialMovingAverage(
optimizer=optimizer, average_decay=moving_average_decay)
optimizer.shadow_copy(model)
return optimizer
...
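In the image-classification path above, EMA is enabled through the `moving_average_decay` entry of the optimizer params rather than through `OptimizationConfig`. A hedged sketch; the optimizer name `'momentum'` and the specific param keys are assumptions about this file's conventions, while the `moving_average_decay > 0` trigger and the `model` requirement come from the code shown above:

```python
import tensorflow as tf
from official.vision.image_classification import optimizer_factory

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])  # placeholder model
model.build((None, 8))
lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[10000], values=[0.1, 0.01])
optimizer = optimizer_factory.build_optimizer(
    optimizer_name='momentum',   # assumed name; check the mapping in this file
    base_learning_rate=lr,
    params={'momentum': 0.9,
            'nesterov': False,   # assumed keys for the momentum branch
            'moving_average_decay': 0.9999},
    model=model)                 # required so shadow_copy(model) can run
```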