Commit 28cbb02d authored by Abdullah Rashwan, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 328803102
parent 5a1bce51
...@@ -4,4 +4,5 @@
from official.modeling.optimization.configs.learning_rate_config import *
from official.modeling.optimization.configs.optimization_config import *
from official.modeling.optimization.configs.optimizer_config import *
from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage
from official.modeling.optimization.optimizer_factory import OptimizerFactory
...@@ -91,9 +91,12 @@ class OptimizationConfig(base_config.Config):
Attributes:
optimizer: optimizer oneof config.
ema: optional exponential moving average optimizer config; if specified, the
EMA optimizer will be used.
learning_rate: learning rate oneof config.
warmup: warmup oneof config.
"""
optimizer: OptimizerConfig = OptimizerConfig()
ema: Optional[opt_cfg.EMAConfig] = None
learning_rate: LrConfig = LrConfig()
warmup: WarmupConfig = WarmupConfig()
...@@ -136,3 +136,19 @@ class LAMBConfig(base_config.Config):
weight_decay_rate: float = 0.0
exclude_from_weight_decay: Optional[List[str]] = None
exclude_from_layer_adaptation: Optional[List[str]] = None
@dataclasses.dataclass
class EMAConfig(base_config.Config):
"""Exponential moving average optimizer config.
Attributes:
name: 'str', name of the optimizer.
average_decay: 'float', average decay value.
start_step: 'int', start step to apply moving average.
dynamic_decay: 'bool', whether to apply dynamic decay or not.
"""
name: str = "ExponentialMovingAverage"
average_decay: float = 0.99
start_step: int = 0
dynamic_decay: bool = True
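For intuition, these fields control the decay schedule implemented in `ExponentialMovingAverage.update_average` below: the decay is zero before `start_step`, and with `dynamic_decay=True` it ramps from roughly 0.1 toward `average_decay` as updates accumulate. A minimal pure-Python sketch of that schedule (illustration only, not part of the config):

```python
def ema_decay_at_step(step: int,
                      average_decay: float = 0.99,
                      start_step: int = 0,
                      dynamic_decay: bool = True) -> float:
  """Mirrors the decay rule in ExponentialMovingAverage.update_average."""
  if step < start_step:
    return 0.0  # No averaging before start_step.
  if not dynamic_decay:
    return average_decay
  k = step - start_step
  # Ramp (1 + k) / (10 + k), capped at average_decay.
  return min(average_decay, (1.0 + k) / (10.0 + k))

# With the defaults: step 0 -> 0.1, step 9 -> ~0.53, step 89 -> ~0.91,
# and the 0.99 cap is reached at step 890.
```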
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Exponential moving average optimizer."""
from typing import Text, List
import tensorflow as tf
# pylint: disable=protected-access
class ExponentialMovingAverage(tf.keras.optimizers.Optimizer):
"""Optimizer that computes an exponential moving average of the variables.
Empirically it has been found that using the moving average of the trained
parameters of a deep network is better than using its trained parameters
directly. This optimizer allows you to compute this moving average and swap
the variables at save time so that any code outside of the training loop
will use by default the average values instead of the original ones.
Example of usage for training:
```python
opt = tf.keras.optimizers.SGD(learning_rate)
opt = ExponentialMovingAverage(opt)
opt.shadow_copy(model)
```
At test time, swap the shadow variables to evaluate on the averaged weights:
```python
opt.swap_weights()
# Test eval the model here
opt.swap_weights()
```
"""
def __init__(self,
optimizer: tf.keras.optimizers.Optimizer,
average_decay: float = 0.99,
start_step: int = 0,
dynamic_decay: bool = True,
name: Text = 'ExponentialMovingAverage',
**kwargs):
"""Construct a new ExponentialMovingAverage optimizer.
Args:
optimizer: `tf.keras.optimizers.Optimizer` that will be
used to compute and apply gradients.
average_decay: float. Decay to use to maintain the moving averages
of trained variables.
start_step: int. What step to start the moving average.
dynamic_decay: bool. Whether to change the decay based on the number
of optimizer updates. Decay will start at 0.1 and gradually increase
up to `average_decay` after each optimizer update. This behavior is
similar to `tf.train.ExponentialMovingAverage` in TF 1.x.
name: Optional name for the operations created when applying
gradients. Defaults to "ExponentialMovingAverage".
**kwargs: keyword arguments. Allowed to be {`clipnorm`,
`clipvalue`, `lr`, `decay`}.
"""
super(ExponentialMovingAverage, self).__init__(name, **kwargs)
self._average_decay = average_decay
self._start_step = tf.constant(start_step, tf.float32)
self._dynamic_decay = dynamic_decay
self._optimizer = optimizer
self._track_trackable(self._optimizer, 'base_optimizer')
# Shadow copies are created lazily by `shadow_copy()`; start as None so
# `has_shadow_copy` can be checked before they exist.
self._average_weights = None
self._model_weights = None
def shadow_copy(self, model: tf.keras.Model):
"""Creates shadow variables for the given model weights."""
for var in model.weights:
self.add_slot(var, 'average', initializer='zeros')
self._average_weights = [
self.get_slot(var, 'average') for var in model.weights
]
self._model_weights = model.weights
@property
def has_shadow_copy(self):
"""Whether this optimizer has created shadow variables."""
return self._model_weights is not None
def _create_slots(self, var_list):
self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access
def apply_gradients(self, grads_and_vars, name: Text = None):
result = self._optimizer.apply_gradients(grads_and_vars, name)
self.update_average(self.iterations)
return result
@tf.function
def update_average(self, step: tf.Tensor):
step = tf.cast(step, tf.float32)
if step < self._start_step:
decay = tf.constant(0., tf.float32)
elif self._dynamic_decay:
decay = step - self._start_step
decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay))
else:
decay = self._average_decay
def _apply_moving(v_moving, v_normal):
diff = v_moving - v_normal
v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff)
return v_moving
def _update(strategy, v_moving_and_v_normal):
for v_moving, v_normal in v_moving_and_v_normal:
strategy.extended.update(v_moving, _apply_moving, args=(v_normal,))
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(_update, args=(zip(self._average_weights,
self._model_weights),))
def swap_weights(self):
"""Swap the average and moving weights.
This is a convenience method to allow one to evaluate the averaged weights
at test time. Loads the weights stored in `self._average_weights` into the model,
keeping a copy of the original model weights. Swapping twice will return
the original weights.
"""
if tf.distribute.in_cross_replica_context():
strategy = tf.distribute.get_strategy()
strategy.run(self._swap_weights, args=())
else:
raise ValueError('Swapping weights must occur under a '
'tf.distribute.Strategy')
@tf.function
def _swap_weights(self):
def fn_0(a, b):
a.assign_add(b)
return a
def fn_1(b, a):
b.assign(a - b)
return b
def fn_2(a, b):
a.assign_sub(b)
return a
def swap(strategy, a_and_b):
"""Swap `a` and `b` and mirror to all devices."""
for a, b in a_and_b:
strategy.extended.update(a, fn_0, args=(b,)) # a = a + b
strategy.extended.update(b, fn_1, args=(a,)) # b = a - b
strategy.extended.update(a, fn_2, args=(b,)) # a = a - b
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(
swap, args=(zip(self._average_weights, self._model_weights),))
def assign_average_vars(self, var_list: List[tf.Variable]):
"""Assign variables in var_list with their respective averages.
Args:
var_list: List of model variables to be assigned to their average.
Returns:
assign_op: The op corresponding to the assignment operation of
variables to their average.
"""
assign_op = tf.group([
var.assign(self.get_slot(var, 'average')) for var in var_list
if var.trainable
])
return assign_op
def _create_hypers(self):
self._optimizer._create_hypers() # pylint: disable=protected-access
def _prepare(self, var_list):
return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access
@property
def iterations(self):
return self._optimizer.iterations
@iterations.setter
def iterations(self, variable):
self._optimizer.iterations = variable
@property
def weights(self):
# return self._weights + self._optimizer.weights
return self._optimizer.weights
def variables(self):
return self._weights + [self.iterations]
@property
def lr(self):
return self._optimizer._get_hyper('learning_rate')
@lr.setter
def lr(self, lr):
self._optimizer._set_hyper('learning_rate', lr)
@property
def learning_rate(self):
return self._optimizer._get_hyper('learning_rate')
@learning_rate.setter
def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name
self._optimizer._set_hyper('learning_rate', learning_rate)
def _resource_apply_dense(self, grad, var):
return self._optimizer._resource_apply_dense(grad, var)
def _resource_apply_sparse(self, grad, var, indices):
return self._optimizer._resource_apply_sparse(grad, var, indices)
def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
return self._optimizer._resource_apply_sparse_duplicate_indices(
grad, var, indices)
def get_config(self):
config = {
'optimizer': tf.keras.optimizers.serialize(self._optimizer),
'average_decay': self._average_decay,
'start_step': self._start_step,
'dynamic_decay': self._dynamic_decay,
}
base_config = super(ExponentialMovingAverage, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@classmethod
def from_config(cls, config, custom_objects=None):
optimizer = tf.keras.optimizers.deserialize(
config.pop('optimizer'),
custom_objects=custom_objects,
)
return cls(optimizer, **config)
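Putting the pieces together, a minimal end-to-end sketch of training with this wrapper might look like the following. The model, data, and loss here are placeholders chosen for illustration; the optimizer calls (`shadow_copy`, `apply_gradients`, `swap_weights`, `assign_average_vars`) are the ones defined above.

```python
import tensorflow as tf
from official.modeling.optimization import ema_optimizer

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])  # placeholder model
model.build((None, 8))
optimizer = ema_optimizer.ExponentialMovingAverage(
    tf.keras.optimizers.SGD(0.1), average_decay=0.99)
optimizer.shadow_copy(model)  # creates the 'average' slot for every weight

@tf.function
def train_step(x, y):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
  grads = tape.gradient(loss, model.trainable_variables)
  # apply_gradients also refreshes the shadow averages via update_average().
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return loss

# ... run train_step over the training data ...

optimizer.swap_weights()   # evaluate on the averaged weights
# ... run evaluation ...
optimizer.swap_weights()   # restore the training weights

# Before exporting, copy the averages into the model variables permanently.
optimizer.assign_average_vars(model.variables)
```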
...@@ -16,10 +16,11 @@
"""Optimizer factory class."""
from typing import Union
import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg
from official.nlp import optimization as nlp_optimization
...@@ -89,7 +90,10 @@ class OptimizerFactory(object):
self._optimizer_config = config.optimizer.get()
self._optimizer_type = config.optimizer.type
self._use_ema = config.ema is not None
self._ema_config = config.ema
if self._optimizer_config is None:
raise ValueError('Optimizer type must be specified')
self._lr_config = config.learning_rate.get()
...@@ -143,4 +147,9 @@ class OptimizerFactory(object):
optimizer_dict['learning_rate'] = lr
optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
if self._use_ema:
optimizer = ema_optimizer.ExponentialMovingAverage(
optimizer, **self._ema_config.as_dict())
return optimizer
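With the factory wiring above, enabling EMA becomes a config-only change: when `config.ema` is set, the built optimizer is wrapped in `ExponentialMovingAverage`. A sketch of how this might be driven; the exact oneof keys (`'sgd'`, `'stepwise'`) and the `build_learning_rate`/`build_optimizer` method names are assumptions about this version of the API rather than facts from this diff:

```python
from official.modeling import optimization

config = optimization.OptimizationConfig({
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'ema': {'average_decay': 0.9999, 'start_step': 0},
    'learning_rate': {
        'type': 'stepwise',
        'stepwise': {'boundaries': [10000], 'values': [0.1, 0.01]}
    },
})
factory = optimization.OptimizerFactory(config)
lr = factory.build_learning_rate()
optimizer = factory.build_optimizer(lr)
# Because config.ema is set, `optimizer` is an ExponentialMovingAverage
# wrapping the underlying SGD optimizer.
```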
...@@ -25,8 +25,8 @@ from typing import Any, List, MutableMapping, Text
from absl import logging
import tensorflow as tf
from official.modeling import optimization
from official.utils.misc import keras_utils
from official.vision.image_classification import optimizer_factory
def get_callbacks(model_checkpoint: bool = True,
...@@ -165,7 +165,7 @@ class CustomTensorBoard(tf.keras.callbacks.TensorBoard):
class MovingAverageCallback(tf.keras.callbacks.Callback):
"""A Callback to be used with an `ExponentialMovingAverage` optimizer.
Applies moving average weights to the model during validation time to test
and predict on the averaged weights rather than the current model weights.
...@@ -184,7 +184,8 @@ class MovingAverageCallback(tf.keras.callbacks.Callback):
def set_model(self, model: tf.keras.Model):
super(MovingAverageCallback, self).set_model(model)
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
self.model.optimizer.shadow_copy(self.model)
def on_test_begin(self, logs: MutableMapping[Text, Any] = None):
...@@ -225,13 +226,14 @@ class AverageModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
save_weights_only, mode, save_freq, **kwargs)
def set_model(self, model):
if not isinstance(model.optimizer, optimization.ExponentialMovingAverage):
raise TypeError('AverageModelCheckpoint is only used when training '
'with MovingAverage')
return super().set_model(model)
def _save_model(self, epoch, logs):
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
if self.update_weights:
self.model.optimizer.assign_average_vars(self.model.variables)
...
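For context, these callbacks are attached to a Keras `fit()` loop so that validation and checkpointing see the averaged weights. The sketch below assumes `MovingAverageCallback` takes no required constructor arguments and that `AverageModelCheckpoint` accepts `update_weights` plus the usual `ModelCheckpoint` arguments; both signatures are assumptions, not shown in this diff:

```python
from official.vision.image_classification import callbacks as cb

# `model` is assumed to be a compiled tf.keras.Model whose optimizer is an
# optimization.ExponentialMovingAverage wrapper (see the set_model checks above).
fit_callbacks = [
    cb.MovingAverageCallback(),  # swaps in averaged weights around evaluation
    cb.AverageModelCheckpoint(
        update_weights=True,     # copy the EMA values into the model at save time
        filepath='/tmp/ema-{epoch:02d}.ckpt',
        save_weights_only=True),
]
# model.fit(train_ds, validation_data=val_ds, callbacks=fit_callbacks)
```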
...@@ -18,241 +18,19 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from typing import Any, Dict, Text
from absl import logging
import tensorflow as tf
import tensorflow_addons as tfa
from official.modeling import optimization
from official.vision.image_classification import learning_rate
from official.vision.image_classification.configs import base_configs
# pylint: disable=protected-access
class MovingAverage(tf.keras.optimizers.Optimizer):
"""Optimizer that computes a moving average of the variables.
Empirically it has been found that using the moving average of the trained
parameters of a deep network is better than using its trained parameters
directly. This optimizer allows you to compute this moving average and swap
the variables at save time so that any code outside of the training loop
will use by default the average values instead of the original ones.
Example of usage for training:
```python
opt = tf.keras.optimizers.SGD(learning_rate)
opt = MovingAverage(opt)
opt.shadow_copy(model)
```
At test time, swap the shadow variables to evaluate on the averaged weights:
```python
opt.swap_weights()
# Test eval the model here
opt.swap_weights()
```
"""
def __init__(self,
optimizer: tf.keras.optimizers.Optimizer,
average_decay: float = 0.99,
start_step: int = 0,
dynamic_decay: bool = True,
name: Text = 'moving_average',
**kwargs):
"""Construct a new MovingAverage optimizer.
Args:
optimizer: `tf.keras.optimizers.Optimizer` that will be used to compute
and apply gradients.
average_decay: float. Decay to use to maintain the moving averages of
trained variables.
start_step: int. What step to start the moving average.
dynamic_decay: bool. Whether to change the decay based on the number of
optimizer updates. Decay will start at 0.1 and gradually increase up to
`average_decay` after each optimizer update. This behavior is similar to
`tf.train.ExponentialMovingAverage` in TF 1.x.
name: Optional name for the operations created when applying gradients.
Defaults to "moving_average".
**kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
`decay`}.
"""
super(MovingAverage, self).__init__(name, **kwargs)
self._optimizer = optimizer
self._average_decay = average_decay
self._start_step = tf.constant(start_step, tf.float32)
self._dynamic_decay = dynamic_decay
def shadow_copy(self, model: tf.keras.Model):
"""Creates shadow variables for the given model weights."""
for var in model.weights:
self.add_slot(var, 'average', initializer='zeros')
self._average_weights = [
self.get_slot(var, 'average') for var in model.weights
]
self._model_weights = model.weights
@property
def has_shadow_copy(self):
"""Whether this optimizer has created shadow variables."""
return self._model_weights is not None
def _create_slots(self, var_list):
self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access
def apply_gradients(self, grads_and_vars, name: Text = None):
result = self._optimizer.apply_gradients(grads_and_vars, name)
self.update_average(self._optimizer.iterations)
return result
@tf.function
def update_average(self, step: tf.Tensor):
step = tf.cast(step, tf.float32)
if step < self._start_step:
decay = tf.constant(0., tf.float32)
elif self._dynamic_decay:
decay = step - self._start_step
decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay))
else:
decay = self._average_decay
def _apply_moving(v_moving, v_normal):
diff = v_moving - v_normal
v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff)
return v_moving
def _update(strategy, v_moving_and_v_normal):
for v_moving, v_normal in v_moving_and_v_normal:
strategy.extended.update(v_moving, _apply_moving, args=(v_normal,))
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(
_update, args=(zip(self._average_weights, self._model_weights),))
def swap_weights(self):
"""Swap the average and moving weights.
This is a convenience method to allow one to evaluate the averaged weights
at test time. Loads the weights stored in `self._average` into the model,
keeping a copy of the original model weights. Swapping twice will return
the original weights.
"""
if tf.distribute.in_cross_replica_context():
strategy = tf.distribute.get_strategy()
strategy.run(self._swap_weights, args=())
else:
raise ValueError('Swapping weights must occur under a '
'tf.distribute.Strategy')
@tf.function
def _swap_weights(self):
def fn_0(a, b):
a.assign_add(b)
return a
def fn_1(b, a):
b.assign(a - b)
return b
def fn_2(a, b):
a.assign_sub(b)
return a
def swap(strategy, a_and_b):
"""Swap `a` and `b` and mirror to all devices."""
for a, b in a_and_b:
strategy.extended.update(a, fn_0, args=(b,)) # a = a + b
strategy.extended.update(b, fn_1, args=(a,)) # b = a - b
strategy.extended.update(a, fn_2, args=(b,)) # a = a - b
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(
swap, args=(zip(self._average_weights, self._model_weights),))
def assign_average_vars(self, var_list: List[tf.Variable]):
"""Assign variables in var_list with their respective averages.
Args:
var_list: List of model variables to be assigned to their average.
Returns:
assign_op: The op corresponding to the assignment operation of
variables to their average.
"""
assign_op = tf.group([
var.assign(self.get_slot(var, 'average'))
for var in var_list
if var.trainable
])
return assign_op
def _create_hypers(self):
self._optimizer._create_hypers() # pylint: disable=protected-access
def _prepare(self, var_list):
return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access
@property
def iterations(self):
return self._optimizer.iterations
@iterations.setter
def iterations(self, variable):
self._optimizer.iterations = variable
@property
def weights(self):
# return self._weights + self._optimizer.weights
return self._optimizer.weights
@property
def lr(self):
return self._optimizer._get_hyper('learning_rate')
@lr.setter
def lr(self, lr):
self._optimizer._set_hyper('learning_rate', lr)
@property
def learning_rate(self):
return self._optimizer._get_hyper('learning_rate')
@learning_rate.setter
def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name
self._optimizer._set_hyper('learning_rate', learning_rate)
def _resource_apply_dense(self, grad, var):
return self._optimizer._resource_apply_dense(grad, var)
def _resource_apply_sparse(self, grad, var, indices):
return self._optimizer._resource_apply_sparse(grad, var, indices)
def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
return self._optimizer._resource_apply_sparse_duplicate_indices(
grad, var, indices)
def get_config(self):
config = {
'optimizer': tf.keras.optimizers.serialize(self._optimizer),
'average_decay': self._average_decay,
'start_step': self._start_step,
'dynamic_decay': self._dynamic_decay,
}
base_config = super(MovingAverage, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@classmethod
def from_config(cls, config, custom_objects=None):
optimizer = tf.keras.optimizers.deserialize(
config.pop('optimizer'),
custom_objects=custom_objects,
)
return cls(optimizer, **config)
def build_optimizer(
optimizer_name: Text,
base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
...@@ -269,7 +47,7 @@ def build_optimizer(
should contain optimizer specific parameters such as `base_learning_rate`,
`decay`, etc.
model: The `tf.keras.Model`. This is used for the shadow copy if using
`ExponentialMovingAverage`.
Returns:
A tf.keras.Optimizer.
...@@ -336,9 +114,10 @@ def build_optimizer(
moving_average_decay = params.get('moving_average_decay', 0.)
if moving_average_decay is not None and moving_average_decay > 0.:
if model is None:
raise ValueError(
'`model` must be provided if using `ExponentialMovingAverage`.')
logging.info('Including moving average decay.')
optimizer = optimization.ExponentialMovingAverage(
optimizer=optimizer, average_decay=moving_average_decay)
optimizer.shadow_copy(model)
return optimizer
...
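In the image-classification path above, EMA is enabled through the `moving_average_decay` entry of the optimizer params rather than through `OptimizationConfig`. A hedged sketch; the optimizer name `'momentum'` and the specific param keys are assumptions about this file's conventions, while the `moving_average_decay > 0` trigger and the `model` requirement come from the code shown above:

```python
import tensorflow as tf
from official.vision.image_classification import optimizer_factory

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])  # placeholder model
model.build((None, 8))
lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[10000], values=[0.1, 0.01])
optimizer = optimizer_factory.build_optimizer(
    optimizer_name='momentum',   # assumed name; check the mapping in this file
    base_learning_rate=lr,
    params={'momentum': 0.9,
            'nesterov': False,   # assumed keys for the momentum branch
            'moving_average_decay': 0.9999},
    model=model)                 # required so shadow_copy(model) can run
```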