ModelZoo / ResNet50_tensorflow

Commit 95d1b298, authored Apr 26, 2021 by Hao Wu, committed by A. Unique TensorFlower on Apr 26, 2021
Parent: 71c7b7f9

Add adagrad optimizer support.

PiperOrigin-RevId: 370551493
Showing 4 changed files with 42 additions and 27 deletions (+42, -27):
official/modeling/optimization/configs/optimization_config.py   (+4, -2)
official/modeling/optimization/configs/optimizer_config.py      (+32, -15)
official/modeling/optimization/optimizer_factory.py             (+1, -1)
official/modeling/optimization/optimizer_factory_test.py        (+5, -9)
official/modeling/optimization/configs/optimization_config.py (view file @ 95d1b298)

@@ -40,6 +40,7 @@ class OptimizerConfig(oneof.OneOfConfig):
     lamb: lamb optimizer.
     rmsprop: rmsprop optimizer.
     lars: lars optimizer.
+    adagrad: adagrad optimizer.
   """
   type: Optional[str] = None
   sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig()
@@ -48,6 +49,7 @@ class OptimizerConfig(oneof.OneOfConfig):
   lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig()
   rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig()
   lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
+  adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig()
 
 
 @dataclasses.dataclass
@@ -99,8 +101,8 @@ class OptimizationConfig(base_config.Config):
   Attributes:
     optimizer: optimizer oneof config.
     ema: optional exponential moving average optimizer config, if specified, ema
       optimizer will be used.
     learning_rate: learning rate oneof config.
     warmup: warmup oneof config.
   """
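With this hunk applied, 'adagrad' becomes a valid value for the optimizer oneof. As a rough illustration (not part of the commit), the new entry can be selected from a nested dict the same way the factory test later in this diff builds its configs:

# Hypothetical usage sketch, not part of this commit.
from official.modeling.optimization.configs import optimization_config

params = {
    'optimizer': {
        'type': 'adagrad',
        'adagrad': {'initial_accumulator_value': 0.1, 'epsilon': 1e-07},
    },
}
opt_config = optimization_config.OptimizationConfig(params)
print(opt_config.optimizer.type)  # -> 'adagrad'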
official/modeling/optimization/configs/optimizer_config.py (view file @ 95d1b298)

@@ -29,7 +29,7 @@ class BaseOptimizerConfig(base_config.Config):
     clipvalue: float >= 0 or None. If not None, Gradients will be clipped when
       their absolute value exceeds this value.
     global_clipnorm: float >= 0 or None. If not None, gradient of all weights is
       clipped so that their global norm is no higher than this value
   """
   clipnorm: Optional[float] = None
   clipvalue: Optional[float] = None
@@ -75,6 +75,24 @@ class RMSPropConfig(BaseOptimizerConfig):
   centered: bool = False
 
 
+@dataclasses.dataclass
+class AdagradConfig(BaseOptimizerConfig):
+  """Configuration for Adagrad optimizer.
+
+  The attributes of this class match the arguments of
+  tf.keras.optimizer.Adagrad.
+
+  Attributes:
+    name: name of the optimizer.
+    initial_accumulator_value: A floating point value. Starting value for the
+      accumulators, must be non-negative.
+    epsilon: A small floating point value to avoid zero denominator.
+  """
+  name: str = "Adagrad"
+  initial_accumulator_value: float = 0.1
+  epsilon: float = 1e-07
+
+
 @dataclasses.dataclass
 class AdamConfig(BaseOptimizerConfig):
   """Configuration for Adam optimizer.
@@ -178,24 +196,23 @@ class LARSConfig(BaseOptimizerConfig):
   Attributes:
     name: 'str', name of the optimizer.
     momentum: `float` hyperparameter >= 0 that accelerates gradient descent in
       the relevant direction and dampens oscillations. Defaults to 0.9.
     eeta: `float` LARS coefficient as used in the paper. Default set to LARS
       coefficient from the paper. (eeta / weight_decay) determines the highest
       scaling factor in LARS..
     weight_decay_rate: `float` for weight decay.
     nesterov: 'boolean' for whether to use nesterov momentum.
     classic_momentum: `boolean` for whether to use classic (or popular)
       momentum. The learning rate is applied during momentum update in classic
       momentum, but after momentum for popular momentum.
     exclude_from_weight_decay: A list of `string` for variable screening, if any
       of the string appears in a variable's name, the variable will be excluded
       for computing weight decay. For example, one could specify the list like
       ['batch_normalization', 'bias'] to exclude BN and bias from weight decay.
     exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but for
       layer adaptation. If it is None, it will be defaulted the same as
       exclude_from_weight_decay.
   """
   name: str = "LARS"
   momentum: float = 0.9
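Because the new AdagradConfig mirrors the constructor arguments of tf.keras.optimizers.Adagrad, its fields can be forwarded directly. A minimal sketch (not from the commit; the learning rate is supplied separately by the learning-rate config in the real factory):

# Hypothetical sketch: forwarding AdagradConfig fields to the Keras optimizer.
import tensorflow as tf
from official.modeling.optimization.configs import optimizer_config

cfg = optimizer_config.AdagradConfig()
adagrad = tf.keras.optimizers.Adagrad(
    learning_rate=0.01,  # comes from the learning-rate config, not this dataclass
    initial_accumulator_value=cfg.initial_accumulator_value,
    epsilon=cfg.epsilon,
    name=cfg.name)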
official/modeling/optimization/optimizer_factory.py (view file @ 95d1b298)

@@ -15,7 +15,6 @@
 """Optimizer factory class."""
 from typing import Callable, Union
 
 import gin
 import tensorflow as tf
 import tensorflow_addons.optimizers as tfa_optimizers
@@ -33,6 +32,7 @@ OPTIMIZERS_CLS = {
     'lamb': tfa_optimizers.LAMB,
     'rmsprop': tf.keras.optimizers.RMSprop,
     'lars': lars_optimizer.LARS,
+    'adagrad': tf.keras.optimizers.Adagrad,
 }
 
 LR_CLS = {
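This one-line change is what makes the factory aware of the new type: the optimizer oneof's 'type' string is used as the key into OPTIMIZERS_CLS, and the selected class is then instantiated from the matching config. A simplified sketch of that lookup (the factory's actual build logic is not shown in this hunk):

# Simplified sketch of the OPTIMIZERS_CLS lookup; not the factory's actual code.
from official.modeling.optimization import optimizer_factory

optimizer_cls = optimizer_factory.OPTIMIZERS_CLS['adagrad']  # tf.keras.optimizers.Adagrad
optimizer = optimizer_cls(learning_rate=0.01, initial_accumulator_value=0.1,
                          epsilon=1e-07)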
official/modeling/optimization/optimizer_factory_test.py (view file @ 95d1b298)

@@ -23,9 +23,8 @@ from official.modeling.optimization.configs import optimization_config
 class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
 
-  @parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'), ('lamb'),
-                            ('lars'))
+  @parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'), ('lamb'),
+                            ('lars'), ('adagrad'))
   def test_optimizers(self, optimizer_type):
     params = {
         'optimizer': {
@@ -50,10 +49,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
     self.assertIsInstance(optimizer, optimizer_cls)
     self.assertEqual(expected_optimizer_config, optimizer.get_config())
 
   @parameterized.parameters((None, None), (1.0, None), (None, 1.0))
   def test_gradient_clipping(self, clipnorm, clipvalue):
     params = {
         'optimizer': {
@@ -359,8 +355,8 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
             }
         }
     }
     expected_lr_step_values = [[0, 1.0], [1, 1.0], [40, 1. / 40.],
                                [60, 1. / 60. * 0.8]]
     opt_config = optimization_config.OptimizationConfig(params)
     opt_factory = optimizer_factory.OptimizerFactory(opt_config)
     lr = opt_factory.build_learning_rate()
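Taken together, the four files let the parameterized factory test exercise Adagrad end to end. A sketch of the flow that test follows (the 'constant' learning-rate type and build_optimizer() are assumptions based on the rest of the Model Garden API; neither appears in these hunks):

# End-to-end sketch mirroring the test flow; 'constant' LR type and
# build_optimizer() are assumptions, not shown in this diff.
import tensorflow as tf
from official.modeling.optimization import optimizer_factory
from official.modeling.optimization.configs import optimization_config

params = {
    'optimizer': {'type': 'adagrad'},
    'learning_rate': {'type': 'constant', 'constant': {'learning_rate': 0.1}},
}
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(lr)
assert isinstance(optimizer, tf.keras.optimizers.Adagrad)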