Commit 87ec3d2a — ModelZoo / ResNet50_tensorflow

Authored May 31, 2020 by Abdullah Rashwan; committed by A. Unique TensorFlower, May 31, 2020.

Internal change

PiperOrigin-RevId: 314002442
Parent: 4ce55184

Showing 9 changed files with 881 additions and 0 deletions (+881 / -0).
Files changed:
  official/modeling/optimization/__init__.py                          (+7 / -0)
  official/modeling/optimization/configs/__init__.py                  (+0 / -0)
  official/modeling/optimization/configs/learning_rate_config.py      (+125 / -0)
  official/modeling/optimization/configs/optimization_config.py       (+89 / -0)
  official/modeling/optimization/configs/optimization_config_test.py  (+61 / -0)
  official/modeling/optimization/configs/optimizer_config.py          (+125 / -0)
  official/modeling/optimization/lr_schedule.py                       (+94 / -0)
  official/modeling/optimization/optimizer_factory.py                 (+142 / -0)
  official/modeling/optimization/optimizer_factory_test.py            (+238 / -0)
official/modeling/optimization/__init__.py (new file, mode 100644)

"""Optimization package definition."""

# pylint: disable=wildcard-import
from official.modeling.optimization.configs.learning_rate_config import *
from official.modeling.optimization.configs.optimization_config import *
from official.modeling.optimization.configs.optimizer_config import *
from official.modeling.optimization.optimizer_factory import OptimizerFactory
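Because the package `__init__` re-exports the config dataclasses via wildcard imports and exposes `OptimizerFactory` directly, downstream code can presumably reach everything through one import. A minimal sketch, not part of this commit, assuming the `official` package is importable:

# Illustrative only: single-import access to the package surface above.
from official.modeling import optimization

config = optimization.OptimizationConfig({'optimizer': {'type': 'sgd'}})
factory = optimization.OptimizerFactory(config)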
official/modeling/optimization/configs/__init__.py (new file, mode 100644; empty)
official/modeling/optimization/configs/learning_rate_config.py (new file, mode 100644)

# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dataclasses for learning rate schedule config."""
from typing import List, Optional

import dataclasses
from official.modeling.hyperparams import base_config


@dataclasses.dataclass
class StepwiseLrConfig(base_config.Config):
  """Configuration for stepwise learning rate decay.

  This class is a container for the piecewise constant learning rate
  scheduling configs. It will configure an instance of the
  PiecewiseConstantDecay keras learning rate schedule.

  An example (from keras docs): use a learning rate that's 1.0 for the first
  100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps.
  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5, 0.1]
  ```

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      PiecewiseConstantDecay.
    boundaries: A list of ints of strictly increasing entries. Defaults to
      None.
    values: A list of floats that specifies the values for the intervals
      defined by `boundaries`. It should have one more element than
      `boundaries`. The learning rate is computed as follows:
        [0, boundaries[0]] -> values[0]
        [boundaries[0], boundaries[1]] -> values[1]
        [boundaries[n-1], boundaries[n]] -> values[n]
        [boundaries[n], end] -> values[n+1]
      Defaults to None.
  """
  name: str = 'PiecewiseConstantDecay'
  boundaries: Optional[List[int]] = None
  values: Optional[List[float]] = None


@dataclasses.dataclass
class ExponentialLrConfig(base_config.Config):
  """Configuration for exponential learning rate decay.

  This class is a container for the exponential learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      ExponentialDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to
      None.
    decay_steps: A positive integer that is used for decay computation.
      Defaults to None.
    decay_rate: A float. Defaults to None.
    staircase: A boolean; if true, the learning rate is decreased at discrete
      intervals. Defaults to False.
  """
  name: str = 'ExponentialDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  decay_rate: Optional[float] = None
  staircase: Optional[bool] = None


@dataclasses.dataclass
class PolynomialLrConfig(base_config.Config):
  """Configuration for polynomial learning rate decay.

  This class is a container for the polynomial learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to PolynomialDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to
      None.
    decay_steps: A positive integer that is used for decay computation.
      Defaults to None.
    end_learning_rate: A float. The minimal end learning rate.
    power: A float. The power of the polynomial. Defaults to linear, 1.0.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
      Defaults to False.
  """
  name: str = 'PolynomialDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  end_learning_rate: float = 0.0001
  power: float = 1.0
  cycle: bool = False


@dataclasses.dataclass
class LinearWarmupConfig(base_config.Config):
  """Configuration for linear warmup schedule config.

  This class is a container for the linear warmup schedule configs.
  warmup_learning_rate is the initial learning rate; the final learning rate
  of the warmup period is the learning_rate of the optimizer in use. The
  learning rate at each step is linearly increased according to the following
  formula:
    warmup_learning_rate = warmup_learning_rate +
      step / warmup_steps * (final_learning_rate - warmup_learning_rate).
  Using warmup overrides the learning rate schedule by the number of warmup
  steps.

  Attributes:
    name: The name of warmup schedule. Defaults to linear.
    warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0.
    warmup_steps: Warmup steps. Defaults to None.
  """
  name: str = 'LinearWarmup'
  warmup_learning_rate: float = 0
  warmup_steps: Optional[int] = None
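To make the boundaries/values mapping in the StepwiseLrConfig docstring concrete, here is a small hand-check of the Keras schedule those fields are documented to configure. Illustrative sketch only, not part of this commit:

import tensorflow as tf

# The keras schedule that StepwiseLrConfig configures.
schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[100000, 110000], values=[1.0, 0.5, 0.1])

print(schedule(0).numpy())       # 1.0 -> steps [0, 100000]
print(schedule(100001).numpy())  # 0.5 -> steps (100000, 110000]
print(schedule(120000).numpy())  # 0.1 -> any step after 110000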
official/modeling/optimization/configs/optimization_config.py (new file, mode 100644)

# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dataclasses for optimization configs.

This file defines the dataclass for optimization configs (OptimizationConfig).
It also has two helper functions, get_optimizer_config and get_lr_config, from
an OptimizationConfig class.
"""
from typing import Optional

import dataclasses
from official.modeling.hyperparams import base_config
from official.modeling.hyperparams import oneof
from official.modeling.optimization.configs import learning_rate_config as lr_cfg
from official.modeling.optimization.configs import optimizer_config as opt_cfg


@dataclasses.dataclass
class OptimizerConfig(oneof.OneOfConfig):
  """Configuration for optimizer.

  Attributes:
    type: 'str', type of optimizer to be used, one of the fields below.
    sgd: sgd optimizer config.
    adam: adam optimizer config.
    adamw: adam with weight decay.
    lamb: lamb optimizer.
  """
  type: Optional[str] = None
  sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig()
  adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig()
  adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig()
  lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig()


@dataclasses.dataclass
class LrConfig(oneof.OneOfConfig):
  """Configuration for lr schedule.

  Attributes:
    type: 'str', type of lr schedule to be used, one of the fields below.
    stepwise: stepwise learning rate config.
    exponential: exponential learning rate config.
    polynomial: polynomial learning rate config.
  """
  type: Optional[str] = None
  stepwise: lr_cfg.StepwiseLrConfig = lr_cfg.StepwiseLrConfig()
  exponential: lr_cfg.ExponentialLrConfig = lr_cfg.ExponentialLrConfig()
  polynomial: lr_cfg.PolynomialLrConfig = lr_cfg.PolynomialLrConfig()


@dataclasses.dataclass
class WarmupConfig(oneof.OneOfConfig):
  """Configuration for warmup schedule.

  Attributes:
    type: 'str', type of warmup schedule to be used, one of the fields below.
    linear: linear warmup config.
  """
  type: Optional[str] = None
  linear: lr_cfg.LinearWarmupConfig = lr_cfg.LinearWarmupConfig()


@dataclasses.dataclass
class OptimizationConfig(base_config.Config):
  """Configuration for optimizer and learning rate schedule.

  Attributes:
    optimizer: optimizer oneof config.
    learning_rate: learning rate oneof config.
    warmup: warmup oneof config.
  """
  optimizer: OptimizerConfig = OptimizerConfig()
  learning_rate: LrConfig = LrConfig()
  warmup: WarmupConfig = WarmupConfig()
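Each oneof class resolves its active sub-config through the `type` field; `.get()` returns the selected dataclass, or None when no type is set, which is what the factory below consumes. A hedged sketch of that selection (the test file that follows exercises the same behavior):

# Illustrative only: how the oneof `type` field drives `.get()`.
from official.modeling.optimization.configs import optimization_config

config = optimization_config.OptimizationConfig({
    'optimizer': {'type': 'sgd', 'sgd': {'learning_rate': 0.1}},
    'learning_rate': {'type': 'stepwise'},
})

print(config.optimizer.type)                   # 'sgd'
print(config.optimizer.get().learning_rate)    # 0.1
print(config.warmup.get())                     # None -- no warmup type selected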
official/modeling/optimization/configs/optimization_config_test.py (new file, mode 100644)

# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for optimization_config.py."""

import tensorflow as tf

from official.modeling.optimization.configs import learning_rate_config as lr_cfg
from official.modeling.optimization.configs import optimization_config
from official.modeling.optimization.configs import optimizer_config as opt_cfg


class OptimizerConfigTest(tf.test.TestCase):

  def test_no_optimizer(self):
    optimizer = optimization_config.OptimizationConfig({}).optimizer.get()
    self.assertEqual(optimizer, None)

  def test_no_lr_schedule(self):
    lr = optimization_config.OptimizationConfig({}).learning_rate.get()
    self.assertEqual(lr, None)

  def test_no_warmup_schedule(self):
    warmup = optimization_config.OptimizationConfig({}).warmup.get()
    self.assertEqual(warmup, None)

  def test_config(self):
    opt_config = optimization_config.OptimizationConfig({
        'optimizer': {
            'type': 'sgd',
            'sgd': {}  # default config
        },
        'learning_rate': {
            'type': 'polynomial',
            'polynomial': {}
        },
        'warmup': {
            'type': 'linear'
        }
    })
    self.assertEqual(opt_config.optimizer.get(), opt_cfg.SGDConfig())
    self.assertEqual(opt_config.learning_rate.get(),
                     lr_cfg.PolynomialLrConfig())
    self.assertEqual(opt_config.warmup.get(), lr_cfg.LinearWarmupConfig())


if __name__ == '__main__':
  tf.test.main()
official/modeling/optimization/configs/optimizer_config.py (new file, mode 100644)

# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dataclasses for optimizer configs."""
from typing import List, Optional

import dataclasses
from official.modeling.hyperparams import base_config


@dataclasses.dataclass
class SGDConfig(base_config.Config):
  """Configuration for SGD optimizer.

  The attributes of this class match the arguments of tf.keras.optimizer.SGD.

  Attributes:
    name: name of the optimizer.
    learning_rate: learning_rate for SGD optimizer.
    decay: decay rate for SGD optimizer.
    nesterov: nesterov for SGD optimizer.
    momentum: momentum for SGD optimizer.
  """
  name: str = "SGD"
  learning_rate: float = 0.01
  decay: float = 0.0
  nesterov: bool = False
  momentum: float = 0.0


@dataclasses.dataclass
class AdamConfig(base_config.Config):
  """Configuration for Adam optimizer.

  The attributes of this class match the arguments of
  tf.keras.optimizer.Adam.

  Attributes:
    name: name of the optimizer.
    learning_rate: learning_rate for Adam optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and beyond".
  """
  name: str = "Adam"
  learning_rate: float = 0.001
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False


@dataclasses.dataclass
class AdamWeightDecayConfig(base_config.Config):
  """Configuration for Adam optimizer with weight decay.

  Attributes:
    name: name of the optimizer.
    learning_rate: learning_rate for the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and beyond".
    weight_decay_rate: float. Weight decay rate. Defaults to 0.
    include_in_weight_decay: list[str], or None. List of weight names to
      include in weight decay.
    exclude_from_weight_decay: list[str], or None. List of weight names to
      exclude from weight decay.
  """
  name: str = "AdamWeightDecay"
  learning_rate: float = 0.001
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  weight_decay_rate: float = 0.0
  include_in_weight_decay: Optional[List[str]] = None
  exclude_from_weight_decay: Optional[List[str]] = None


@dataclasses.dataclass
class LAMBConfig(base_config.Config):
  """Configuration for LAMB optimizer.

  The attributes of this class match the arguments of
  tensorflow_addons.optimizers.LAMB.

  Attributes:
    name: name of the optimizer.
    learning_rate: learning_rate for LAMB optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in LAMB optimizer.
    weight_decay_rate: float. Weight decay rate. Defaults to 0.
    exclude_from_weight_decay: List of regex patterns of variables excluded
      from weight decay. Variables whose name contain a substring matching
      the pattern will be excluded.
    exclude_from_layer_adaptation: List of regex patterns of variables
      excluded from layer adaptation. Variables whose name contain a
      substring matching the pattern will be excluded.
  """
  name: str = "LAMB"
  learning_rate: float = 0.001
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
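Because the attribute names mirror the Keras constructor arguments, a config can be expanded straight into the corresponding optimizer, which is the pattern optimizer_factory.py uses below. A minimal sketch (illustrative, not part of the commit):

import tensorflow as tf

from official.modeling.optimization.configs import optimizer_config as opt_cfg

# Default SGD config: name, learning_rate, decay, nesterov, momentum.
sgd_config = opt_cfg.SGDConfig()
sgd = tf.keras.optimizers.SGD(**sgd_config.as_dict())
print(sgd.get_config())  # defaults: learning_rate=0.01, momentum=0.0, ...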
official/modeling/optimization/lr_schedule.py (new file, mode 100644)

# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Learning rate schedule classes."""
from typing import Mapping, Any, Union, Optional

import tensorflow as tf


class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Linear warmup schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               warmup_learning_rate: float,
               name: Optional[str] = None):
    """Adds a linear warmup schedule to a learning rate schedule.

    warmup_lr is the initial learning rate; the final learning rate of the
    warmup period is the initial learning rate of the lr_schedule in use.
    The learning rate at each step is linearly increased according to the
    following formula:
      learning_rate = warmup_lr + step / warmup_steps
                      * (final_warmup_lr - warmup_lr).
    Using warmup overrides the learning rate schedule by the number of warmup
    steps.

    Args:
      after_warmup_lr_sched: tf.keras.optimizers.schedules
        .LearningRateSchedule or a constant.
      warmup_steps: int. number of the warmup steps.
      warmup_learning_rate: floating point number. Initial learning rate for
        the warmup.
      name: Optional, name of warmup schedule.
    """
    super(LinearWarmup, self).__init__()
    self._name = name
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._warmup_steps = warmup_steps
    self._init_warmup_lr = warmup_learning_rate
    if isinstance(after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      self._final_warmup_lr = after_warmup_lr_sched(warmup_steps)
    else:
      self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32)

  def __call__(self, step: int):
    global_step = tf.cast(step, dtype=tf.float32)

    linear_warmup_lr = (
        self._init_warmup_lr + global_step / self._warmup_steps *
        (self._final_warmup_lr - self._init_warmup_lr))

    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      after_warmup_lr = self._after_warmup_lr_sched(step)
    else:
      after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

    lr = tf.cond(global_step < self._warmup_steps,
                 lambda: linear_warmup_lr,
                 lambda: after_warmup_lr)
    return lr

  def get_config(self) -> Mapping[str, Any]:
    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      name = "{!s}WithWarmup".format(
          self._after_warmup_lr_sched.name)  # pytype: disable=attribute-error
      config = self._after_warmup_lr_sched.get_config()  # pytype: disable=attribute-error
    else:
      name = "ConstantWithWarmup"
      config = {"learning_rate": self._after_warmup_lr_sched}

    config.update({
        "warmup_steps": self._warmup_steps,
        "warmup_learning_rate": self._init_warmup_lr,
        "name": name
    })
    return config
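As a quick sanity check of the warmup formula above: wrapping a constant learning rate of 0.1 with a 500-step linear warmup starting at 0.01 yields 0.01 at step 0, 0.055 at step 250, and 0.1 from step 500 onwards, which matches the values the factory tests below assert. An illustrative sketch, not part of the commit:

import tensorflow as tf

from official.modeling.optimization import lr_schedule

warmed_up = lr_schedule.LinearWarmup(
    after_warmup_lr_sched=0.1,      # constant post-warmup learning rate
    warmup_steps=500,
    warmup_learning_rate=0.01)

for step in (0, 250, 500, 1000):
  print(step, warmed_up(step).numpy())  # 0.01, 0.055, 0.1, 0.1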
official/modeling/optimization/optimizer_factory.py (new file, mode 100644)

# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizer factory class."""
from typing import Union

import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers

from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg
from official.nlp import optimization as nlp_optimization

OPTIMIZERS_CLS = {
    'sgd': tf.keras.optimizers.SGD,
    'adam': tf.keras.optimizers.Adam,
    'adamw': nlp_optimization.AdamWeightDecay,
    'lamb': tfa_optimizers.LAMB
}

LR_CLS = {
    'stepwise': tf.keras.optimizers.schedules.PiecewiseConstantDecay,
    'polynomial': tf.keras.optimizers.schedules.PolynomialDecay,
    'exponential': tf.keras.optimizers.schedules.ExponentialDecay,
}

WARMUP_CLS = {'linear': lr_schedule.LinearWarmup}


class OptimizerFactory(object):
  """Optimizer factory class.

  This class builds a learning rate and an optimizer based on an optimization
  config. To use this class, you need to do the following:
  (1) Define an optimization config; this includes the optimizer and the
      learning rate schedule.
  (2) Initialize the class using the optimization config.
  (3) Build the learning rate.
  (4) Build the optimizer.

  This is a typical example of using this class:
  params = {
      'optimizer': {
          'type': 'sgd',
          'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
      },
      'learning_rate': {
          'type': 'stepwise',
          'stepwise': {'boundaries': [10000, 20000],
                       'values': [0.1, 0.01, 0.001]}
      },
      'warmup': {
          'type': 'linear',
          'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01}
      }
  }
  opt_config = OptimizationConfig(params)
  opt_factory = OptimizerFactory(opt_config)
  lr = opt_factory.build_learning_rate()
  optimizer = opt_factory.build_optimizer(lr)
  """

  def __init__(self, config: opt_cfg.OptimizationConfig):
    """Initializing OptimizerFactory.

    Args:
      config: OptimizationConfig instance containing the optimization config.
    """
    self._config = config
    self._optimizer_config = config.optimizer.get()
    self._optimizer_type = config.optimizer.type

    if self._optimizer_config is None:
      raise ValueError('Optimizer type must be specified')

    self._lr_config = config.learning_rate.get()
    self._lr_type = config.learning_rate.type

    self._warmup_config = config.warmup.get()
    self._warmup_type = config.warmup.type

  def build_learning_rate(self):
    """Build learning rate.

    Builds the learning rate from the config. The learning rate schedule is
    built according to the learning rate config. If there is no learning rate
    config, the optimizer learning rate is returned.

    Returns:
      tf.keras.optimizers.schedules.LearningRateSchedule instance. If no
      learning rate schedule is defined, optimizer_config.learning_rate is
      returned.
    """

    if not self._lr_config:
      lr = self._optimizer_config.learning_rate
    else:
      lr = LR_CLS[self._lr_type](**self._lr_config.as_dict())

    if self._warmup_config:
      lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict())

    return lr

  def build_optimizer(
      self, lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule,
                      float]):
    """Build optimizer.

    Builds the optimizer from the config. It takes the learning rate as input,
    and builds the optimizer according to the optimizer config. Typically, the
    learning rate built using self.build_learning_rate() is passed as an
    argument to this method.

    Args:
      lr: A floating point value, or a
        tf.keras.optimizers.schedules.LearningRateSchedule instance.
    Returns:
      tf.keras.optimizers.Optimizer instance.
    """
    optimizer_dict = self._optimizer_config.as_dict()
    optimizer_dict['learning_rate'] = lr

    optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)

    return optimizer
official/modeling/optimization/optimizer_factory_test.py (new file, mode 100644)

# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for optimizer_factory.py."""
import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers

from official.modeling.optimization import optimizer_factory
from official.modeling.optimization.configs import optimization_config
from official.nlp import optimization as nlp_optimization


class OptimizerFactoryTest(tf.test.TestCase):

  def test_sgd_optimizer(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
        }
    }
    expected_optimizer_config = {
        'name': 'SGD',
        'learning_rate': 0.1,
        'decay': 0.0,
        'momentum': 0.9,
        'nesterov': False
    }
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr)

    self.assertIsInstance(optimizer, tf.keras.optimizers.SGD)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())

  def test_adam_optimizer(self):
    # Define adam optimizer with default values.
    params = {
        'optimizer': {
            'type': 'adam'
        }
    }
    expected_optimizer_config = tf.keras.optimizers.Adam().get_config()
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr)

    self.assertIsInstance(optimizer, tf.keras.optimizers.Adam)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())

  def test_adam_weight_decay_optimizer(self):
    params = {
        'optimizer': {
            'type': 'adamw'
        }
    }
    expected_optimizer_config = nlp_optimization.AdamWeightDecay().get_config()
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr)

    self.assertIsInstance(optimizer, nlp_optimization.AdamWeightDecay)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())

  def test_lamb_optimizer(self):
    params = {
        'optimizer': {
            'type': 'lamb'
        }
    }
    expected_optimizer_config = tfa_optimizers.LAMB().get_config()
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr)

    self.assertIsInstance(optimizer, tfa_optimizers.LAMB)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())

  def test_stepwise_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {'boundaries': [10000, 20000],
                         'values': [0.1, 0.01, 0.001]}
        }
    }
    expected_lr_step_values = [[0, 0.1], [5000, 0.1], [10000, 0.1],
                               [10001, 0.01], [20000, 0.01], [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_stepwise_lr_with_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {'boundaries': [10000, 20000],
                         'values': [0.1, 0.01, 0.001]}
        },
        'warmup': {
            'type': 'linear',
            'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01}
        }
    }
    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1],
                               [5500, 0.1], [10000, 0.1], [10001, 0.01],
                               [20000, 0.01], [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_exponential_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'exponential',
            'exponential': {'initial_learning_rate': 0.1,
                            'decay_steps': 1000,
                            'decay_rate': 0.96,
                            'staircase': True}
        }
    }
    expected_lr_step_values = [
        [0, 0.1],
        [999, 0.1],
        [1000, 0.096],
        [1999, 0.096],
        [2000, 0.09216],
    ]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_polynomial_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'polynomial',
            'polynomial': {'initial_learning_rate': 0.1,
                           'decay_steps': 1000,
                           'end_learning_rate': 0.001}
        }
    }
    expected_lr_step_values = [[0, 0.1], [500, 0.0505], [1000, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_constant_lr_with_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
        },
        'warmup': {
            'type': 'linear',
            'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01}
        }
    }
    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1],
                               [5000, 0.1], [10000, 0.1], [20000, 0.1]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)


if __name__ == '__main__':
  tf.test.main()