Merge branch 'dtk21.10.1_v1' into 'main'

update some TF file See merge request dcutoolkit/deeplearing/dlexamples_new!5

Merge branch 'dtk21.10.1_v1' into 'main'
update some TF file See merge request dcutoolkit/deeplearing/dlexamples_new!5
7f99c1c3 · huchen · 6b6f8b0c · cf66c525 · 7f99c1c3 · 7f99c1c3
Commit 7f99c1c3 authored Apr 15, 2022 by huchen
20 changed files
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/lr_schedule.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/lr_schedule.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Learning rate schedule classes."""
+
+import math
+from typing import Mapping, Any, Union, Optional
+
+import tensorflow as tf
+
+
+def _make_offset_wrapper(new_class_name: str, base_lr_class):
+  """Generates an offset wrapper of a learning rate schedule.
+
+  It returns a subclass of the `base_lr_class`; the subclass takes an
+  `offset` argument in the constructor. When the new class instance is called,
+  the behavior is:
+    new_class_object(step) = base_lr_class_object(step - offset)
+
+  Example:
+    CosineDecayWithOffset = _make_offset_wrapper(
+                     'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
+    # Use the lr:
+    lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
+                               decay_steps=1000)
+    lr(101) # equals to tf.keras.experimental.CosineDecay(...)(101-100)
+
+  Args:
+    new_class_name: the name of the new class.
+    base_lr_class: the base learning rate schedule class. Should be subclass of
+      tf.keras.optimizers.schedules.LearningRateSchedule
+
+  Returns:
+    A new class (subclass of the base_lr_class) that can take an offset.
+  """
+  assert issubclass(base_lr_class,
+                    tf.keras.optimizers.schedules.LearningRateSchedule), (
+                        "base_lr_class should be subclass of keras "
+                        f"LearningRateSchedule, got {base_lr_class}")
+
+  # pylint: disable=protected-access,pointless-statement
+  def offset_learning_rate_init(self, offset=0, **kwargs):
+    """Construct learning rate schedule object.
+
+    When this object is called, its behavior is
+       self.__call__(step) == base_lr_class.__call__(step - offset)
+    Args:
+      self: this object.
+      offset: The offset when computing the learning rate schedule.
+      **kwargs: Pass through to base learning rate class constructor.
+    """
+    base_lr_class.__init__(self, **kwargs)
+    self._offset = offset
+
+  def offset_learning_rate_call(self, step):
+    # Shift the step by the stored offset and cast it to float32 before
+    # delegating to the wrapped schedule.
+    step = tf.cast(step - self._offset, tf.float32)
+    return base_lr_class.__call__(self, step)
+
+  # pylint: enable=protected-access,pointless-statement
+
+  # Build the subclass dynamically. `base_lr_class` is exposed as a class
+  # attribute so callers can reach the wrapped schedule class.
+  # NOTE(review): `get_config` is not overridden here, so the inherited config
+  # presumably does not contain `offset` -- confirm before relying on
+  # serialization round trips.
+  return type(
+      new_class_name, (base_lr_class,), {
+          "base_lr_class": base_lr_class,
+          "__init__": offset_learning_rate_init,
+          "__call__": offset_learning_rate_call
+      })
+
+
+# Offset-aware variants of the stock Keras schedules. Each generated class
+# accepts an extra `offset` constructor argument and evaluates the wrapped
+# schedule at `step - offset`.
+PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
+    "PiecewiseConstantDecayWithOffset",
+    tf.keras.optimizers.schedules.PiecewiseConstantDecay)
+PolynomialDecayWithOffset = _make_offset_wrapper(
+    "PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay)
+ExponentialDecayWithOffset = _make_offset_wrapper(
+    "ExponentialDecayWithOffset",
+    tf.keras.optimizers.schedules.ExponentialDecay)
+CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset",
+                                             tf.keras.experimental.CosineDecay)
+
+
+class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Linear warmup schedule."""
+
+  def __init__(self,
+               after_warmup_lr_sched: Union[
+                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
+               warmup_steps: int,
+               warmup_learning_rate: float,
+               name: Optional[str] = None):
+    """Add linear warmup schedule to a learning rate schedule.
+
+    `warmup_learning_rate` is the learning rate at step 0. The learning rate at
+    the end of the warmup period is the value of `after_warmup_lr_sched` at
+    `warmup_steps` (or the constant itself when a float is passed).
+    During warmup the learning rate increases linearly according to the
+    following formula:
+      learning_rate = warmup_learning_rate + step / warmup_steps
+                    * (final_warmup_lr - warmup_learning_rate).
+    Using warmup overrides the learning rate schedule by the number of warmup
+    steps.
+
+    Args:
+      after_warmup_lr_sched: tf.keras.optimizers.schedules.LearningRateSchedule
+        or a constant.
+      warmup_steps: Number of the warmup steps.
+      warmup_learning_rate: Initial learning rate for the warmup.
+      name: Optional, name of warmup schedule.
+    """
+    super().__init__()
+    self._name = name
+    self._after_warmup_lr_sched = after_warmup_lr_sched
+    self._warmup_steps = warmup_steps
+    self._init_warmup_lr = warmup_learning_rate
+    # The warmup ramp ends at the value the wrapped schedule (or constant)
+    # takes at `warmup_steps`.
+    if isinstance(after_warmup_lr_sched,
+                  tf.keras.optimizers.schedules.LearningRateSchedule):
+      self._final_warmup_lr = after_warmup_lr_sched(warmup_steps)
+    else:
+      self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32)
+
+  def __call__(self, step: int):
+    """Returns the learning rate for `step`.
+
+    The linear warmup value is used while `step < warmup_steps`; afterwards the
+    wrapped schedule (evaluated at the unshifted `step`) or the constant is
+    used.
+    """
+
+    global_step = tf.cast(step, dtype=tf.float32)
+
+    linear_warmup_lr = (
+        self._init_warmup_lr + global_step / self._warmup_steps *
+        (self._final_warmup_lr - self._init_warmup_lr))
+
+    if isinstance(self._after_warmup_lr_sched,
+                  tf.keras.optimizers.schedules.LearningRateSchedule):
+      after_warmup_lr = self._after_warmup_lr_sched(step)
+    else:
+      after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)
+
+    # Both candidates are computed above; tf.cond only selects between them.
+    lr = tf.cond(global_step < self._warmup_steps,
+                 lambda: linear_warmup_lr,
+                 lambda: after_warmup_lr)
+    return lr
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the schedule configuration as a dictionary.
+
+    A wrapped schedule is stored as the dictionary returned by its own
+    `get_config()`; a constant learning rate is stored as is.
+    """
+    if isinstance(self._after_warmup_lr_sched,
+                  tf.keras.optimizers.schedules.LearningRateSchedule):
+      config = {
+          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
+    else:
+      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error
+
+    config.update({
+        "warmup_steps": self._warmup_steps,
+        "warmup_learning_rate": self._init_warmup_lr,
+        "name": self._name
+    })
+    return config
+
+
+class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Applies polynomial warmup schedule on a given learning rate decay schedule."""
+
+  def __init__(self,
+               after_warmup_lr_sched: Union[
+                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
+               warmup_steps: int,
+               power: float = 1.0,
+               name: str = "PolynomialWarmup"):
+    """Add polynomial warmup schedule to a learning rate schedule.
+
+    Args:
+      after_warmup_lr_sched: tf.keras.optimizers.schedules.LearningRateSchedule
+        or a constant, used once warmup is over.
+      warmup_steps: Number of the warmup steps.
+      power: The exponent applied to the fraction of warmup completed.
+      name: Optional, name of warmup schedule.
+    """
+    super().__init__()
+    # The warmup target is the value the wrapped schedule (or constant) takes
+    # at `warmup_steps`.
+    if isinstance(after_warmup_lr_sched,
+                  tf.keras.optimizers.schedules.LearningRateSchedule):
+      self._initial_learning_rate = after_warmup_lr_sched(warmup_steps)
+    else:
+      self._initial_learning_rate = tf.cast(
+          after_warmup_lr_sched, dtype=tf.float32)
+
+    self._warmup_steps = warmup_steps
+    self._power = power
+    self._after_warmup_lr_sched = after_warmup_lr_sched
+    self._name = name
+
+  def __call__(self, step):
+    """Returns the learning rate for `step`.
+
+    While `step < warmup_steps` the result is
+    `target_lr * (max(step, 1) / warmup_steps) ** power`; afterwards it is the
+    wrapped schedule evaluated at `step`, or the constant.
+    """
+    with tf.name_scope(self._name or "PolynomialWarmUp") as name:
+      # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
+      # learning rate will be
+      # `(global_step/num_warmup_steps) ** power * init_lr`.
+      global_step_float = tf.cast(step, tf.float32)
+      warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)
+
+      if self._warmup_steps <= 0:
+        warmup_percent_done = 1.0
+      else:
+        # A zero `step` may cause Inf. So make `step` positive.
+        step_non_zero = tf.math.maximum(global_step_float, 1.0)
+        warmup_percent_done = step_non_zero / warmup_steps_float
+
+      warmup_learning_rate = (
+          self._initial_learning_rate *
+          tf.math.pow(warmup_percent_done, self._power))
+
+      if isinstance(self._after_warmup_lr_sched,
+                    tf.keras.optimizers.schedules.LearningRateSchedule):
+        after_warmup_lr = self._after_warmup_lr_sched(step)
+      else:
+        after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)
+
+      # Both candidates are computed above; tf.cond only selects between them.
+      return tf.cond(
+          global_step_float < warmup_steps_float,
+          lambda: warmup_learning_rate,
+          lambda: after_warmup_lr,
+          name=name)
+
+  def get_config(self) -> Mapping[str, Any]:
+    """Returns the schedule configuration as a dictionary.
+
+    A wrapped schedule is stored as the dictionary returned by its own
+    `get_config()`; a constant learning rate is stored as is.
+    """
+    if isinstance(self._after_warmup_lr_sched,
+                  tf.keras.optimizers.schedules.LearningRateSchedule):
+      config = {
+          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
+    else:
+      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error
+
+    config.update({
+        "warmup_steps": self._warmup_steps,
+        "power": self._power,
+        "name": self._name
+    })
+    return config
+
+
+class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Schedule whose learning rate is `initial_learning_rate * step^power`."""
+
+  def __init__(self,
+               initial_learning_rate: float,
+               power: float = 1.0,
+               name: str = "DirectPowerDecay"):
+    """Stores the parameters of the power schedule.
+
+    Args:
+      initial_learning_rate: The learning rate multiplied by the power term.
+      power: The exponent applied to the step.
+      name: Optional, name used for the TensorFlow name scope.
+    """
+    super().__init__()
+    self._name = name
+    self._power = power
+    self._initial_learning_rate = initial_learning_rate
+
+  def __call__(self, step):
+    """Returns `initial_learning_rate * max(step, 1)^power`."""
+    scope_name = self._name or "DirectPowerDecay"
+    with tf.name_scope(scope_name):
+      step_float = tf.cast(step, tf.float32)
+      # Clamp to at least 1 so a zero step cannot produce Inf for negative
+      # powers.
+      clamped_step = tf.math.maximum(step_float, 1.0)
+      power_factor = tf.math.pow(clamped_step, self._power)
+      return self._initial_learning_rate * power_factor
+
+  def get_config(self):
+    """Returns the constructor arguments as a dictionary."""
+    config = {}
+    config["initial_learning_rate"] = self._initial_learning_rate
+    config["power"] = self._power
+    config["name"] = self._name
+    return config
+
+
+class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Learning rate schedule with multiplied by linear decay at the end.
+
+  The schedule has the following behavior.
+  Let offset_step = step - offset.
+  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
+  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
+  actual learning rate equals lr * offset_step^power.
+  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
+  total_decay_steps, the actual learning rate equals lr * offset_step^power *
+  (total_decay_steps - offset_step) / (total_decay_steps *
+  linear_decay_fraction).
+  4) offset_step >= total_decay_steps, the actual learning rate equals zero.
+
+  When total_decay_steps * linear_decay_fraction is not positive, the linear
+  factor (and therefore cases 3 and 4) is skipped and only the power term is
+  applied.
+  """
+
+  def __init__(self,
+               initial_learning_rate: float,
+               total_decay_steps: int,
+               power: float = 1.0,
+               linear_decay_fraction: float = 0.1,
+               offset: int = 0,
+               name: str = "PowerAndLinearDecay"):
+    """Initialize configuration of the learning rate schedule.
+
+    Args:
+      initial_learning_rate: The initial learning rate.
+      total_decay_steps: The total number of steps for power + linear decay.
+      power: The order of the polynomial.
+      linear_decay_fraction: In the last `linear_decay_fraction` steps, the
+        learning rate will be multiplied by a linear decay.
+      offset: The offset applied to steps.
+      name: Optional, name of learning rate schedule.
+    """
+    super().__init__()
+    self._initial_learning_rate = initial_learning_rate
+    self._total_decay_steps = total_decay_steps
+    self._power = power
+    self._linear_decay_fraction = linear_decay_fraction
+    self._offset = offset
+    self._name = name
+
+  def __call__(self, step):
+    """Returns the learning rate for `step` (shifted by the stored offset)."""
+    with tf.name_scope(self._name or "PowerAndLinearDecay"):
+      step = tf.cast(step - self._offset, tf.float32)
+      learning_rate = self._initial_learning_rate
+      # A zero `step` may cause Inf. So make `step` positive.
+      step_non_zero = tf.math.maximum(step, 1.0)
+      learning_rate *= tf.math.pow(step_non_zero, self._power)
+      if self._total_decay_steps * self._linear_decay_fraction > 0:
+        # The linear factor is capped at 1 before the final window and goes
+        # negative past `total_decay_steps`; the maximum below clamps the
+        # result at zero.
+        learning_rate *= tf.minimum(
+            1.0, (self._total_decay_steps - step) /
+            (self._total_decay_steps * self._linear_decay_fraction))
+        learning_rate = tf.maximum(0.0, learning_rate)
+      return learning_rate
+
+  def get_config(self):
+    """Get the configuration of the learning rate schedule."""
+    return {
+        "initial_learning_rate": self._initial_learning_rate,
+        "total_decay_steps": self._total_decay_steps,
+        "power": self._power,
+        "linear_decay_fraction": self._linear_decay_fraction,
+        "offset": self._offset,
+        "name": self._name,
+    }
+
+
+class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Power-law learning rate schedule that starts after an offset.
+
+  While `step` has not passed `offset` the learning rate is
+  `pre_offset_learning_rate`. Afterwards it is
+  `initial_learning_rate * (step - offset)^power`, capped at
+  `pre_offset_learning_rate`.
+  """
+
+  def __init__(self,
+               initial_learning_rate: float,
+               power: float = 1.0,
+               offset: int = 0,
+               pre_offset_learning_rate: float = 1.0e6,
+               name: str = "PowerDecayWithOffset"):
+    """Stores the parameters of the schedule.
+
+    Args:
+      initial_learning_rate: The learning rate multiplied by the power term.
+      power: The exponent applied to the shifted step.
+      offset: The number of steps subtracted before applying the power.
+      pre_offset_learning_rate: The learning rate used up to the offset; it
+        is also the upper bound of the returned value.
+      name: Optional, name used for the TensorFlow name scope.
+    """
+    super().__init__()
+    self._name = name
+    self._pre_offset_lr = pre_offset_learning_rate
+    self._offset = offset
+    self._power = power
+    self._initial_learning_rate = initial_learning_rate
+
+  def __call__(self, step):
+    """Returns the learning rate for `step`."""
+    scope_name = self._name or "PowerDecayWithOffset"
+    with tf.name_scope(scope_name):
+      step_float = tf.cast(step, tf.float32)
+
+      # Clamp the shifted step to at least 1 before raising it to `power`.
+      shifted_step = tf.math.maximum(step_float - self._offset, 1.0)
+      power_lr = (
+          tf.math.pow(shifted_step, self._power) * self._initial_learning_rate)
+
+      # 1.0 once the step is strictly past the offset, 0.0 otherwise.
+      past_offset = tf.cast(step_float > self._offset, tf.float32)
+      before_term = (1.0 - past_offset) * self._pre_offset_lr
+      after_term = past_offset * power_lr
+      blended_lr = before_term + after_term
+
+      # The power term may be arbitrarily large, so cap it at pre_offset_lr.
+      return tf.math.minimum(blended_lr, self._pre_offset_lr)
+
+  def get_config(self):
+    """Returns the constructor arguments as a dictionary."""
+    config = {}
+    config["initial_learning_rate"] = self._initial_learning_rate
+    config["power"] = self._power
+    config["offset"] = self._offset
+    config["pre_offset_learning_rate"] = self._pre_offset_lr
+    config["name"] = self._name
+    return config
+
+
+class StepConsineDecayWithOffset(
+    tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Stepwise cosine learning rate decay with offset.
+
+  Learning rate is equivalent to one or more cosine decay(s) starting and
+  ending at each interval.
+
+  Example:
+
+    ```python
+    boundaries: [100000, 110000]
+    values: [1.0, 0.5]
+    lr_decayed_fn = (
+    lr_schedule.StepConsineDecayWithOffset(
+        boundaries,
+        values))
+    ```
+
+    from 0 to 100000 step, it will cosine decay from 1.0 to 0.5
+    from 100000 to 110000 step, it cosine decay from 0.5 to 0.0
+
+  NOTE(review): the implementation takes the length of level `i` to be
+  `boundaries[i + 1] - boundaries[i]` and gives the last level a length of 0,
+  which does not obviously match the example above. Confirm the intended
+  meaning of `boundaries` (presumably the start step of each level).
+  """
+
+  def __init__(self,
+               boundaries,
+               values,
+               offset: int = 0,
+               name: str = "StepConsineDecayWithOffset"):
+    """Initialize configuration of the learning rate schedule.
+
+    Args:
+      boundaries: A list of `Tensor`s or `int`s with strictly
+        increasing entries, and with all elements having the same type as the
+        optimizer step.
+      values: A list of `Tensor`s or `float`s that specifies the
+        values for the intervals defined by `boundaries`. It must have the
+        same number of elements as `boundaries`, and all elements should have
+        the same type.
+      offset: The offset subtracted from the step before computing the cosine
+        decay.
+      name: Optional, name of learning rate schedule.
+
+    Raises:
+      ValueError: If `values` is empty or its length differs from the length
+        of `boundaries`.
+    """
+    super().__init__()
+    self.values = values
+    self.boundaries = boundaries
+    self.offset = offset
+    self.name = name
+
+    if len(self.values) < 1:
+      raise ValueError(f"Expect non empty {self.values}")
+    if len(self.boundaries) != len(self.values):
+      raise ValueError(
+          "Boundaries length must be equal to learning rate levels length: "
+          f"{len(self.boundaries)} != {len(self.values)}")
+
+    # Length (in steps) of each level; the last level gets a length of 0.
+    self.total_steps = (
+        [boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)
+        ] + [0])
+
+  def __call__(self, global_step):
+    """Returns the learning rate for `global_step` (shifted by the offset)."""
+    with tf.name_scope(self.name or "StepConsineDecayWithOffset"):
+      global_step = tf.cast(global_step - self.offset, tf.float32)
+      lr_levels = self.values
+      lr_steps = self.boundaries
+      level_total_steps = self.total_steps
+      num_levels = len(lr_levels)
+
+      init_lr = lr_levels[0]
+      next_init_lr = lr_levels[1] if num_levels > 1 else 0.
+
+      init_total_steps = level_total_steps[0]
+
+      # First level: half-cosine from `init_lr` down to `next_init_lr`,
+      # measured from shifted step 0.
+      cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
+          tf.constant(math.pi) * (global_step) /
+          (init_total_steps)) + 1.0) / 2.0 + next_init_lr)
+      learning_rate = cosine_learning_rate
+      tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
+                                cosine_learning_rate)
+      tf.compat.v1.logging.info("DEBUG lr %r next lr %r inittotalstep %r",
+                                init_lr, next_init_lr, init_total_steps)
+
+      # Each later level overrides the running result once the shifted step
+      # reaches that level's boundary.
+      for i in range(1, num_levels):
+        next_init_lr = lr_levels[i]
+        next_start_step = lr_steps[i]
+        next_total_steps = level_total_steps[i]
+        next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.
+
+        tf.compat.v1.logging.info(
+            "DEBUG step %r nilr %r nss %r nts %r nnilr %r", global_step,
+            next_init_lr, next_start_step, next_total_steps, next_next_init_lr)
+        next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
+                                     (tf.cos(
+                                         tf.constant(math.pi) *
+                                         (global_step - next_start_step) /
+                                         (next_total_steps)) + 1.0) / 2.0 +
+                                     next_next_init_lr)
+        learning_rate = tf.where(global_step >= next_start_step,
+                                 next_cosine_learning_rate, learning_rate)
+        tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
+                                  next_cosine_learning_rate)
+
+    return learning_rate
+
+  def get_config(self):
+    """Get the configuration of the learning rate schedule."""
+    return {
+        "boundaries": self.boundaries,
+        "values": self.values,
+        "offset": self.offset,
+        "name": self.name
+    }
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/lr_schedule_test.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/lr_schedule_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for lr_schedule."""
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.modeling.optimization import lr_schedule
+
+
+class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):
+  """Checks `PowerAndLinearDecay` values at selected steps."""
+
+  # Each case lists `[step, expected_learning_rate]` pairs.
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='power_only',
+          init_lr=1.0,
+          power=-1.0,
+          linear_decay_fraction=0.0,
+          total_decay_steps=100,
+          offset=0,
+          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], [60, 1. / 60],
+                    [100, 1. / 100]]),
+      dict(
+          testcase_name='linear_only',
+          init_lr=1.0,
+          power=0.0,
+          linear_decay_fraction=1.0,
+          total_decay_steps=100,
+          offset=0,
+          expected=[[0, 1.0], [1, 0.99], [40, 0.6], [60, 0.4], [100, 0.0]]),
+      dict(
+          testcase_name='general',
+          init_lr=1.0,
+          power=-1.0,
+          linear_decay_fraction=0.5,
+          total_decay_steps=100,
+          offset=0,
+          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.],
+                    [60, 1. / 60. * 0.8], [100, 0.0]]),
+      dict(
+          testcase_name='offset',
+          init_lr=1.0,
+          power=-1.0,
+          linear_decay_fraction=0.5,
+          total_decay_steps=100,
+          offset=90,
+          expected=[[0, 1.0], [90, 1.0], [91, 1.0], [130, 1. / 40.],
+                    [150, 1. / 60. * 0.8], [190, 0.0], [200, 0.0]]),
+  )
+  def test_power_linear_lr_schedule(self, init_lr, power, linear_decay_fraction,
+                                    total_decay_steps, offset, expected):
+    """Builds the schedule and compares it with every expected pair."""
+    lr = lr_schedule.PowerAndLinearDecay(
+        initial_learning_rate=init_lr,
+        power=power,
+        linear_decay_fraction=linear_decay_fraction,
+        total_decay_steps=total_decay_steps,
+        offset=offset)
+    for step, value in expected:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+
+class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase):
+  """Checks the generated `*WithOffset` learning rate schedule classes."""
+
+  @parameterized.parameters(
+      dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset),
+      dict(class_name=lr_schedule.PolynomialDecayWithOffset),
+      dict(class_name=lr_schedule.ExponentialDecayWithOffset),
+      dict(class_name=lr_schedule.CosineDecayWithOffset),
+  )
+  def test_generated_docstring(self, class_name):
+    """The generated constructor must carry a non-empty docstring."""
+    self.assertNotEmpty(class_name.__init__.__doc__)
+
+  @parameterized.parameters(
+      dict(
+          class_name=lr_schedule.PiecewiseConstantDecayWithOffset,
+          kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])),
+      dict(
+          class_name=lr_schedule.PolynomialDecayWithOffset,
+          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
+      dict(
+          class_name=lr_schedule.ExponentialDecayWithOffset,
+          kwarg=dict(
+              initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)),
+      dict(
+          class_name=lr_schedule.CosineDecayWithOffset,
+          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
+  )
+  def test_offset(self, class_name, kwarg):
+    """The offset schedule at `step` equals the base one at `step - offset`."""
+    offset = 10
+    offset_lr = class_name(offset=offset, **kwarg)
+    # `base_lr_class` is the wrapped schedule class exposed on the wrapper.
+    base_lr = class_name.base_lr_class(**kwarg)
+    self.assertIsInstance(offset_lr, class_name)
+    for step in range(10, 101, 10):
+      self.assertEqual(offset_lr(step), base_lr(step - offset))
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/optimizer_factory.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/optimizer_factory.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Optimizer factory class."""
+from typing import Callable, Optional, Union, List, Tuple
+
+import gin
+import tensorflow as tf
+import tensorflow_addons.optimizers as tfa_optimizers
+
+from official.modeling.optimization import slide_optimizer
+from official.modeling.optimization import adafactor_optimizer
+from official.modeling.optimization import ema_optimizer
+from official.modeling.optimization import lars_optimizer
+from official.modeling.optimization import lr_schedule
+from official.modeling.optimization.configs import optimization_config as opt_cfg
+from official.nlp import optimization as nlp_optimization
+
+# Maps the optimizer `type` string from the config to the optimizer class.
+# `register_optimizer_cls` adds further entries at runtime.
+OPTIMIZERS_CLS = {
+    'sgd': tf.keras.optimizers.SGD,
+    'adam': tf.keras.optimizers.Adam,
+    'adamw': nlp_optimization.AdamWeightDecay,
+    'lamb': tfa_optimizers.LAMB,
+    'rmsprop': tf.keras.optimizers.RMSprop,
+    'lars': lars_optimizer.LARS,
+    'adagrad': tf.keras.optimizers.Adagrad,
+    'slide': slide_optimizer.SLIDE,
+    'adafactor': adafactor_optimizer.Adafactor,
+}
+
+# Maps the learning rate `type` string to the schedule class. The 'constant'
+# type is not listed here; `OptimizerFactory.build_learning_rate` handles it
+# directly.
+LR_CLS = {
+    'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
+    'polynomial': lr_schedule.PolynomialDecayWithOffset,
+    'exponential': lr_schedule.ExponentialDecayWithOffset,
+    'cosine': lr_schedule.CosineDecayWithOffset,
+    'power': lr_schedule.DirectPowerDecay,
+    'power_linear': lr_schedule.PowerAndLinearDecay,
+    'power_with_offset': lr_schedule.PowerDecayWithOffset,
+    'step_cosine_with_offset': lr_schedule.StepConsineDecayWithOffset,
+}
+
+# Maps the warmup `type` string to the warmup schedule class.
+WARMUP_CLS = {
+    'linear': lr_schedule.LinearWarmup,
+    'polynomial': lr_schedule.PolynomialWarmUp
+}
+
+
+def register_optimizer_cls(
+    key: str, optimizer_config_cls: tf.keras.optimizers.Optimizer):
+  """Register a custom optimizer class.
+
+  The user will still need to subclass data classes in
+  configs.optimization_config to be used with OptimizerFactory.
+
+  Args:
+    key: The string under which the optimizer_config_cls is registered.
+    optimizer_config_cls: A class which inherits tf.keras.optimizers.Optimizer.
+
+  Raises:
+    ValueError: If `key` is already present in OPTIMIZERS_CLS.
+  """
+  if key in OPTIMIZERS_CLS:
+    # Name the registry exactly as it is defined so the message can be grepped.
+    raise ValueError('%s already registered in OPTIMIZERS_CLS.' % key)
+  OPTIMIZERS_CLS[key] = optimizer_config_cls
+
+
+class OptimizerFactory:
+  """Optimizer factory class.
+
+  This class builds learning rate and optimizer based on an optimization config.
+  To use this class, you need to do the following:
+  (1) Define optimization config, this includes optimizer, and learning rate
+      schedule.
+  (2) Initialize the class using the optimization config.
+  (3) Build learning rate.
+  (4) Build optimizer.
+
+  This is a typical example for using this class:
+  params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {'momentum': 0.9}
+        },
+        'learning_rate': {
+            'type': 'stepwise',
+            'stepwise': {'boundaries': [10000, 20000],
+                         'values': [0.1, 0.01, 0.001]}
+        },
+        'warmup': {
+            'type': 'linear',
+            'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01}
+        }
+    }
+  opt_config = OptimizationConfig(params)
+  opt_factory = OptimizerFactory(opt_config)
+  lr = opt_factory.build_learning_rate()
+  optimizer = opt_factory.build_optimizer(lr)
+  """
+
+  def __init__(self, config: opt_cfg.OptimizationConfig):
+    """Initializing OptimizerFactory.
+
+    Args:
+      config: OptimizationConfig instance contain optimization config.
+
+    Raises:
+      ValueError: If the optimizer config or the learning rate type is missing.
+    """
+    self._config = config
+    self._optimizer_config = config.optimizer.get()
+    self._optimizer_type = config.optimizer.type
+
+    self._use_ema = config.ema is not None
+    self._ema_config = config.ema
+
+    if self._optimizer_config is None:
+      raise ValueError('Optimizer type must be specified')
+
+    self._lr_config = config.learning_rate.get()
+    self._lr_type = config.learning_rate.type
+
+    if self._lr_type is None:
+      raise ValueError('Learning rate type must be specified')
+
+    self._warmup_config = config.warmup.get()
+    self._warmup_type = config.warmup.type
+
+  def build_learning_rate(self):
+    """Build learning rate.
+
+    Builds learning rate from config. Learning rate schedule is built according
+    to the learning rate config. If learning rate type is constant,
+    lr_config.learning_rate is returned. When a warmup config is set, the
+    result is wrapped in the configured warmup schedule.
+
+    Returns:
+      tf.keras.optimizers.schedules.LearningRateSchedule instance. If
+      learning rate type is constant and no warmup is configured,
+      lr_config.learning_rate is returned.
+    """
+    if self._lr_type == 'constant':
+      lr = self._lr_config.learning_rate
+    else:
+      lr = LR_CLS[self._lr_type](**self._lr_config.as_dict())
+
+    if self._warmup_config:
+      lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict())
+
+    return lr
+
+  @gin.configurable
+  def build_optimizer(
+      self,
+      lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float],
+      gradient_transformers: Optional[List[Callable[
+          [List[Tuple[tf.Tensor, tf.Tensor]]], List[Tuple[tf.Tensor, tf.Tensor]]
+      ]]] = None,
+      postprocessor: Optional[Callable[[tf.keras.optimizers.Optimizer],
+                                       tf.keras.optimizers.Optimizer]] = None):
+    """Build optimizer.
+
+    Builds optimizer from config. It takes learning rate as input, and builds
+    the optimizer according to the optimizer config. Typically, the learning
+    rate built using self.build_learning_rate() is passed as an argument to
+    this method.
+
+    Args:
+      lr: A floating point value, or a
+        tf.keras.optimizers.schedules.LearningRateSchedule instance.
+      gradient_transformers: Optional list of functions to use to transform
+        gradients before applying updates to Variables. The functions are
+        applied after gradient_aggregator. The functions should accept and
+        return a list of (gradient, variable) tuples. clipvalue, clipnorm,
+        global_clipnorm should not be set when gradient_transformers is passed.
+      postprocessor: An optional function for postprocessing the optimizer. It
+        takes an optimizer and returns an optimizer.
+
+    Returns:
+      tf.keras.optimizers.Optimizer instance.
+    """
+
+    optimizer_dict = self._optimizer_config.as_dict()
+    # Drop clipping options that are unset. A config that does not define one
+    # of these keys at all is tolerated instead of raising KeyError.
+    for clip_key in ('clipnorm', 'clipvalue', 'global_clipnorm'):
+      if optimizer_dict.get(clip_key) is None:
+        optimizer_dict.pop(clip_key, None)
+
+    optimizer_dict['learning_rate'] = lr
+    if gradient_transformers is not None:
+      optimizer_dict['gradient_transformers'] = gradient_transformers
+
+    optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
+
+    # Optionally wrap the optimizer in an exponential moving average.
+    if self._use_ema:
+      optimizer = ema_optimizer.ExponentialMovingAverage(
+          optimizer, **self._ema_config.as_dict())
+    if postprocessor:
+      optimizer = postprocessor(optimizer)
+    assert isinstance(optimizer, tf.keras.optimizers.Optimizer), (
+        'OptimizerFactory.build_optimizer returning a non-optimizer object: '
+        '{}'.format(optimizer))
+
+    return optimizer
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/optimizer_factory_test.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/optimizer_factory_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for optimizer_factory.py."""
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.modeling.optimization import optimizer_factory
+from official.modeling.optimization.configs import optimization_config
+
+
+class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'), ('lamb'),
+                            ('lars'), ('adagrad'))
+  def test_optimizers(self, optimizer_type):
+    params = {
+        'optimizer': {
+            'type': optimizer_type
+        },
+        'learning_rate': {
+            'type': 'constant',
+            'constant': {
+                'learning_rate': 0.1
+            }
+        }
+    }
+    optimizer_cls = optimizer_factory.OPTIMIZERS_CLS[optimizer_type]
+    expected_optimizer_config = optimizer_cls().get_config()
+    expected_optimizer_config['learning_rate'] = 0.1
+
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+    optimizer = opt_factory.build_optimizer(lr, postprocessor=lambda x: x)
+
+    self.assertIsInstance(optimizer, optimizer_cls)
+    self.assertEqual(expected_optimizer_config, optimizer.get_config())
+
+  @parameterized.parameters((None, None), (1.0, None), (None, 1.0))
+  def test_gradient_clipping(self, clipnorm, clipvalue):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'clipnorm': clipnorm,
+                'clipvalue': clipvalue
+            }
+        },
+        'learning_rate': {
+            'type': 'constant',
+            'constant': {
+                'learning_rate': 1.0
+            }
+        }
+    }
+
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+    optimizer = opt_factory.build_optimizer(lr)
+
+    var0 = tf.Variable([1.0, 2.0])
+    var1 = tf.Variable([3.0, 4.0])
+
+    grads0 = tf.constant([0.1, 0.1])
+    grads1 = tf.constant([2.0, 3.0])
+
+    grads_and_vars = list(zip([grads0, grads1], [var0, var1]))
+    optimizer.apply_gradients(grads_and_vars)
+
+    self.assertAllClose(np.array([0.9, 1.9]), var0.numpy())
+    if clipvalue is not None:
+      self.assertAllClose(np.array([2.0, 3.0]), var1.numpy())
+    elif clipnorm is not None:
+      self.assertAllClose(np.array([2.4452999, 3.1679497]), var1.numpy())
+    else:
+      self.assertAllClose(np.array([1.0, 1.0]), var1.numpy())
+
+  def test_missing_types(self):
+    params = {'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}}}
+    with self.assertRaises(ValueError):
+      optimizer_factory.OptimizerFactory(
+          optimization_config.OptimizationConfig(params))
+    params = {
+        'learning_rate': {
+            'type': 'stepwise',
+            'stepwise': {
+                'boundaries': [10000, 20000],
+                'values': [0.1, 0.01, 0.001]
+            }
+        }
+    }
+    with self.assertRaises(ValueError):
+      optimizer_factory.OptimizerFactory(
+          optimization_config.OptimizationConfig(params))
+
+
+# TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`.
+
+  def test_stepwise_lr_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'stepwise',
+            'stepwise': {
+                'boundaries': [10000, 20000],
+                'values': [0.1, 0.01, 0.001]
+            }
+        }
+    }
+    expected_lr_step_values = [[0, 0.1], [5000, 0.1], [10000, 0.1],
+                               [10001, 0.01], [20000, 0.01], [20001, 0.001]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_stepwise_lr_with_warmup_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'stepwise',
+            'stepwise': {
+                'boundaries': [10000, 20000],
+                'values': [0.1, 0.01, 0.001]
+            }
+        },
+        'warmup': {
+            'type': 'linear',
+            'linear': {
+                'warmup_steps': 500,
+                'warmup_learning_rate': 0.01
+            }
+        }
+    }
+    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1], [5500, 0.1],
+                               [10000, 0.1], [10001, 0.01], [20000, 0.01],
+                               [20001, 0.001]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_exponential_lr_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'exponential',
+            'exponential': {
+                'initial_learning_rate': 0.1,
+                'decay_steps': 1000,
+                'decay_rate': 0.96,
+                'staircase': True
+            }
+        }
+    }
+    expected_lr_step_values = [
+        [0, 0.1],
+        [999, 0.1],
+        [1000, 0.096],
+        [1999, 0.096],
+        [2000, 0.09216],
+    ]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_polynomial_lr_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'polynomial',
+            'polynomial': {
+                'initial_learning_rate': 0.1,
+                'decay_steps': 1000,
+                'end_learning_rate': 0.001
+            }
+        }
+    }
+
+    expected_lr_step_values = [[0, 0.1], [500, 0.0505], [1000, 0.001]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_cosine_lr_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'cosine',
+            'cosine': {
+                'initial_learning_rate': 0.1,
+                'decay_steps': 1000
+            }
+        }
+    }
+    expected_lr_step_values = [[0, 0.1], [250, 0.08535534], [500, 0.04999999],
+                               [750, 0.01464466], [1000, 0]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_constant_lr_with_warmup_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'constant',
+            'constant': {
+                'learning_rate': 0.1
+            }
+        },
+        'warmup': {
+            'type': 'linear',
+            'linear': {
+                'warmup_steps': 500,
+                'warmup_learning_rate': 0.01
+            }
+        }
+    }
+
+    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1], [5000, 0.1],
+                               [10000, 0.1], [20000, 0.1]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_stepwise_lr_with_polynomial_warmup_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'stepwise',
+            'stepwise': {
+                'boundaries': [10000, 20000],
+                'values': [0.1, 0.01, 0.001]
+            }
+        },
+        'warmup': {
+            'type': 'polynomial',
+            'polynomial': {
+                'warmup_steps': 500,
+                'power': 2.
+            }
+        }
+    }
+    expected_lr_step_values = [[0, 0.0], [250, 0.025], [500, 0.1], [5500, 0.1],
+                               [10000, 0.1], [10001, 0.01], [20000, 0.01],
+                               [20001, 0.001]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value, places=6)
+
+  def test_power_lr_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'power',
+            'power': {
+                'initial_learning_rate': 1.0,
+                'power': -1.0
+            }
+        }
+    }
+    expected_lr_step_values = [[0, 1.0], [1, 1.0], [250, 1. / 250.]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_power_linear_lr_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'power_linear',
+            'power_linear': {
+                'initial_learning_rate': 1.0,
+                'power': -1.0,
+                'linear_decay_fraction': 0.5,
+                'total_decay_steps': 100,
+                'offset': 0,
+            }
+        }
+    }
+    expected_lr_step_values = [[0, 1.0], [1, 1.0], [40, 1. / 40.],
+                               [60, 1. / 60. * 0.8]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_power_with_offset_lr_schedule(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'power_with_offset',
+            'power_with_offset': {
+                'initial_learning_rate': 1.0,
+                'power': -1.0,
+                'offset': 10,
+                'pre_offset_learning_rate': 3.0,
+            }
+        }
+    }
+    expected_lr_step_values = [[1, 3.0], [10, 3.0], [20, 1. / 10.]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+  def test_step_cosine_lr_schedule_with_warmup(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {
+                'momentum': 0.9
+            }
+        },
+        'learning_rate': {
+            'type': 'step_cosine_with_offset',
+            'step_cosine_with_offset': {
+                'values': (0.0001, 0.00005),
+                'boundaries': (0, 500000),
+                'offset': 10000,
+            }
+        },
+        'warmup': {
+            'type': 'linear',
+            'linear': {
+                'warmup_steps': 10000,
+                'warmup_learning_rate': 0.0
+            }
+        }
+    }
+    expected_lr_step_values = [[0, 0.0], [5000, 1e-4/2.0], [10000, 1e-4],
+                               [20000, 9.994863e-05], [499999, 5e-05]]
+    opt_config = optimization_config.OptimizationConfig(params)
+    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+    lr = opt_factory.build_learning_rate()
+
+    for step, value in expected_lr_step_values:
+      self.assertAlmostEqual(lr(step).numpy(), value)
+
+
+class OptimizerFactoryRegistryTest(tf.test.TestCase):
+
+  def test_registry(self):
+
+    class MyClass():
+      pass
+    optimizer_factory.register_optimizer_cls('test', MyClass)
+    self.assertIn('test', optimizer_factory.OPTIMIZERS_CLS)
+    with self.assertRaisesRegex(ValueError, 'test already registered.*'):
+      optimizer_factory.register_optimizer_cls('test', MyClass)
+
+# Run the test cases in this file when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/slide_optimizer.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/optimization/slide_optimizer.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SLIDE optimizer.
+
+A new optimizer that will be open sourced soon.
+"""
+
+# Placeholder value: per the module docstring, the optimizer itself has not
+# been open sourced yet.
+SLIDE = "Unimplemented"
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/performance.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/performance.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions and classes related to training performance."""
+
+import tensorflow as tf
+
+
+def configure_optimizer(optimizer,
+                        use_float16=False,
+                        use_graph_rewrite=False,
+                        loss_scale=None):
+  """Configures optimizer object with performance options."""
+  if use_float16:
+    if loss_scale in (None, 'dynamic'):
+      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
+    else:
+      # loss_scale is a number. We interpret that as a fixed loss scale.
+      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
+          optimizer, dynamic=False, initial_scale=loss_scale)
+  if use_graph_rewrite:
+    # Note: the model dtype must be 'float32', which will ensure
+    # tf.keras.mixed_precision and enable_mixed_precision_graph_rewrite do not
+    # double up.
+    optimizer = (
+        tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
+            optimizer))
+  return optimizer
+
+
+def set_mixed_precision_policy(dtype, loss_scale=None):
+  """Sets the global `tf.keras.mixed_precision.Policy`."""
+  # TODO(b/191894773): Remove loss_scale argument
+  assert loss_scale is None, (
+      'The loss_scale argument must be None. The argument exists for '
+      'historical reasons and will be removed soon.')
+  if dtype == tf.float16:
+    tf.keras.mixed_precision.set_global_policy('mixed_float16')
+  elif dtype == tf.bfloat16:
+    tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
+  elif dtype == tf.float32:
+    tf.keras.mixed_precision.set_global_policy('float32')
+  else:
+    raise ValueError('Unexpected dtype: %s' % dtype)
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/tf_utils.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/modeling/tf_utils.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Common TF utilities."""
+
+import six
+import tensorflow as tf
+
+from tensorflow.python.util import deprecation
+from official.modeling import activations
+
+
+@deprecation.deprecated(
+    None,
+    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
+    "input tensors. pack/unpack inputs to override __call__ is no longer "
+    "needed.")
+def pack_inputs(inputs):
+  """Pack a list of `inputs` tensors to a tuple.
+
+  Args:
+    inputs: a list of tensors.
+
+  Returns:
+    a tuple of tensors. if any input is None, replace it with a special constant
+    tensor.
+  """
+  inputs = tf.nest.flatten(inputs)
+  outputs = []
+  for x in inputs:
+    if x is None:
+      outputs.append(tf.constant(0, shape=[], dtype=tf.int32))
+    else:
+      outputs.append(x)
+  return tuple(outputs)
+
+
+@deprecation.deprecated(
+    None,
+    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
+    "input tensors. pack/unpack inputs to override __call__ is no longer "
+    "needed.")
+def unpack_inputs(inputs):
+  """unpack a tuple of `inputs` tensors to a tuple.
+
+  Args:
+    inputs: a list of tensors.
+
+  Returns:
+    a tuple of tensors. if any input is a special constant tensor, replace it
+    with None.
+  """
+  inputs = tf.nest.flatten(inputs)
+  outputs = []
+  for x in inputs:
+    if is_special_none_tensor(x):
+      outputs.append(None)
+    else:
+      outputs.append(x)
+  x = tuple(outputs)
+
+  # To trick the very pointless 'unbalanced-tuple-unpacking' pylint check
+  # from triggering.
+  if len(x) == 1:
+    return x[0]
+  return tuple(outputs)
+
+
+def is_special_none_tensor(tensor):
+  """Checks if a tensor is a special None Tensor."""
+  return tensor.shape.ndims == 0 and tensor.dtype == tf.int32
+
+
+def get_activation(identifier, use_keras_layer=False):
+  """Maps a identifier to a Python function, e.g., "relu" => `tf.nn.relu`.
+
+  It checks string first and if it is one of customized activation not in TF,
+  the corresponding activation will be returned. For non-customized activation
+  names and callable identifiers, always fallback to tf.keras.activations.get.
+
+  Prefers using keras layers when use_keras_layer=True. Now it only supports
+  'relu', 'linear', 'identity', 'swish'.
+
+  Args:
+    identifier: String name of the activation function or callable.
+    use_keras_layer: If True, use keras layer if identifier is allow-listed.
+
+  Returns:
+    A Python function corresponding to the activation function or a keras
+    activation layer when use_keras_layer=True.
+  """
+  if isinstance(identifier, six.string_types):
+    identifier = str(identifier).lower()
+    if use_keras_layer:
+      keras_layer_allowlist = {
+          "relu": "relu",
+          "linear": "linear",
+          "identity": "linear",
+          "swish": "swish",
+          "sigmoid": "sigmoid",
+          "relu6": tf.nn.relu6,
+      }
+      if identifier in keras_layer_allowlist:
+        return tf.keras.layers.Activation(keras_layer_allowlist[identifier])
+    name_to_fn = {
+        "gelu": activations.gelu,
+        "simple_swish": activations.simple_swish,
+        "hard_swish": activations.hard_swish,
+        "relu6": activations.relu6,
+        "hard_sigmoid": activations.hard_sigmoid,
+        "identity": activations.identity,
+    }
+    if identifier in name_to_fn:
+      return tf.keras.activations.get(name_to_fn[identifier])
+  return tf.keras.activations.get(identifier)
+
+
+def get_shape_list(tensor, expected_rank=None, name=None):
+  """Returns a list of the shape of tensor, preferring static dimensions.
+
+  Args:
+    tensor: A tf.Tensor object to find the shape of.
+    expected_rank: (optional) int. The expected rank of `tensor`. If this is
+      specified and the `tensor` has a different rank, and exception will be
+      thrown.
+    name: Optional name of the tensor for the error message.
+
+  Returns:
+    A list of dimensions of the shape of tensor. All static dimensions will
+    be returned as python integers, and dynamic dimensions will be returned
+    as tf.Tensor scalars.
+  """
+  if expected_rank is not None:
+    assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def assert_rank(tensor, expected_rank, name=None):
+  """Raises an exception if the tensor rank is not of the expected rank.
+
+  Args:
+    tensor: A tf.Tensor to check the rank of.
+    expected_rank: Python integer or list of integers, expected rank.
+    name: Optional name of the tensor for the error message.
+
+  Raises:
+    ValueError: If the expected shape doesn't match the actual shape.
+  """
+  expected_rank_dict = {}
+  if isinstance(expected_rank, six.integer_types):
+    expected_rank_dict[expected_rank] = True
+  else:
+    for x in expected_rank:
+      expected_rank_dict[x] = True
+
+  actual_rank = tensor.shape.ndims
+  if actual_rank not in expected_rank_dict:
+    raise ValueError(
+        "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not "
+        "equal to the expected tensor rank `%s`" %
+        (name, actual_rank, str(tensor.shape), str(expected_rank)))
+
+
+def safe_mean(losses):
+  """Computes a safe mean of the losses.
+
+  Args:
+    losses: `Tensor` whose elements contain individual loss measurements.
+
+  Returns:
+    A scalar representing the mean of `losses`. If `num_present` is zero,
+      then zero is returned.
+  """
+  total = tf.reduce_sum(losses)
+  num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
+  return tf.math.divide_no_nan(total, num_elements)
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/README.md
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/README.md
+# TensorFlow NLP Modelling Toolkit
+
+This codebase provides a Natural Language Processing modeling toolkit written in
+[TF2](https://www.tensorflow.org/guide/effective_tf2). It allows researchers and
+developers to reproduce state-of-the-art model results and train custom models
+to experiment with new research ideas.
+
+## Features
+
+* Reusable and modularized modeling building blocks
+* State-of-the-art reproducible
+* Easy to customize and extend
+* End-to-end training
+* Distributed trainable on both GPUs and TPUs
+
+## Major components
+
+### Libraries
+
+We provide a modeling library to allow users to train custom models for new
+research ideas. Detailed instructions can be found in the READMEs in each folder.
+
+*   [modeling/](modeling): modeling library that provides building blocks
+    (e.g., Layers, Networks, and Models) that can be assembled into
+    transformer-based architectures.
+*   [data/](data): binaries and utils for input preprocessing, tokenization,
+    etc.
+
+### State-of-the-Art models and examples
+
+We provide SoTA model implementations, pre-trained models, training and
+evaluation examples, and command lines. Detailed instructions can be found in the
+READMEs for specific papers.
+
+1.  [BERT](bert): [BERT: Pre-training of Deep Bidirectional Transformers for
+    Language Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al.,
+    2018
+2.  [ALBERT](albert):
+    [A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942)
+    by Lan et al., 2019
+3.  [XLNet](xlnet):
+    [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237)
+    by Yang et al., 2019
+4.  [Transformer for translation](transformer):
+    [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Vaswani et
+    al., 2017
+
+### Common Training Driver
+
+We provide a single common driver [train.py](train.py) to train the above SoTA
+models on popular tasks. Please see [docs/train.md](docs/train.md) for
+more details.
+
+
+### Pre-trained models with checkpoints and TF-Hub
+
+We provide a large collection of baselines and checkpoints for NLP pre-trained
+models. Please see [docs/pretrained_models.md](docs/pretrained_models.md) for
+more details.
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/__init__.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/README.md
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/README.md
+# ALBERT (ALBERT: A Lite BERT for Self-supervised Learning of Language Representations)
+
+**WARNING**: We are on the way to deprecate this directory.
+We will add documentation in `nlp/docs` to use the new code in `nlp/modeling`.
+
+The academic paper which describes ALBERT in detail and provides full results on
+a number of tasks can be found here: https://arxiv.org/abs/1909.11942.
+
+This repository contains TensorFlow 2.x implementation for ALBERT.
+
+## Contents
+  * [Contents](#contents)
+  * [Pre-trained Models](#pre-trained-models)
+    * [Restoring from Checkpoints](#restoring-from-checkpoints)
+  * [Set Up](#set-up)
+  * [Process Datasets](#process-datasets)
+  * [Fine-tuning with BERT](#fine-tuning-with-bert)
+    * [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
+    * [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
+    * [SQuAD 1.1](#squad-1.1)
+
+
+## Pre-trained Models
+
+We released both checkpoints and tf.hub modules as the pretrained models for
+fine-tuning. They are TF 2.x compatible and are converted from the ALBERT v2
+checkpoints released in TF 1.x official ALBERT repository
+[google-research/albert](https://github.com/google-research/albert)
+in order to keep consistent with ALBERT paper.
+
+Our current released checkpoints are exactly the same as TF 1.x official ALBERT
+repository.
+
+### Access to Pretrained Checkpoints
+
+Pretrained checkpoints can be found in the following links:
+
+**Note: We implemented ALBERT using Keras functional-style networks in [nlp/modeling](../modeling).
+ALBERT V2 models compatible with TF 2.x checkpoints are:**
+
+*   **[`ALBERT V2 Base`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base.tar.gz)**:
+    12-layer, 768-hidden, 12-heads, 12M parameters
+*   **[`ALBERT V2 Large`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_large.tar.gz)**:
+    24-layer, 1024-hidden, 16-heads, 18M parameters
+*   **[`ALBERT V2 XLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xlarge.tar.gz)**:
+    24-layer, 2048-hidden, 32-heads, 60M parameters
+*   **[`ALBERT V2 XXLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xxlarge.tar.gz)**:
+    12-layer, 4096-hidden, 64-heads, 235M parameters
+
+We recommend hosting checkpoints on Google Cloud storage buckets when you use
+Cloud GPU/TPU.
+
+### Restoring from Checkpoints
+
+`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
+weights from provided pre-trained checkpoints, you can use the following code:
+
+```python
+init_checkpoint='the pretrained model checkpoint path.'
+model=tf.keras.Model() # Bert pre-trained model as feature extractor.
+checkpoint = tf.train.Checkpoint(model=model)
+checkpoint.restore(init_checkpoint)
+```
+
+Checkpoints featuring native serialized Keras models
+(i.e. model.load()/load_weights()) will be available soon.
+
+### Access to Pretrained hub modules.
+
+Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
+following links:
+
+*   **[`ALBERT V2 Base`](https://tfhub.dev/tensorflow/albert_en_base/1)**:
+    12-layer, 768-hidden, 12-heads, 12M parameters
+*   **[`ALBERT V2 Large`](https://tfhub.dev/tensorflow/albert_en_large/1)**:
+    24-layer, 1024-hidden, 16-heads, 18M parameters
+*   **[`ALBERT V2 XLarge`](https://tfhub.dev/tensorflow/albert_en_xlarge/1)**:
+    24-layer, 2048-hidden, 32-heads, 60M parameters
+*   **[`ALBERT V2 XXLarge`](https://tfhub.dev/tensorflow/albert_en_xxlarge/1)**:
+    12-layer, 4096-hidden, 64-heads, 235M parameters
+
+## Set Up
+
+```shell
+export PYTHONPATH="$PYTHONPATH:/path/to/models"
+```
+
+Install `tf-nightly` to get latest updates:
+
+```shell
+pip install tf-nightly-gpu
+```
+
+With TPU, GPU support is not necessary. First, you need to create a `tf-nightly`
+TPU with [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
+
+```shell
+ctpu up -name <instance name> --tf-version="nightly"
+```
+
+Second, you need to install TF 2 `tf-nightly` on your VM:
+
+```shell
+pip install tf-nightly
+```
+
+Warning: More detailed TPU-specific set-up instructions and tutorials should
+come along with the official TF 2.x release for TPU. Note that this repo is not
+officially supported by the Google Cloud TPU team until TF 2.1 is released.
+
+## Process Datasets
+
+### Pre-training
+
+Pre-training ALBERT with TF 2.x will be supported soon.
+For now, please use the [ALBERT research repo](https://github.com/google-research/ALBERT)
+to pre-train the model and convert the checkpoint to a TF 2.x compatible one using
+[tf2_albert_encoder_checkpoint_converter.py](tf2_albert_encoder_checkpoint_converter.py).
+
+
+
+### Fine-tuning
+
+To prepare the fine-tuning data for final model training, use the
+[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script.
+Note that, unlike BERT models, which use a word piece tokenizer,
+ALBERT models employ a sentence piece tokenizer, so the `--tokenization` flag
+has to be set to `SentencePiece` (as in the example commands below).
+The resulting datasets in `tf_record` format and the training meta data should
+later be passed to the training or evaluation scripts. The task-specific
+arguments are described in the following sections:
+
+* GLUE
+
+Users can download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```shell
+export GLUE_DIR=~/glue
+export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
+
+export TASK_NAME=MNLI
+export OUTPUT_DIR=gs://some_bucket/datasets
+python ../data/create_finetuning_data.py \
+ --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
+ --sp_model_file=${ALBERT_DIR}/30k-clean.model \
+ --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
+ --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
+ --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
+ --fine_tuning_task_type=classification --max_seq_length=128 \
+ --classification_task_name=${TASK_NAME} \
+ --tokenization=SentencePiece
+```
+
+* SQUAD
+
+The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
+detailed information about the SQuAD datasets and evaluation.
+
+The necessary files can be found here:
+
+*   [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+*   [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+*   [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+*   [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
+*   [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
+*   [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
+```shell
+export SQUAD_DIR=~/squad
+export SQUAD_VERSION=v1.1
+export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
+export OUTPUT_DIR=gs://some_bucket/datasets
+
+python ../data/create_finetuning_data.py \
+ --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
+ --sp_model_file=${ALBERT_DIR}/30k-clean.model \
+ --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
+ --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
+ --fine_tuning_task_type=squad --max_seq_length=384 \
+ --tokenization=SentencePiece
+```
+
+## Fine-tuning with ALBERT
+
+### Cloud GPUs and TPUs
+
+* Cloud Storage
+
+The unzipped pre-trained model files can also be found in the Google Cloud
+Storage folder `gs://cloud-tpu-checkpoints/albert/checkpoints`. For example:
+
+```shell
+export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
+export MODEL_DIR=gs://some_bucket/my_output_dir
+```
+
+Currently, users are able to access `tf-nightly` TPUs, and the following TPU
+script should run with `tf-nightly`.
+
+* GPU -> TPU
+
+Just add the following flags to `run_classifier.py` or `run_squad.py`:
+
+```shell
+  --distribution_strategy=tpu
+  --tpu=grpc://${TPU_IP_ADDRESS}:8470
+```
+
+### Sentence and Sentence-pair Classification Tasks
+
+This example code fine-tunes `albert_v2_base` on the Microsoft Research
+Paraphrase Corpus (MRPC), which only contains 3,600 examples and can be
+fine-tuned in a few minutes on most GPUs.
+
+We use the `albert_v2_base` as an example throughout the
+workflow.
+
+
+```shell
+export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export GLUE_DIR=gs://some_bucket/datasets
+export TASK=MRPC
+
+python run_classifier.py \
+  --mode='train_and_eval' \
+  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
+  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
+  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
+  --bert_config_file=${ALBERT_DIR}/albert_config.json \
+  --init_checkpoint=${ALBERT_DIR}/bert_model.ckpt \
+  --train_batch_size=4 \
+  --eval_batch_size=4 \
+  --steps_per_loop=1 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=mirrored
+```
+
+Alternatively, instead of specifying `init_checkpoint`, you can specify
+`hub_module_url` to employ a pretrained ALBERT hub module, e.g.,
+` --hub_module_url=https://tfhub.dev/tensorflow/albert_en_base/1`.
+
+To use TPU, you only need to switch distribution strategy type to `tpu` with TPU
+information and use remote storage for model checkpoints.
+
+```shell
+export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
+export TPU_IP_ADDRESS='???'
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export GLUE_DIR=gs://some_bucket/datasets
+
+python run_classifier.py \
+  --mode='train_and_eval' \
+  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
+  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
+  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
+  --bert_config_file=$ALBERT_DIR/albert_config.json \
+  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
+  --train_batch_size=32 \
+  --eval_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=tpu \
+  --tpu=grpc://${TPU_IP_ADDRESS}:8470
+```
+
+### SQuAD 1.1
+
+The Stanford Question Answering Dataset (SQuAD) is a popular question answering
+benchmark dataset. See more in [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/).
+
+We use the `albert_v2_base` as an example throughout the
+workflow.
+
+```shell
+export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
+export SQUAD_DIR=gs://some_bucket/datasets
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export SQUAD_VERSION=v1.1
+
+python run_squad.py \
+  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
+  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
+  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
+  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
+  --bert_config_file=$ALBERT_DIR/albert_config.json \
+  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
+  --train_batch_size=4 \
+  --predict_batch_size=4 \
+  --learning_rate=8e-5 \
+  --num_train_epochs=2 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=mirrored
+```
+
+Similarly, you can replace the `init_checkpoint` flag with `hub_module_url` to
+specify a hub module path.
+
+To use TPU, you need to switch the distribution strategy type to `tpu` and
+provide the TPU information.
+
+```shell
+export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
+export TPU_IP_ADDRESS='???'
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export SQUAD_DIR=gs://some_bucket/datasets
+export SQUAD_VERSION=v1.1
+
+python run_squad.py \
+  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
+  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
+  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
+  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
+  --bert_config_file=$ALBERT_DIR/albert_config.json \
+  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
+  --train_batch_size=32 \
+  --learning_rate=8e-5 \
+  --num_train_epochs=2 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=tpu \
+  --tpu=grpc://${TPU_IP_ADDRESS}:8470
+```
+
+The dev set predictions will be saved into a file called predictions.json in the
+model_dir:
+
+```shell
+python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
+```
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/__init__.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/configs.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/configs.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The ALBERT configurations."""
+
+import six
+
+from official.nlp.bert import configs
+
+
+class AlbertConfig(configs.BertConfig):
+  """Configuration for `ALBERT`."""
+
+  def __init__(self, num_hidden_groups=1, inner_group_num=1, **kwargs):
+    """Constructs AlbertConfig.
+
+    Args:
+      num_hidden_groups: Number of group for the hidden layers, parameters in
+        the same group are shared. Note that this value and also the following
+        'inner_group_num' has to be 1 for now, because all released ALBERT
+        models set them to 1. We may support arbitary valid values in future.
+      inner_group_num: Number of inner repetition of attention and ffn.
+      **kwargs: The remaining arguments are the same as above 'BertConfig'.
+    """
+    super(AlbertConfig, self).__init__(**kwargs)
+
+    # TODO(chendouble): 'inner_group_num' and 'num_hidden_groups' are always 1
+    # in the released ALBERT. Support other values in AlbertEncoder if needed.
+    if inner_group_num != 1 or num_hidden_groups != 1:
+      raise ValueError("We only support 'inner_group_num' and "
+                       "'num_hidden_groups' as 1.")
+
+  @classmethod
+  def from_dict(cls, json_object):
+    """Constructs a `AlbertConfig` from a Python dictionary of parameters."""
+    config = AlbertConfig(vocab_size=None)
+    for (key, value) in six.iteritems(json_object):
+      config.__dict__[key] = value
+    return config
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/run_classifier.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/run_classifier.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ALBERT classification finetuning runner in tf2.x."""
+
+import json
+import os
+# Import libraries
+from absl import app
+from absl import flags
+from absl import logging
+import tensorflow as tf
+from official.common import distribute_utils
+from official.nlp.albert import configs as albert_configs
+from official.nlp.bert import bert_models
+from official.nlp.bert import run_classifier as run_classifier_bert
+
+
+# Shared absl flag container. This module defines no flags of its own; the ones
+# read below are presumably registered by the imported BERT modules -- confirm.
+FLAGS = flags.FLAGS
+
+
+def predict(strategy, albert_config, input_meta_data, predict_input_fn):
+  """Function outputs both the ground truth predictions as .tsv files."""
+  with strategy.scope():
+    classifier_model = bert_models.classifier_model(
+        albert_config, input_meta_data['num_labels'])[0]
+    checkpoint = tf.train.Checkpoint(model=classifier_model)
+    latest_checkpoint_file = (
+        FLAGS.predict_checkpoint_path or
+        tf.train.latest_checkpoint(FLAGS.model_dir))
+    assert latest_checkpoint_file
+    logging.info('Checkpoint file %s found and restoring from '
+                 'checkpoint', latest_checkpoint_file)
+    checkpoint.restore(
+        latest_checkpoint_file).assert_existing_objects_matched()
+    preds, ground_truth = run_classifier_bert.get_predictions_and_labels(
+        strategy, classifier_model, predict_input_fn, return_probs=True)
+    output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv')
+    with tf.io.gfile.GFile(output_predict_file, 'w') as writer:
+      logging.info('***** Predict results *****')
+      for probabilities in preds:
+        output_line = '\t'.join(
+            str(class_probability)
+            for class_probability in probabilities) + '\n'
+        writer.write(output_line)
+    ground_truth_labels_file = os.path.join(FLAGS.model_dir,
+                                            'output_labels.tsv')
+    with tf.io.gfile.GFile(ground_truth_labels_file, 'w') as writer:
+      logging.info('***** Ground truth results *****')
+      for label in ground_truth:
+        output_line = '\t'.join(str(label)) + '\n'
+        writer.write(output_line)
+  return
+
+
+def main(_):
+  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
+    input_meta_data = json.loads(reader.read().decode('utf-8'))
+
+  if not FLAGS.model_dir:
+    FLAGS.model_dir = '/tmp/bert20/'
+
+  strategy = distribute_utils.get_distribution_strategy(
+      distribution_strategy=FLAGS.distribution_strategy,
+      num_gpus=FLAGS.num_gpus,
+      tpu_address=FLAGS.tpu)
+  max_seq_length = input_meta_data['max_seq_length']
+  train_input_fn = run_classifier_bert.get_dataset_fn(
+      FLAGS.train_data_path,
+      max_seq_length,
+      FLAGS.train_batch_size,
+      is_training=True)
+  eval_input_fn = run_classifier_bert.get_dataset_fn(
+      FLAGS.eval_data_path,
+      max_seq_length,
+      FLAGS.eval_batch_size,
+      is_training=False)
+
+  albert_config = albert_configs.AlbertConfig.from_json_file(
+      FLAGS.bert_config_file)
+  if FLAGS.mode == 'train_and_eval':
+    run_classifier_bert.run_bert(strategy, input_meta_data, albert_config,
+                                 train_input_fn, eval_input_fn)
+  elif FLAGS.mode == 'predict':
+    predict(strategy, albert_config, input_meta_data, eval_input_fn)
+  else:
+    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
+  return
+
+# Script entry point: declare the flags that must be supplied on the command
+# line, then let absl parse the flags and invoke `main`.
+# NOTE(review): with 'model_dir' marked required here, the '/tmp/bert20/'
+# fallback inside `main` looks unreachable when run as a script -- confirm.
+if __name__ == '__main__':
+  flags.mark_flag_as_required('bert_config_file')
+  flags.mark_flag_as_required('input_meta_data_path')
+  flags.mark_flag_as_required('model_dir')
+  app.run(main)
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/run_squad.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/albert/run_squad.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run ALBERT on SQuAD 1.1 and SQuAD 2.0 in TF 2.x."""
+
+import json
+import os
+import time
+
+# Import libraries
+from absl import app
+from absl import flags
+from absl import logging
+import tensorflow as tf
+from official.common import distribute_utils
+from official.nlp.albert import configs as albert_configs
+from official.nlp.bert import run_squad_helper
+from official.nlp.bert import tokenization
+from official.nlp.data import squad_lib_sp
+
+# ALBERT-specific flag: path of the sentence piece model, read below when the
+# tokenizer is built for prediction and evaluation.
+flags.DEFINE_string(
+    'sp_model_file', None,
+    'The path to the sentence piece model. Used by sentence piece tokenizer '
+    'employed by ALBERT.')
+
+# More flags can be found in run_squad_helper.
+run_squad_helper.define_common_squad_flags()
+
+# Shared absl flag container used by every function in this module.
+FLAGS = flags.FLAGS
+
+
+def train_squad(strategy,
+                input_meta_data,
+                custom_callbacks=None,
+                run_eagerly=False):
+  """Runs bert squad training."""
+  bert_config = albert_configs.AlbertConfig.from_json_file(
+      FLAGS.bert_config_file)
+  run_squad_helper.train_squad(strategy, input_meta_data, bert_config,
+                               custom_callbacks, run_eagerly)
+
+
+def predict_squad(strategy, input_meta_data):
+  """Makes predictions for the squad dataset."""
+  bert_config = albert_configs.AlbertConfig.from_json_file(
+      FLAGS.bert_config_file)
+  tokenizer = tokenization.FullSentencePieceTokenizer(
+      sp_model_file=FLAGS.sp_model_file)
+
+  run_squad_helper.predict_squad(strategy, input_meta_data, tokenizer,
+                                 bert_config, squad_lib_sp)
+
+
+def eval_squad(strategy, input_meta_data):
+  """Evaluate on the squad dataset."""
+  bert_config = albert_configs.AlbertConfig.from_json_file(
+      FLAGS.bert_config_file)
+  tokenizer = tokenization.FullSentencePieceTokenizer(
+      sp_model_file=FLAGS.sp_model_file)
+
+  eval_metrics = run_squad_helper.eval_squad(
+      strategy, input_meta_data, tokenizer, bert_config, squad_lib_sp)
+  return eval_metrics
+
+
+def export_squad(model_export_path, input_meta_data):
+  """Exports a trained model as a `SavedModel` for inference.
+
+  Args:
+    model_export_path: a string specifying the path to the SavedModel directory.
+    input_meta_data: dictionary containing meta data about input and model.
+
+  Raises:
+    Export path is not specified, got an empty string or None.
+  """
+  bert_config = albert_configs.AlbertConfig.from_json_file(
+      FLAGS.bert_config_file)
+  run_squad_helper.export_squad(model_export_path, input_meta_data, bert_config)
+
+
+def main(_):
+  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
+    input_meta_data = json.loads(reader.read().decode('utf-8'))
+
+  if FLAGS.mode == 'export_only':
+    export_squad(FLAGS.model_export_path, input_meta_data)
+    return
+
+  # Configures cluster spec for multi-worker distribution strategy.
+  if FLAGS.num_gpus > 0:
+    _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+  strategy = distribute_utils.get_distribution_strategy(
+      distribution_strategy=FLAGS.distribution_strategy,
+      num_gpus=FLAGS.num_gpus,
+      all_reduce_alg=FLAGS.all_reduce_alg,
+      tpu_address=FLAGS.tpu)
+
+  if 'train' in FLAGS.mode:
+    train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
+  if 'predict' in FLAGS.mode:
+    predict_squad(strategy, input_meta_data)
+  if 'eval' in FLAGS.mode:
+    eval_metrics = eval_squad(strategy, input_meta_data)
+    f1_score = eval_metrics['final_f1']
+    logging.info('SQuAD eval F1-score: %f', f1_score)
+    summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval')
+    summary_writer = tf.summary.create_file_writer(summary_dir)
+    with summary_writer.as_default():
+      # TODO(lehou): write to the correct step number.
+      tf.summary.scalar('F1-score', f1_score, step=0)
+      summary_writer.flush()
+    # Also write eval_metrics to json file.
+    squad_lib_sp.write_to_json_files(
+        eval_metrics, os.path.join(summary_dir, 'eval_metrics.json'))
+    time.sleep(60)
+
+
+# Script entry point: declare the flags that must be supplied on the command
+# line, then let absl parse the flags and invoke `main`.
+if __name__ == '__main__':
+  flags.mark_flag_as_required('bert_config_file')
+  flags.mark_flag_as_required('model_dir')
+  app.run(main)
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/README.md
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/README.md
+# BERT (Bidirectional Encoder Representations from Transformers)
+
+**WARNING**: We are on the way to deprecate most of the code in this directory.
+Please see
+[this link](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
+for the new tutorial and use the new code in `nlp/modeling`. This README is
+still correct for this legacy implementation.
+
+The academic paper which describes BERT in detail and provides full results on a
+number of tasks can be found here: https://arxiv.org/abs/1810.04805.
+
+This repository contains TensorFlow 2.x implementation for BERT.
+
+## Contents
+  * [Contents](#contents)
+  * [Pre-trained Models](#pre-trained-models)
+    * [Restoring from Checkpoints](#restoring-from-checkpoints)
+  * [Set Up](#set-up)
+  * [Process Datasets](#process-datasets)
+  * [Fine-tuning with BERT](#fine-tuning-with-bert)
+    * [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
+    * [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
+    * [SQuAD 1.1](#squad-11)
+
+
+## Pre-trained Models
+
+We released both checkpoints and tf.hub modules as the pretrained models for
+fine-tuning. They are TF 2.x compatible and are converted from the checkpoints
+released in the TF 1.x official BERT repository
+[google-research/bert](https://github.com/google-research/bert)
+in order to stay consistent with the BERT paper.
+
+
+### Access to Pretrained Checkpoints
+
+Pretrained checkpoints can be found in the following links:
+
+**Note: We have switched BERT implementation
+to use Keras functional-style networks in [nlp/modeling](../modeling).
+The new checkpoints are:**
+
+*   **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16.tar.gz)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_cased_L-24_H-1024_A-16.tar.gz)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz)**:
+    12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz)**:
+    12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Base, Multilingual Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz)**:
+    104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+
+We recommend hosting checkpoints on Google Cloud Storage buckets when you use
+Cloud GPU/TPU.
+
+### Restoring from Checkpoints
+
+`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
+weights from provided pre-trained checkpoints, you can use the following code:
+
+```python
+init_checkpoint='the pretrained model checkpoint path.'
+model=tf.keras.Model() # Bert pre-trained model as feature extractor.
+checkpoint = tf.train.Checkpoint(model=model)
+checkpoint.restore(init_checkpoint)
+```
+
+Checkpoints featuring native serialized Keras models
+(i.e. model.load()/load_weights()) will be available soon.
+
+### Access to Pretrained hub modules.
+
+Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
+following links:
+
+*   **[`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/)**:
+    12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/)**:
+    12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/)**:
+    24-layer, 1024-hidden, 16-heads, 340M parameters
+*   **[`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/)**:
+    104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/)**:
+    Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads,
+    110M parameters
+
+## Set Up
+
+```shell
+export PYTHONPATH="$PYTHONPATH:/path/to/models"
+```
+
+Install `tf-nightly` to get latest updates:
+
+```shell
+pip install tf-nightly-gpu
+```
+
+With TPU, GPU support is not necessary. First, you need to create a `tf-nightly`
+TPU with [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
+
+```shell
+ctpu up -name <instance name> --tf-version="nightly"
+```
+
+Second, you need to install TF 2 `tf-nightly` on your VM:
+
+```shell
+pip install tf-nightly
+```
+
+## Process Datasets
+
+### Pre-training
+
+There is no change to generate pre-training data. Please use the script
+[`../data/create_pretraining_data.py`](../data/create_pretraining_data.py)
+which is essentially branched from [BERT research repo](https://github.com/google-research/bert)
+to get processed pre-training data and it adapts to TF2 symbols and python3
+compatibility.
+
+Running the pre-training script requires an input and output directory, as well as a vocab file.  Note that max_seq_length will need to match the sequence length parameter you specify when you run pre-training.
+
+Example shell script to call `create_pretraining_data.py`:
+
+```shell
+export WORKING_DIR='local disk or cloud location'
+export BERT_DIR='local disk or cloud location'
+python models/official/nlp/data/create_pretraining_data.py \
+  --input_file=$WORKING_DIR/input/input.txt \
+  --output_file=$WORKING_DIR/output/tf_examples.tfrecord \
+  --vocab_file=$BERT_DIR/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
+  --do_lower_case=True \
+  --max_seq_length=512 \
+  --max_predictions_per_seq=76 \
+  --masked_lm_prob=0.15 \
+  --random_seed=12345 \
+  --dupe_factor=5
+```
+
+### Fine-tuning
+
+To prepare the fine-tuning data for final model training, use the
+[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script.
+Resulting datasets in `tf_record` format and training meta data should be later
+passed to training or evaluation scripts. The task-specific arguments are
+described in following sections:
+
+* GLUE
+
+Users can download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+Also, users can download a [Pretrained Checkpoint](#access-to-pretrained-checkpoints) and place it in some directory `$BERT_DIR` instead of using the checkpoints on Google Cloud Storage.
+
+```shell
+export GLUE_DIR=~/glue
+export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+
+export TASK_NAME=MNLI
+export OUTPUT_DIR=gs://some_bucket/datasets
+python ../data/create_finetuning_data.py \
+ --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
+ --vocab_file=${BERT_DIR}/vocab.txt \
+ --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
+ --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
+ --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
+ --fine_tuning_task_type=classification --max_seq_length=128 \
+ --classification_task_name=${TASK_NAME}
+```
+
+* SQUAD
+
+The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
+detailed information about the SQuAD datasets and evaluation.
+
+The necessary files can be found here:
+
+*   [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+*   [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+*   [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+*   [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
+*   [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
+*   [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
+```shell
+export SQUAD_DIR=~/squad
+export SQUAD_VERSION=v1.1
+export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+export OUTPUT_DIR=gs://some_bucket/datasets
+
+python ../data/create_finetuning_data.py \
+ --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
+ --vocab_file=${BERT_DIR}/vocab.txt \
+ --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
+ --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
+ --fine_tuning_task_type=squad --max_seq_length=384
+```
+
+Note: To create fine-tuning data with SQUAD 2.0, you need to add flag `--version_2_with_negative=True`.
+
+## Fine-tuning with BERT
+
+### Cloud GPUs and TPUs
+
+* Cloud Storage
+
+The unzipped pre-trained model files can also be found in the Google Cloud
+Storage folder `gs://cloud-tpu-checkpoints/bert/keras_bert`. For example:
+
+```shell
+export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+export MODEL_DIR=gs://some_bucket/my_output_dir
+```
+
+Currently, users are able to access `tf-nightly` TPUs, and the following TPU
+script should run with `tf-nightly`.
+
+* GPU -> TPU
+
+Just add the following flags to `run_classifier.py` or `run_squad.py`:
+
+```shell
+  --distribution_strategy=tpu
+  --tpu=grpc://${TPU_IP_ADDRESS}:8470
+```
+
+### Sentence and Sentence-pair Classification Tasks
+
+This example code fine-tunes `BERT-Large` on the Microsoft Research Paraphrase
+Corpus (MRPC), which only contains 3,600 examples and can fine-tune in a
+few minutes on most GPUs.
+
+We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
+workflow.
+For GPU memory of 16GB or smaller, you may try to use `BERT-Base`
+(uncased_L-12_H-768_A-12).
+
+```shell
+export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export GLUE_DIR=gs://some_bucket/datasets
+export TASK=MRPC
+
+python run_classifier.py \
+  --mode='train_and_eval' \
+  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
+  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
+  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
+  --bert_config_file=${BERT_DIR}/bert_config.json \
+  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
+  --train_batch_size=4 \
+  --eval_batch_size=4 \
+  --steps_per_loop=1 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=mirrored
+```
+
+Alternatively, instead of specifying `init_checkpoint`, you can specify
+`hub_module_url` to employ a pretrained BERT hub module, e.g.,
+` --hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1`.
+
+After training a model, to get predictions from the classifier, you can set
+`--mode=predict` and pass the test set tfrecords to `--eval_data_path`.
+The output will be written to a file called `test_results.tsv` in the output
+folder. Each line contains the output for one sample; the columns are the
+class probabilities.
+
+```shell
+python run_classifier.py \
+  --mode='predict' \
+  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
+  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
+  --bert_config_file=${BERT_DIR}/bert_config.json \
+  --eval_batch_size=4 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=mirrored
+```
+
+To use TPU, you only need to switch distribution strategy type to `tpu` with TPU
+information and use remote storage for model checkpoints.
+
+```shell
+export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+export TPU_IP_ADDRESS='???'
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export GLUE_DIR=gs://some_bucket/datasets
+export TASK=MRPC
+
+python run_classifier.py \
+  --mode='train_and_eval' \
+  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
+  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
+  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
+  --bert_config_file=${BERT_DIR}/bert_config.json \
+  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
+  --train_batch_size=32 \
+  --eval_batch_size=32 \
+  --steps_per_loop=1000 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=tpu \
+  --tpu=grpc://${TPU_IP_ADDRESS}:8470
+```
+
+Note that we specify `steps_per_loop=1000` for TPU, because running a loop of
+training steps inside a `tf.function` can significantly increase TPU utilization
+and callbacks will not be called inside the loop.
+
+### SQuAD 1.1
+
+The Stanford Question Answering Dataset (SQuAD) is a popular question answering
+benchmark dataset. See more in [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/).
+
+We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
+workflow.
+For GPU memory of 16GB or smaller, you may try to use `BERT-Base`
+(uncased_L-12_H-768_A-12).
+
+```shell
+export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+export SQUAD_DIR=gs://some_bucket/datasets
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export SQUAD_VERSION=v1.1
+
+python run_squad.py \
+  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
+  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
+  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
+  --vocab_file=${BERT_DIR}/vocab.txt \
+  --bert_config_file=${BERT_DIR}/bert_config.json \
+  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
+  --train_batch_size=4 \
+  --predict_batch_size=4 \
+  --learning_rate=8e-5 \
+  --num_train_epochs=2 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=mirrored
+```
+
+Similarly, you can replace the `init_checkpoint` flag with `hub_module_url` to
+specify a hub module path.
+
+`run_squad.py` writes the prediction for `--predict_file` by default. If you set
+`--mode=predict` and provide the SQuAD test data, the script will generate
+the prediction json file.
+
+To use TPU, you need to switch the distribution strategy type to `tpu` with TPU
+information.
+
+```shell
+export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+export TPU_IP_ADDRESS='???'
+export MODEL_DIR=gs://some_bucket/my_output_dir
+export SQUAD_DIR=gs://some_bucket/datasets
+export SQUAD_VERSION=v1.1
+
+python run_squad.py \
+  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
+  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
+  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
+  --vocab_file=${BERT_DIR}/vocab.txt \
+  --bert_config_file=${BERT_DIR}/bert_config.json \
+  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
+  --train_batch_size=32 \
+  --learning_rate=8e-5 \
+  --num_train_epochs=2 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=tpu \
+  --tpu=grpc://${TPU_IP_ADDRESS}:8470
+```
+
+The dev set predictions will be saved into a file called predictions.json in the
+model_dir:
+
+```shell
+python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
+```
+
+
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/__init__.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/bert_cloud_tpu.md
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/bert_cloud_tpu.md
+# BERT FineTuning with Cloud TPU: Sentence and Sentence-Pair Classification Tasks (TF 2.1)
+This tutorial shows you how to train the Bidirectional Encoder Representations from Transformers (BERT) model on Cloud TPU.
+
+
+## Set up Cloud Storage and Compute Engine VM
+1. [Open a cloud shell window](https://console.cloud.google.com/?cloudshell=true&_ga=2.11844148.-1612541229.1552429951)
+2. Create a variable for the project's id:
+```
+export PROJECT_ID=your-project_id
+```
+3. Configure `gcloud` command-line tool to use the project where you want to create Cloud TPU.
+```
+gcloud config set project ${PROJECT_ID}
+```
+4. Create a Cloud Storage bucket using the following command:
+```
+gsutil mb -p ${PROJECT_ID} -c standard -l europe-west4 -b on gs://your-bucket-name
+```
+This Cloud Storage bucket stores the data you use to train your model and the training results.
+5. Launch a Compute Engine VM and Cloud TPU using the ctpu up command.
+```
+ctpu up --tpu-size=v3-8 \
+ --machine-type=n1-standard-8 \
+ --zone=europe-west4-a \
+ --tf-version=2.1 [optional flags: --project, --name]
+```
+6. The configuration you specified appears. Enter y to approve or n to cancel.
+7. When the ctpu up command has finished executing, verify that your shell prompt has changed from username@project to username@tpuname. This change shows that you are now logged into your Compute Engine VM.
+```
+gcloud compute ssh vm-name --zone=europe-west4-a
+(vm)$ export TPU_NAME=vm-name
+```
+As you continue these instructions, run each command that begins with `(vm)$` in your VM session window.
+
+## Prepare the Dataset
+1. From your Compute Engine virtual machine (VM), install requirements.txt.
+```
+(vm)$ cd /usr/share/models
+(vm)$ sudo pip3 install -r official/requirements.txt
+```
+2. Optional: download download_glue_data.py
+
+This tutorial uses the General Language Understanding Evaluation (GLUE) benchmark to evaluate and analyze the performance of the model. The GLUE data is provided for this tutorial at gs://cloud-tpu-checkpoints/bert/classification.
+
+## Define parameter values
+Next, define several parameter values that are required when you train and evaluate your model:
+
+```
+(vm)$ export PYTHONPATH="$PYTHONPATH:/usr/share/tpu/models"
+(vm)$ export STORAGE_BUCKET=gs://your-bucket-name
+(vm)$ export BERT_BASE_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
+(vm)$ export MODEL_DIR=${STORAGE_BUCKET}/bert-output
+(vm)$ export GLUE_DIR=gs://cloud-tpu-checkpoints/bert/classification
+(vm)$ export TASK=mnli
+```
+
+## Train the model
+From your Compute Engine VM, run the following command.
+
+```
+(vm)$ python3 official/nlp/bert/run_classifier.py \
+  --mode='train_and_eval' \
+  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
+  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
+  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
+  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
+  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
+  --train_batch_size=32 \
+  --eval_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3 \
+  --model_dir=${MODEL_DIR} \
+  --distribution_strategy=tpu \
+  --tpu=${TPU_NAME}
+```
+
+## Verify your results
+The training takes approximately 1 hour on a v3-8 TPU. When the script completes, you should see results similar to the following:
+```
+Training Summary:
+{'train_loss': 0.28142181038856506,
+'last_train_metrics': 0.9467429518699646,
+'eval_metrics': 0.8599063158035278,
+'total_training_steps': 36813}
+```
+
+## Clean up
+To avoid incurring charges to your GCP account for the resources used in this topic:
+1. Disconnect from the Compute Engine VM:
+```
+(vm)$ exit
+```
+2. In your Cloud Shell, run ctpu delete with the --zone flag you used when you set up the Cloud TPU to delete your Compute Engine VM and your Cloud TPU:
+```
+$ ctpu delete --zone=your-zone
+```
+3. Run ctpu status specifying your zone to make sure you have no instances allocated to avoid unnecessary charges for TPU usage. The deletion might take several minutes. A response like the one below indicates there are no more allocated instances:
+```
+$ ctpu status --zone=your-zone
+```
+4. Run gsutil as shown, replacing your-bucket with the name of the Cloud Storage bucket you created for this tutorial:
+```
+$ gsutil rm -r gs://your-bucket
+```
+
+
+
+
+
+
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/bert_models.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/bert_models.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""BERT models that are compatible with TF 2.0."""
+
+import gin
+import tensorflow as tf
+import tensorflow_hub as hub
+
+from official.modeling import tf_utils
+from official.nlp.albert import configs as albert_configs
+from official.nlp.bert import configs
+from official.nlp.modeling import models
+from official.nlp.modeling import networks
+
+
+class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
+  """Returns layer that computes custom loss and metrics for pretraining."""
+
+  def __init__(self, vocab_size, **kwargs):
+    super(BertPretrainLossAndMetricLayer, self).__init__(**kwargs)
+    self._vocab_size = vocab_size
+    self.config = {
+        'vocab_size': vocab_size,
+    }
+
+  def _add_metrics(self, lm_output, lm_labels, lm_label_weights,
+                   lm_example_loss, sentence_output, sentence_labels,
+                   next_sentence_loss):
+    """Adds metrics."""
+    masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
+        lm_labels, lm_output)
+    numerator = tf.reduce_sum(masked_lm_accuracy * lm_label_weights)
+    denominator = tf.reduce_sum(lm_label_weights) + 1e-5
+    masked_lm_accuracy = numerator / denominator
+    self.add_metric(
+        masked_lm_accuracy, name='masked_lm_accuracy', aggregation='mean')
+
+    self.add_metric(lm_example_loss, name='lm_example_loss', aggregation='mean')
+
+    if sentence_labels is not None:
+      next_sentence_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
+          sentence_labels, sentence_output)
+      self.add_metric(
+          next_sentence_accuracy,
+          name='next_sentence_accuracy',
+          aggregation='mean')
+
+    if next_sentence_loss is not None:
+      self.add_metric(
+          next_sentence_loss, name='next_sentence_loss', aggregation='mean')
+
+  def call(self,
+           lm_output_logits,
+           sentence_output_logits,
+           lm_label_ids,
+           lm_label_weights,
+           sentence_labels=None):
+    """Implements call() for the layer."""
+    lm_label_weights = tf.cast(lm_label_weights, tf.float32)
+    lm_output_logits = tf.cast(lm_output_logits, tf.float32)
+
+    lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
+        lm_label_ids, lm_output_logits, from_logits=True)
+    lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
+    lm_denominator_loss = tf.reduce_sum(lm_label_weights)
+    mask_label_loss = tf.math.divide_no_nan(lm_numerator_loss,
+                                            lm_denominator_loss)
+
+    if sentence_labels is not None:
+      sentence_output_logits = tf.cast(sentence_output_logits, tf.float32)
+      sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
+          sentence_labels, sentence_output_logits, from_logits=True)
+      sentence_loss = tf.reduce_mean(sentence_loss)
+      loss = mask_label_loss + sentence_loss
+    else:
+      sentence_loss = None
+      loss = mask_label_loss
+
+    batch_shape = tf.slice(tf.shape(lm_label_ids), [0], [1])
+    # TODO(hongkuny): Avoids the hack and switches add_loss.
+    final_loss = tf.fill(batch_shape, loss)
+
+    self._add_metrics(lm_output_logits, lm_label_ids, lm_label_weights,
+                      mask_label_loss, sentence_output_logits, sentence_labels,
+                      sentence_loss)
+    return final_loss
+
+
+@gin.configurable
+def get_transformer_encoder(bert_config,
+                            sequence_length=None,
+                            transformer_encoder_cls=None,
+                            output_range=None):
+  """Gets a 'TransformerEncoder' object.
+
+  Args:
+    bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
+    sequence_length: [Deprecated].
+    transformer_encoder_cls: A EncoderScaffold class. If it is None, uses the
+      default BERT encoder implementation.
+    output_range: the sequence output range, [0, output_range). Default setting
+      is to return the entire sequence output.
+
+  Returns:
+    A encoder object.
+  """
+  del sequence_length
+  if transformer_encoder_cls is not None:
+    # TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
+    embedding_cfg = dict(
+        vocab_size=bert_config.vocab_size,
+        type_vocab_size=bert_config.type_vocab_size,
+        hidden_size=bert_config.hidden_size,
+        max_seq_length=bert_config.max_position_embeddings,
+        initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=bert_config.initializer_range),
+        dropout_rate=bert_config.hidden_dropout_prob,
+    )
+    hidden_cfg = dict(
+        num_attention_heads=bert_config.num_attention_heads,
+        intermediate_size=bert_config.intermediate_size,
+        intermediate_activation=tf_utils.get_activation(bert_config.hidden_act),
+        dropout_rate=bert_config.hidden_dropout_prob,
+        attention_dropout_rate=bert_config.attention_probs_dropout_prob,
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=bert_config.initializer_range),
+    )
+    kwargs = dict(
+        embedding_cfg=embedding_cfg,
+        hidden_cfg=hidden_cfg,
+        num_hidden_instances=bert_config.num_hidden_layers,
+        pooled_output_dim=bert_config.hidden_size,
+        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=bert_config.initializer_range))
+
+    # Relies on gin configuration to define the Transformer encoder arguments.
+    return transformer_encoder_cls(**kwargs)
+
+  kwargs = dict(
+      vocab_size=bert_config.vocab_size,
+      hidden_size=bert_config.hidden_size,
+      num_layers=bert_config.num_hidden_layers,
+      num_attention_heads=bert_config.num_attention_heads,
+      intermediate_size=bert_config.intermediate_size,
+      activation=tf_utils.get_activation(bert_config.hidden_act),
+      dropout_rate=bert_config.hidden_dropout_prob,
+      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
+      max_sequence_length=bert_config.max_position_embeddings,
+      type_vocab_size=bert_config.type_vocab_size,
+      embedding_width=bert_config.embedding_size,
+      initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=bert_config.initializer_range))
+  if isinstance(bert_config, albert_configs.AlbertConfig):
+    return networks.AlbertEncoder(**kwargs)
+  else:
+    assert isinstance(bert_config, configs.BertConfig)
+    kwargs['output_range'] = output_range
+    return networks.BertEncoder(**kwargs)
+
+
+def pretrain_model(bert_config,
+                   seq_length,
+                   max_predictions_per_seq,
+                   initializer=None,
+                   use_next_sentence_label=True,
+                   return_core_pretrainer_model=False):
+  """Returns model to be used for pre-training.
+
+  Args:
+      bert_config: Configuration that defines the core BERT model.
+      seq_length: Maximum sequence length of the training data.
+      max_predictions_per_seq: Maximum number of tokens in sequence to mask out
+        and use for pretraining.
+      initializer: Initializer for weights in BertPretrainer.
+      use_next_sentence_label: Whether to use the next sentence label.
+      return_core_pretrainer_model: Whether to also return the `BertPretrainer`
+        object.
+
+  Returns:
+      A Tuple of (1) Pretraining model, (2) core BERT submodel from which to
+      save weights after pretraining, and (3) optional core `BertPretrainer`
+      object if argument `return_core_pretrainer_model` is True.
+  """
+  input_word_ids = tf.keras.layers.Input(
+      shape=(seq_length,), name='input_word_ids', dtype=tf.int32)
+  input_mask = tf.keras.layers.Input(
+      shape=(seq_length,), name='input_mask', dtype=tf.int32)
+  input_type_ids = tf.keras.layers.Input(
+      shape=(seq_length,), name='input_type_ids', dtype=tf.int32)
+  masked_lm_positions = tf.keras.layers.Input(
+      shape=(max_predictions_per_seq,),
+      name='masked_lm_positions',
+      dtype=tf.int32)
+  masked_lm_ids = tf.keras.layers.Input(
+      shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
+  masked_lm_weights = tf.keras.layers.Input(
+      shape=(max_predictions_per_seq,),
+      name='masked_lm_weights',
+      dtype=tf.int32)
+
+  if use_next_sentence_label:
+    next_sentence_labels = tf.keras.layers.Input(
+        shape=(1,), name='next_sentence_labels', dtype=tf.int32)
+  else:
+    next_sentence_labels = None
+
+  transformer_encoder = get_transformer_encoder(bert_config, seq_length)
+  if initializer is None:
+    initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=bert_config.initializer_range)
+  pretrainer_model = models.BertPretrainer(
+      network=transformer_encoder,
+      embedding_table=transformer_encoder.get_embedding_table(),
+      num_classes=2,  # The next sentence prediction label has two classes.
+      activation=tf_utils.get_activation(bert_config.hidden_act),
+      num_token_predictions=max_predictions_per_seq,
+      initializer=initializer,
+      output='logits')
+
+  outputs = pretrainer_model(
+      [input_word_ids, input_mask, input_type_ids, masked_lm_positions])
+  lm_output = outputs['masked_lm']
+  sentence_output = outputs['classification']
+  pretrain_loss_layer = BertPretrainLossAndMetricLayer(
+      vocab_size=bert_config.vocab_size)
+  output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids,
+                                    masked_lm_weights, next_sentence_labels)
+  inputs = {
+      'input_word_ids': input_word_ids,
+      'input_mask': input_mask,
+      'input_type_ids': input_type_ids,
+      'masked_lm_positions': masked_lm_positions,
+      'masked_lm_ids': masked_lm_ids,
+      'masked_lm_weights': masked_lm_weights,
+  }
+  if use_next_sentence_label:
+    inputs['next_sentence_labels'] = next_sentence_labels
+
+  keras_model = tf.keras.Model(inputs=inputs, outputs=output_loss)
+  if return_core_pretrainer_model:
+    return keras_model, transformer_encoder, pretrainer_model
+  else:
+    return keras_model, transformer_encoder
+
+
+def squad_model(bert_config,
+                max_seq_length,
+                initializer=None,
+                hub_module_url=None,
+                hub_module_trainable=True):
+  """Returns BERT Squad model along with core BERT model to import weights.
+
+  Args:
+    bert_config: BertConfig, the config defines the core Bert model.
+    max_seq_length: integer, the maximum input sequence length.
+    initializer: Initializer for the final dense layer in the span labeler.
+      Defaulted to TruncatedNormal initializer.
+    hub_module_url: TF-Hub path/url to Bert module.
+    hub_module_trainable: True to finetune layers in the hub module.
+
+  Returns:
+    A tuple of (1) keras model that outputs start logits and end logits and
+    (2) the core BERT transformer encoder.
+  """
+  if initializer is None:
+    initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=bert_config.initializer_range)
+  if not hub_module_url:
+    bert_encoder = get_transformer_encoder(bert_config, max_seq_length)
+    return models.BertSpanLabeler(
+        network=bert_encoder, initializer=initializer), bert_encoder
+
+  input_word_ids = tf.keras.layers.Input(
+      shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
+  input_mask = tf.keras.layers.Input(
+      shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
+  input_type_ids = tf.keras.layers.Input(
+      shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
+  core_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable)
+  pooled_output, sequence_output = core_model(
+      [input_word_ids, input_mask, input_type_ids])
+  bert_encoder = tf.keras.Model(
+      inputs={
+          'input_word_ids': input_word_ids,
+          'input_mask': input_mask,
+          'input_type_ids': input_type_ids,
+      },
+      outputs=[sequence_output, pooled_output],
+      name='core_model')
+  return models.BertSpanLabeler(
+      network=bert_encoder, initializer=initializer), bert_encoder
+
+
+def classifier_model(bert_config,
+                     num_labels,
+                     max_seq_length=None,
+                     final_layer_initializer=None,
+                     hub_module_url=None,
+                     hub_module_trainable=True):
+  """BERT classifier model in functional API style.
+
+  Construct a Keras model for predicting `num_labels` outputs from an input with
+  maximum sequence length `max_seq_length`.
+
+  Args:
+    bert_config: BertConfig or AlbertConfig, the config defines the core BERT or
+      ALBERT model.
+    num_labels: integer, the number of classes.
+    max_seq_length: integer, the maximum input sequence length.
+    final_layer_initializer: Initializer for final dense layer. Defaulted
+      TruncatedNormal initializer.
+    hub_module_url: TF-Hub path/url to Bert module.
+    hub_module_trainable: True to finetune layers in the hub module.
+
+  Returns:
+    Combined prediction model (words, mask, type) -> (one-hot labels)
+    BERT sub-model (words, mask, type) -> (bert_outputs)
+  """
+  if final_layer_initializer is not None:
+    initializer = final_layer_initializer
+  else:
+    initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=bert_config.initializer_range)
+
+  if not hub_module_url:
+    bert_encoder = get_transformer_encoder(
+        bert_config, max_seq_length, output_range=1)
+    return models.BertClassifier(
+        bert_encoder,
+        num_classes=num_labels,
+        dropout_rate=bert_config.hidden_dropout_prob,
+        initializer=initializer), bert_encoder
+
+  input_word_ids = tf.keras.layers.Input(
+      shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
+  input_mask = tf.keras.layers.Input(
+      shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
+  input_type_ids = tf.keras.layers.Input(
+      shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
+  bert_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable)
+  pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids])
+  output = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)(
+      pooled_output)
+
+  output = tf.keras.layers.Dense(
+      num_labels, kernel_initializer=initializer, name='output')(
+          output)
+  return tf.keras.Model(
+      inputs={
+          'input_word_ids': input_word_ids,
+          'input_mask': input_mask,
+          'input_type_ids': input_type_ids
+      },
+      outputs=output), bert_model
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/bert_models_test.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/bert_models_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+from official.nlp.bert import bert_models
+from official.nlp.bert import configs as bert_configs
+from official.nlp.modeling import networks
+
+
+class BertModelsTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(BertModelsTest, self).setUp()
+    self._bert_test_config = bert_configs.BertConfig(
+        attention_probs_dropout_prob=0.0,
+        hidden_act='gelu',
+        hidden_dropout_prob=0.0,
+        hidden_size=16,
+        initializer_range=0.02,
+        intermediate_size=32,
+        max_position_embeddings=128,
+        num_attention_heads=2,
+        num_hidden_layers=2,
+        type_vocab_size=2,
+        vocab_size=30522)
+
+  def test_pretrain_model(self):
+    model, encoder = bert_models.pretrain_model(
+        self._bert_test_config,
+        seq_length=5,
+        max_predictions_per_seq=2,
+        initializer=None,
+        use_next_sentence_label=True)
+    self.assertIsInstance(model, tf.keras.Model)
+    self.assertIsInstance(encoder, networks.BertEncoder)
+
+    # model has one scalar output: loss value.
+    self.assertEqual(model.output.shape.as_list(), [
+        None,
+    ])
+
+    # Expect two output from encoder: sequence and classification output.
+    self.assertIsInstance(encoder.output, list)
+    self.assertLen(encoder.output, 2)
+    # shape should be [batch size, hidden_size]
+    self.assertEqual(encoder.output[1].shape.as_list(), [None, 16])
+
+  def test_squad_model(self):
+    model, core_model = bert_models.squad_model(
+        self._bert_test_config,
+        max_seq_length=5,
+        initializer=None,
+        hub_module_url=None,
+        hub_module_trainable=None)
+    self.assertIsInstance(model, tf.keras.Model)
+    self.assertIsInstance(core_model, tf.keras.Model)
+
+    # Expect two output from model: start positions and end positions
+    self.assertIsInstance(model.output, list)
+    self.assertLen(model.output, 2)
+
+    # Expect two output from core_model: sequence and classification output.
+    self.assertIsInstance(core_model.output, list)
+    self.assertLen(core_model.output, 2)
+    # shape should be [batch size, None, hidden_size]
+    self.assertEqual(core_model.output[0].shape.as_list(), [None, None, 16])
+    # shape should be [batch size, hidden_size]
+    self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
+
+  def test_classifier_model(self):
+    model, core_model = bert_models.classifier_model(
+        self._bert_test_config,
+        num_labels=3,
+        max_seq_length=5,
+        final_layer_initializer=None,
+        hub_module_url=None,
+        hub_module_trainable=None)
+    self.assertIsInstance(model, tf.keras.Model)
+    self.assertIsInstance(core_model, tf.keras.Model)
+
+    # model has one classification output with num_labels=3.
+    self.assertEqual(model.output.shape.as_list(), [None, 3])
+
+    # Expect two output from core_model: sequence and classification output.
+    self.assertIsInstance(core_model.output, list)
+    self.assertLen(core_model.output, 2)
+    # shape should be [batch size, None, hidden_size]
+    self.assertEqual(core_model.output[0].shape.as_list(), [None, None, 16])
+    # shape should be [batch size, hidden_size]
+    self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/common_flags.py
+++ b/TensorFlow2x/Accuracy_Validation/ResNet50_Official/official/nlp/bert/common_flags.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Defining common flags used across all BERT models/applications."""
+
+from absl import flags
+import tensorflow as tf
+
+from official.utils import hyperparams_flags
+from official.utils.flags import core as flags_core
+
+
+def define_common_bert_flags():
+  """Define common flags for BERT tasks."""
+  flags_core.define_base(
+      data_dir=False,
+      model_dir=True,
+      clean=False,
+      train_epochs=False,
+      epochs_between_evals=False,
+      stop_threshold=False,
+      batch_size=False,
+      num_gpu=True,
+      export_dir=False,
+      distribution_strategy=True,
+      run_eagerly=True)
+  flags_core.define_distribution()
+  flags.DEFINE_string('bert_config_file', None,
+                      'Bert configuration file to define core bert layers.')
+  flags.DEFINE_string(
+      'model_export_path', None,
+      'Path to the directory, where trainined model will be '
+      'exported.')
+  flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
+  flags.DEFINE_string(
+      'init_checkpoint', None,
+      'Initial checkpoint (usually from a pre-trained BERT model).')
+  flags.DEFINE_integer('num_train_epochs', 3,
+                       'Total number of training epochs to perform.')
+  flags.DEFINE_integer(
+      'steps_per_loop', None,
+      'Number of steps per graph-mode loop. Only training step '
+      'happens inside the loop. Callbacks will not be called '
+      'inside. If not set the value will be configured depending on the '
+      'devices available.')
+  flags.DEFINE_float('learning_rate', 5e-5,
+                     'The initial learning rate for Adam.')
+  flags.DEFINE_float('end_lr', 0.0,
+                     'The end learning rate for learning rate decay.')
+  flags.DEFINE_string('optimizer_type', 'adamw',
+                      'The type of optimizer to use for training (adamw|lamb)')
+  flags.DEFINE_boolean(
+      'scale_loss', False,
+      'Whether to divide the loss by number of replica inside the per-replica '
+      'loss function.')
+  flags.DEFINE_boolean(
+      'use_keras_compile_fit', False,
+      'If True, uses Keras compile/fit() API for training logic. Otherwise '
+      'use custom training loop.')
+  flags.DEFINE_string(
+      'hub_module_url', None, 'TF-Hub path/url to Bert module. '
+      'If specified, init_checkpoint flag should not be used.')
+  flags.DEFINE_bool('hub_module_trainable', True,
+                    'True to make keras layers in the hub module trainable.')
+  flags.DEFINE_string(
+      'sub_model_export_name', None,
+      'If set, `sub_model` checkpoints are exported into '
+      'FLAGS.model_dir/FLAGS.sub_model_export_name.')
+  flags.DEFINE_bool('explicit_allreduce', False,
+                    'True to use explicit allreduce instead of the implicit '
+                    'allreduce in optimizer.apply_gradients(). If fp16 mixed '
+                    'precision training is used, this also enables allreduce '
+                    'gradients in fp16.')
+  flags.DEFINE_integer('allreduce_bytes_per_pack', 0,
+                       'Number of bytes of a gradient pack for allreduce. '
+                       'Should be positive integer, if set to 0, all '
+                       'gradients are in one pack. Breaking gradient into '
+                       'packs could enable overlap between allreduce and '
+                       'backprop computation. This flag only takes effect '
+                       'when explicit_allreduce is set to True.')
+
+  flags_core.define_log_steps()
+
+  # Adds flags for mixed precision and multi-worker training.
+  flags_core.define_performance(
+      num_parallel_calls=False,
+      inter_op=False,
+      intra_op=False,
+      synthetic_data=False,
+      max_train_steps=False,
+      dtype=True,
+      loss_scale=True,
+      all_reduce_alg=True,
+      num_packs=False,
+      tf_gpu_thread_mode=True,
+      datasets_num_private_threads=True,
+      enable_xla=True,
+      fp16_implementation=True,
+  )
+
+  # Adds gin configuration flags.
+  hyperparams_flags.define_gin_flags()
+
+
+def dtype():
+  return flags_core.get_tf_dtype(flags.FLAGS)
+
+
+def use_float16():
+  return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16
+
+
+def use_graph_rewrite():
+  return flags.FLAGS.fp16_implementation == 'graph_rewrite'
+
+
+def get_loss_scale():
+  return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic')