Commit a32ffa95, authored Feb 03, 2023 by qianyj
update TensorFlow2x test method
parent e286da17
Changes: 268 files in this commit; the 20 files listed below account for 3,297 additions and 0 deletions (+3297 −0).
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/__init__.py (+14, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/learning_rate_config.py (+288, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config.py (+118, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config_test.py (+59, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimizer_config.py (+268, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/ema_optimizer.py (+255, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lars_optimizer.py (+186, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule.py (+496, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule_test.py (+109, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory.py (+208, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory_test.py (+443, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/slide_optimizer.py (+20, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/performance.py (+55, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/tf_utils.py (+201, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/README.md (+59, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/__init__.py (+14, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/README.md (+335, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/__init__.py (+14, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/configs.py (+50, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/run_classifier.py (+105, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/__init__.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/learning_rate_config.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for learning rate schedule config."""
from
typing
import
List
,
Optional
import
dataclasses
from
official.modeling.hyperparams
import
base_config
@
dataclasses
.
dataclass
class
ConstantLrConfig
(
base_config
.
Config
):
"""Configuration for constant learning rate.
This class is a containers for the constant learning rate decay configs.
Attributes:
name: The name of the learning rate schedule. Defaults to Constant.
learning_rate: A float. The learning rate. Defaults to 0.1.
"""
name
:
str
=
'Constant'
learning_rate
:
float
=
0.1
@
dataclasses
.
dataclass
class
StepwiseLrConfig
(
base_config
.
Config
):
"""Configuration for stepwise learning rate decay.
This class is a container for the piecewise constant learning rate scheduling
configs. It will configure an instance of PiecewiseConstantDecay keras
learning rate schedule.
An example (from keras docs): use a learning rate that's 1.0 for the first
100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps.
```python
boundaries: [100000, 110000]
values: [1.0, 0.5, 0.1]
Attributes:
name: The name of the learning rate schedule. Defaults to PiecewiseConstant.
boundaries: A list of ints of strictly increasing entries. Defaults to None.
values: A list of floats that specifies the values for the intervals defined
by `boundaries`. It should have one more element than `boundaries`.
The learning rate is computed as follows: [0, boundaries[0]] ->
values[0] [boundaries[0], boundaries[1]] -> values[1]
[boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n],
end] -> values[n+1] Defaults to None.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'PiecewiseConstantDecay'
boundaries
:
Optional
[
List
[
int
]]
=
None
values
:
Optional
[
List
[
float
]]
=
None
offset
:
int
=
0
@
dataclasses
.
dataclass
class
ExponentialLrConfig
(
base_config
.
Config
):
"""Configuration for exponential learning rate decay.
This class is a containers for the exponential learning rate decay configs.
Attributes:
name: The name of the learning rate schedule. Defaults to ExponentialDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
decay_steps: A positive integer that is used for decay computation. Defaults
to None.
decay_rate: A float. Defaults to None.
staircase: A boolean, if true, learning rate is decreased at discreate
intervals. Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'ExponentialDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
decay_steps
:
Optional
[
int
]
=
None
decay_rate
:
Optional
[
float
]
=
None
staircase
:
Optional
[
bool
]
=
None
offset
:
int
=
0
@
dataclasses
.
dataclass
class
PolynomialLrConfig
(
base_config
.
Config
):
"""Configuration for polynomial learning rate decay.
This class is a containers for the polynomial learning rate decay configs.
Attributes:
name: The name of the learning rate schedule. Defaults to PolynomialDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
decay_steps: A positive integer that is used for decay computation. Defaults
to None.
end_learning_rate: A float. The minimal end learning rate.
power: A float. The power of the polynomial. Defaults to linear, 1.0.
cycle: A boolean, whether or not it should cycle beyond decay_steps.
Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'PolynomialDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
decay_steps
:
Optional
[
int
]
=
None
end_learning_rate
:
float
=
0.0001
power
:
float
=
1.0
cycle
:
bool
=
False
offset
:
int
=
0
@
dataclasses
.
dataclass
class
CosineLrConfig
(
base_config
.
Config
):
"""Configuration for Cosine learning rate decay.
This class is a containers for the cosine learning rate decay configs,
tf.keras.experimental.CosineDecay.
Attributes:
name: The name of the learning rate schedule. Defaults to CosineDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
decay_steps: A positive integer that is used for decay computation. Defaults
to None.
alpha: A float. Minimum learning rate value as a fraction of
initial_learning_rate.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'CosineDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
decay_steps
:
Optional
[
int
]
=
None
alpha
:
float
=
0.0
offset
:
int
=
0
@
dataclasses
.
dataclass
class
DirectPowerLrConfig
(
base_config
.
Config
):
"""Configuration for DirectPower learning rate decay.
This class configures a schedule following follows lr * (step)^power.
Attributes:
name: The name of the learning rate schedule. Defaults to DirectPowerDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
power: A float. Defaults to -0.5, for sqrt decay.
"""
name
:
str
=
'DirectPowerDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
power
:
float
=
-
0.5
@
dataclasses
.
dataclass
class
PowerAndLinearDecayLrConfig
(
base_config
.
Config
):
"""Configuration for DirectPower learning rate decay.
The schedule has the following behavoir.
Let offset_step = step - offset.
1) offset_step < 0, the actual learning rate equals initial_learning_rate.
2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
actual learning rate equals lr * offset_step^power.
3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
total_decay_steps, the actual learning rate equals lr * offset_step^power *
(total_decay_steps - offset_step) / (total_decay_steps *
linear_decay_fraction).
4) offset_step >= total_decay_steps, the actual learning rate equals zero.
Attributes:
name: The name of the learning rate schedule. Defaults to
PowerAndLinearDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
total_decay_steps: An int. The total number of steps for power + linear
decay. Defaults to None.
power: A float. The order of the polynomial. Defaults to -0.5, for sqrt
decay.
linear_decay_fraction: A float. In the last `linear_decay_fraction` steps,
the learning rate will be multiplied by a linear decay. Defaults to 0.1.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'PowerAndLinearDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
total_decay_steps
:
Optional
[
int
]
=
None
power
:
float
=
-
0.5
linear_decay_fraction
:
float
=
0.1
offset
:
int
=
0
@
dataclasses
.
dataclass
class
PowerDecayWithOffsetLrConfig
(
base_config
.
Config
):
"""Configuration for power learning rate decay with step offset.
Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`.
Otherwise, learning rate equals to lr * (step - offset)^power.
Attributes:
name: The name of the learning rate schedule. Defaults to
PowerDecayWithOffset.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
power: A float. Defaults to -0.5, for sqrt decay.
offset: An integer. Power decay happens after `offset` steps.
pre_offset_learning_rate: A float. The constant learning rate before
`offset` steps.
"""
name
:
str
=
'PowerDecayWithOffset'
initial_learning_rate
:
Optional
[
float
]
=
None
power
:
float
=
-
0.5
offset
:
int
=
0
pre_offset_learning_rate
:
float
=
1.0e6
@
dataclasses
.
dataclass
class
StepCosineLrConfig
(
base_config
.
Config
):
"""Configuration for stepwise learning rate decay.
This class is a container for the piecewise cosine learning rate scheduling
configs. It will configure an instance of StepConsineDecayWithOffset keras
learning rate schedule.
```python
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepConsineDecayWithOffset(
boundaries,
values))
```
from 0 to 100000 step, it will cosine decay from 1.0 to 0.5
from 100000 to 110000 step, it cosine decay from 0.5 to 0.0
Attributes:
name: The name of the learning rate schedule. Defaults to PiecewiseConstant.
boundaries: A list of ints of strictly increasing entries. Defaults to None.
values: A list of floats that specifies the values for the intervals defined
by `boundaries`. It should have one more element than `boundaries`.
The learning rate is computed as follows:
[0, boundaries[0]] -> cosine from values[0] to values[1]
[boundaries[0], boundaries[1]] -> values[1] to values[2]
...
[boundaries[n-1], boundaries[n]] -> values[n] to values[n+1]
[boundaries[n], end] -> values[n+1] to 0.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'StepConsineDecayWithOffset'
boundaries
:
Optional
[
List
[
int
]]
=
None
values
:
Optional
[
List
[
float
]]
=
None
offset
:
int
=
0
@
dataclasses
.
dataclass
class
LinearWarmupConfig
(
base_config
.
Config
):
"""Configuration for linear warmup schedule config.
This class is a container for the linear warmup schedule configs.
Warmup_learning_rate is the initial learning rate, the final learning rate of
the warmup period is the learning_rate of the optimizer in use. The learning
rate at each step linearly increased according to the following formula:
warmup_learning_rate = warmup_learning_rate +
step / warmup_steps * (final_learning_rate - warmup_learning_rate).
Using warmup overrides the learning rate schedule by the number of warmup
steps.
Attributes:
name: The name of warmup schedule. Defaults to linear.
warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0.
warmup_steps: Warmup steps. Defaults to None.
"""
name
:
str
=
'linear'
warmup_learning_rate
:
float
=
0
warmup_steps
:
Optional
[
int
]
=
None
@
dataclasses
.
dataclass
class
PolynomialWarmupConfig
(
base_config
.
Config
):
"""Configuration for linear warmup schedule config.
This class is a container for the polynomial warmup schedule configs.
Attributes:
name: The name of warmup schedule. Defaults to Polynomial.
power: Polynomial power. Defaults to 1.
warmup_steps: Warmup steps. Defaults to None.
"""
name
:
str
=
'polynomial'
power
:
float
=
1
warmup_steps
:
Optional
[
int
]
=
None
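
StepwiseLrConfig above only stores the boundaries/values pair; per its docstring it is ultimately realized as Keras's PiecewiseConstantDecay (wrapped with the step offset in lr_schedule.py further down). A minimal sketch of the equivalent raw Keras schedule, with illustrative numbers only:

```python
import tensorflow as tf

# 1.0 for steps [0, 100000], 0.5 for (100000, 110000], 0.1 afterwards --
# the interval-to-value mapping described in the StepwiseLrConfig docstring.
schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[100000, 110000], values=[1.0, 0.5, 0.1])

for step in (0, 100000, 105000, 200000):
  print(step, float(schedule(step)))  # -> 1.0, 1.0, 0.5, 0.1
```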
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimization configs.
This file define the dataclass for optimization configs (OptimizationConfig).
It also has two helper functions get_optimizer_config, and get_lr_config from
an OptimizationConfig class.
"""
from
typing
import
Optional
import
dataclasses
from
official.modeling.hyperparams
import
base_config
from
official.modeling.hyperparams
import
oneof
from
official.modeling.optimization.configs
import
learning_rate_config
as
lr_cfg
from
official.modeling.optimization.configs
import
optimizer_config
as
opt_cfg
@
dataclasses
.
dataclass
class
OptimizerConfig
(
oneof
.
OneOfConfig
):
"""Configuration for optimizer.
Attributes:
type: 'str', type of optimizer to be used, on the of fields below.
sgd: sgd optimizer config.
adam: adam optimizer config.
adamw: adam with weight decay.
lamb: lamb optimizer.
rmsprop: rmsprop optimizer.
lars: lars optimizer.
adagrad: adagrad optimizer.
slide: slide optimizer.
"""
type
:
Optional
[
str
]
=
None
sgd
:
opt_cfg
.
SGDConfig
=
opt_cfg
.
SGDConfig
()
adam
:
opt_cfg
.
AdamConfig
=
opt_cfg
.
AdamConfig
()
adamw
:
opt_cfg
.
AdamWeightDecayConfig
=
opt_cfg
.
AdamWeightDecayConfig
()
lamb
:
opt_cfg
.
LAMBConfig
=
opt_cfg
.
LAMBConfig
()
rmsprop
:
opt_cfg
.
RMSPropConfig
=
opt_cfg
.
RMSPropConfig
()
lars
:
opt_cfg
.
LARSConfig
=
opt_cfg
.
LARSConfig
()
adagrad
:
opt_cfg
.
AdagradConfig
=
opt_cfg
.
AdagradConfig
()
slide
:
opt_cfg
.
SLIDEConfig
=
opt_cfg
.
SLIDEConfig
()
adafactor
:
opt_cfg
.
AdafactorConfig
=
opt_cfg
.
AdafactorConfig
()
@
dataclasses
.
dataclass
class
LrConfig
(
oneof
.
OneOfConfig
):
"""Configuration for lr schedule.
Attributes:
type: 'str', type of lr schedule to be used, one of the fields below.
constant: constant learning rate config.
stepwise: stepwise learning rate config.
exponential: exponential learning rate config.
polynomial: polynomial learning rate config.
cosine: cosine learning rate config.
power: step^power learning rate config.
power_linear: learning rate config of step^power followed by
step^power*linear.
power_with_offset: power decay with a step offset.
step_cosine_with_offset: Step cosine with a step offset.
"""
type
:
Optional
[
str
]
=
None
constant
:
lr_cfg
.
ConstantLrConfig
=
lr_cfg
.
ConstantLrConfig
()
stepwise
:
lr_cfg
.
StepwiseLrConfig
=
lr_cfg
.
StepwiseLrConfig
()
exponential
:
lr_cfg
.
ExponentialLrConfig
=
lr_cfg
.
ExponentialLrConfig
()
polynomial
:
lr_cfg
.
PolynomialLrConfig
=
lr_cfg
.
PolynomialLrConfig
()
cosine
:
lr_cfg
.
CosineLrConfig
=
lr_cfg
.
CosineLrConfig
()
power
:
lr_cfg
.
DirectPowerLrConfig
=
lr_cfg
.
DirectPowerLrConfig
()
power_linear
:
lr_cfg
.
PowerAndLinearDecayLrConfig
=
(
lr_cfg
.
PowerAndLinearDecayLrConfig
())
power_with_offset
:
lr_cfg
.
PowerDecayWithOffsetLrConfig
=
(
lr_cfg
.
PowerDecayWithOffsetLrConfig
())
step_cosine_with_offset
:
lr_cfg
.
StepCosineLrConfig
=
(
lr_cfg
.
StepCosineLrConfig
())
@
dataclasses
.
dataclass
class
WarmupConfig
(
oneof
.
OneOfConfig
):
"""Configuration for lr schedule.
Attributes:
type: 'str', type of warmup schedule to be used, one of the fields below.
linear: linear warmup config.
polynomial: polynomial warmup config.
"""
type
:
Optional
[
str
]
=
None
linear
:
lr_cfg
.
LinearWarmupConfig
=
lr_cfg
.
LinearWarmupConfig
()
polynomial
:
lr_cfg
.
PolynomialWarmupConfig
=
lr_cfg
.
PolynomialWarmupConfig
()
@
dataclasses
.
dataclass
class
OptimizationConfig
(
base_config
.
Config
):
"""Configuration for optimizer and learning rate schedule.
Attributes:
optimizer: optimizer oneof config.
ema: optional exponential moving average optimizer config, if specified, ema
optimizer will be used.
learning_rate: learning rate oneof config.
warmup: warmup oneof config.
"""
optimizer
:
OptimizerConfig
=
OptimizerConfig
()
ema
:
Optional
[
opt_cfg
.
EMAConfig
]
=
None
learning_rate
:
LrConfig
=
LrConfig
()
warmup
:
WarmupConfig
=
WarmupConfig
()
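
The oneof configs above are exercised in the test file that follows; for orientation, the same nested-dict construction can also carry concrete hyperparameters. A hedged sketch (the field names come from the dataclasses above; the numeric values are arbitrary examples, not recommendations):

```python
from official.modeling.optimization.configs import optimization_config

# Select each branch of the oneof via its `type` field, then override fields
# of the selected sub-config.
config = optimization_config.OptimizationConfig({
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'learning_rate': {'type': 'stepwise',
                      'stepwise': {'boundaries': [30000, 60000],
                                   'values': [0.1, 0.01, 0.001]}},
    'warmup': {'type': 'linear', 'linear': {'warmup_steps': 500}},
})
print(config.optimizer.get())      # SGDConfig with momentum=0.9
print(config.learning_rate.get())  # StepwiseLrConfig with the given boundaries
```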
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config_test.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for optimization_config.py."""
import
tensorflow
as
tf
from
official.modeling.optimization.configs
import
learning_rate_config
as
lr_cfg
from
official.modeling.optimization.configs
import
optimization_config
from
official.modeling.optimization.configs
import
optimizer_config
as
opt_cfg
class
OptimizerConfigTest
(
tf
.
test
.
TestCase
):
def
test_no_optimizer
(
self
):
optimizer
=
optimization_config
.
OptimizationConfig
({}).
optimizer
.
get
()
self
.
assertIsNone
(
optimizer
)
def
test_no_lr_schedule
(
self
):
lr
=
optimization_config
.
OptimizationConfig
({}).
learning_rate
.
get
()
self
.
assertIsNone
(
lr
)
def
test_no_warmup_schedule
(
self
):
warmup
=
optimization_config
.
OptimizationConfig
({}).
warmup
.
get
()
self
.
assertIsNone
(
warmup
)
def
test_config
(
self
):
opt_config
=
optimization_config
.
OptimizationConfig
({
'optimizer'
:
{
'type'
:
'sgd'
,
'sgd'
:
{}
# default config
},
'learning_rate'
:
{
'type'
:
'polynomial'
,
'polynomial'
:
{}
},
'warmup'
:
{
'type'
:
'linear'
}
})
self
.
assertEqual
(
opt_config
.
optimizer
.
get
(),
opt_cfg
.
SGDConfig
())
self
.
assertEqual
(
opt_config
.
learning_rate
.
get
(),
lr_cfg
.
PolynomialLrConfig
())
self
.
assertEqual
(
opt_config
.
warmup
.
get
(),
lr_cfg
.
LinearWarmupConfig
())
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimizer_config.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimizer configs."""
from
typing
import
List
,
Optional
import
dataclasses
from
official.modeling.hyperparams
import
base_config
@
dataclasses
.
dataclass
class
BaseOptimizerConfig
(
base_config
.
Config
):
"""Base optimizer config.
Attributes:
clipnorm: float >= 0 or None. If not None, Gradients will be clipped when
their L2 norm exceeds this value.
clipvalue: float >= 0 or None. If not None, Gradients will be clipped when
their absolute value exceeds this value.
global_clipnorm: float >= 0 or None. If not None, gradient of all weights is
clipped so that their global norm is no higher than this value
"""
clipnorm
:
Optional
[
float
]
=
None
clipvalue
:
Optional
[
float
]
=
None
global_clipnorm
:
Optional
[
float
]
=
None
@
dataclasses
.
dataclass
class
SGDConfig
(
BaseOptimizerConfig
):
"""Configuration for SGD optimizer.
The attributes for this class matches the arguments of tf.keras.optimizer.SGD.
Attributes:
name: name of the optimizer.
decay: decay rate for SGD optimizer.
nesterov: nesterov for SGD optimizer.
momentum: momentum for SGD optimizer.
"""
name
:
str
=
"SGD"
decay
:
float
=
0.0
nesterov
:
bool
=
False
momentum
:
float
=
0.0
@
dataclasses
.
dataclass
class
RMSPropConfig
(
BaseOptimizerConfig
):
"""Configuration for RMSProp optimizer.
The attributes for this class matches the arguments of
tf.keras.optimizers.RMSprop.
Attributes:
name: name of the optimizer.
rho: discounting factor for RMSprop optimizer.
momentum: momentum for RMSprop optimizer.
epsilon: epsilon value for RMSprop optimizer, help with numerical stability.
centered: Whether to normalize gradients or not.
"""
name
:
str
=
"RMSprop"
rho
:
float
=
0.9
momentum
:
float
=
0.0
epsilon
:
float
=
1e-7
centered
:
bool
=
False
@
dataclasses
.
dataclass
class
AdagradConfig
(
BaseOptimizerConfig
):
"""Configuration for Adagrad optimizer.
The attributes of this class match the arguments of
tf.keras.optimizer.Adagrad.
Attributes:
name: name of the optimizer.
initial_accumulator_value: A floating point value. Starting value for the
accumulators, must be non-negative.
epsilon: A small floating point value to avoid zero denominator.
"""
name
:
str
=
"Adagrad"
initial_accumulator_value
:
float
=
0.1
epsilon
:
float
=
1e-07
@
dataclasses
.
dataclass
class
AdamConfig
(
BaseOptimizerConfig
):
"""Configuration for Adam optimizer.
The attributes for this class matches the arguments of
tf.keras.optimizer.Adam.
Attributes:
name: name of the optimizer.
beta_1: decay rate for 1st order moments.
beta_2: decay rate for 2st order moments.
epsilon: epsilon value used for numerical stability in Adam optimizer.
amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
the paper "On the Convergence of Adam and beyond".
"""
name
:
str
=
"Adam"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-07
amsgrad
:
bool
=
False
@
dataclasses
.
dataclass
class
AdamWeightDecayConfig
(
BaseOptimizerConfig
):
"""Configuration for Adam optimizer with weight decay.
Attributes:
name: name of the optimizer.
beta_1: decay rate for 1st order moments.
beta_2: decay rate for 2st order moments.
epsilon: epsilon value used for numerical stability in the optimizer.
amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
the paper "On the Convergence of Adam and beyond".
weight_decay_rate: float. Weight decay rate. Default to 0.
include_in_weight_decay: list[str], or None. List of weight names to include
in weight decay.
exclude_from_weight_decay: list[str], or None. List of weight names to not
include in weight decay.
gradient_clip_norm: A positive float. Clips the gradients to this maximum
L2-norm. Default to 1.0.
"""
name
:
str
=
"AdamWeightDecay"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-07
amsgrad
:
bool
=
False
weight_decay_rate
:
float
=
0.0
include_in_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
gradient_clip_norm
:
float
=
1.0
@
dataclasses
.
dataclass
class
LAMBConfig
(
BaseOptimizerConfig
):
"""Configuration for LAMB optimizer.
The attributes for this class matches the arguments of
tensorflow_addons.optimizers.LAMB.
Attributes:
name: name of the optimizer.
beta_1: decay rate for 1st order moments.
beta_2: decay rate for 2st order moments.
epsilon: epsilon value used for numerical stability in LAMB optimizer.
weight_decay_rate: float. Weight decay rate. Default to 0.
exclude_from_weight_decay: List of regex patterns of variables excluded from
weight decay. Variables whose name contain a substring matching the
pattern will be excluded.
exclude_from_layer_adaptation: List of regex patterns of variables excluded
from layer adaptation. Variables whose name contain a substring matching
the pattern will be excluded.
"""
name
:
str
=
"LAMB"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-6
weight_decay_rate
:
float
=
0.0
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
@
dataclasses
.
dataclass
class
EMAConfig
(
BaseOptimizerConfig
):
"""Exponential moving average optimizer config.
Attributes:
name: 'str', name of the optimizer.
trainable_weights_only: 'bool', if True, only model trainable weights will
be updated. Otherwise, all model weights will be updated. This mainly
affects batch normalization parameters.
average_decay: 'float', average decay value.
start_step: 'int', start step to apply moving average.
dynamic_decay: 'bool', whether to apply dynamic decay or not.
"""
name
:
str
=
"ExponentialMovingAverage"
trainable_weights_only
:
bool
=
True
average_decay
:
float
=
0.99
start_step
:
int
=
0
dynamic_decay
:
bool
=
True
@
dataclasses
.
dataclass
class
LARSConfig
(
BaseOptimizerConfig
):
"""Layer-wise adaptive rate scaling config.
Attributes:
name: 'str', name of the optimizer.
momentum: `float` hyperparameter >= 0 that accelerates gradient descent in
the relevant direction and dampens oscillations. Defaults to 0.9.
eeta: `float` LARS coefficient as used in the paper. Default set to LARS
coefficient from the paper. (eeta / weight_decay) determines the highest
scaling factor in LARS..
weight_decay_rate: `float` for weight decay.
nesterov: 'boolean' for whether to use nesterov momentum.
classic_momentum: `boolean` for whether to use classic (or popular)
momentum. The learning rate is applied during momentum update in classic
momentum, but after momentum for popular momentum.
exclude_from_weight_decay: A list of `string` for variable screening, if any
of the string appears in a variable's name, the variable will be excluded
for computing weight decay. For example, one could specify the list like
['batch_normalization', 'bias'] to exclude BN and bias from weight decay.
exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but for
layer adaptation. If it is None, it will be defaulted the same as
exclude_from_weight_decay.
"""
name
:
str
=
"LARS"
momentum
:
float
=
0.9
eeta
:
float
=
0.001
weight_decay_rate
:
float
=
0.0
nesterov
:
bool
=
False
classic_momentum
:
bool
=
True
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
@
dataclasses
.
dataclass
class
SLIDEConfig
(
BaseOptimizerConfig
):
"""Configuration for SLIDE optimizer.
Details coming soon.
"""
name
:
str
=
"SLIDE"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-6
weight_decay_rate
:
float
=
0.0
weight_decay_type
:
str
=
"inner"
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
include_in_sparse_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
sparse_layer_learning_rate
:
float
=
0.1
do_gradient_rescaling
:
bool
=
True
norm_type
:
str
=
"layer"
ratio_clip_norm
:
float
=
1e5
@
dataclasses
.
dataclass
class
AdafactorConfig
(
BaseOptimizerConfig
):
"""Configuration for Adafactor optimizer.
The attributes for this class matches the arguments of the Adafactor
implementation.
"""
name
:
str
=
"Adafactor"
factored
:
bool
=
True
multiply_by_parameter_scale
:
bool
=
True
beta1
:
Optional
[
float
]
=
None
decay_rate
:
float
=
0.8
step_offset
:
int
=
0
clipping_threshold
:
float
=
1.0
min_dim_size_to_factor
:
int
=
128
epsilon1
:
float
=
1e-30
epsilon2
:
float
=
1e-3
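
Since these optimizer configs are plain dataclasses layered on base_config.Config, individual fields can be overridden at construction time. A small sketch, assuming keyword construction behaves like any other dataclass here (the values are illustrative, not recommendations):

```python
from official.modeling.optimization.configs import optimizer_config as opt_cfg

# Override only the fields that differ from the defaults defined above.
adamw = opt_cfg.AdamWeightDecayConfig(
    weight_decay_rate=0.01,
    exclude_from_weight_decay=['LayerNorm', 'bias'])

print(adamw.beta_1, adamw.weight_decay_rate)  # 0.9 (default), 0.01 (override)
```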
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/ema_optimizer.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exponential moving average optimizer."""
from
typing
import
List
,
Optional
,
Text
import
tensorflow
as
tf
# pylint: disable=protected-access
class
ExponentialMovingAverage
(
tf
.
keras
.
optimizers
.
Optimizer
):
"""Optimizer that computes an exponential moving average of the variables.
Empirically it has been found that using the moving average of the trained
parameters of a deep network is better than using its trained parameters
directly. This optimizer allows you to compute this moving average and swap
the variables at save time so that any code outside of the training loop
will use by default the average values instead of the original ones.
Example of usage for training:
```python
opt = tf.keras.optimizers.SGD(learning_rate)
opt = ExponentialMovingAverage(opt)
opt.shadow_copy(model)
```
At test time, swap the shadow variables to evaluate on the averaged weights:
```python
opt.swap_weights()
# Test eval the model here
opt.swap_weights()
```
"""
def
__init__
(
self
,
optimizer
:
tf
.
keras
.
optimizers
.
Optimizer
,
trainable_weights_only
:
bool
=
True
,
average_decay
:
float
=
0.99
,
start_step
:
int
=
0
,
dynamic_decay
:
bool
=
True
,
name
:
Text
=
'ExponentialMovingAverage'
,
**
kwargs
):
"""Construct a new ExponentialMovingAverage optimizer.
Args:
optimizer: `tf.keras.optimizers.Optimizer` that will be
used to compute and apply gradients.
trainable_weights_only: 'bool', if True, only model trainable weights will
be updated. Otherwise, all model weights will be updated. This mainly
affects batch normalization parameters.
average_decay: float. Decay to use to maintain the moving averages
of trained variables.
start_step: int. What step to start the moving average.
dynamic_decay: bool. Whether to change the decay based on the number
of optimizer updates. Decay will start at 0.1 and gradually increase
up to `average_decay` after each optimizer update. This behavior is
similar to `tf.train.ExponentialMovingAverage` in TF 1.x.
name: Optional name for the operations created when applying
gradients. Defaults to "moving_average".
**kwargs: keyword arguments. Allowed to be {`clipnorm`,
`clipvalue`, `lr`, `decay`}.
"""
super
().
__init__
(
name
,
**
kwargs
)
self
.
_average_decay
=
average_decay
self
.
_trainable_weights_only
=
trainable_weights_only
self
.
_start_step
=
tf
.
constant
(
start_step
,
tf
.
float32
)
self
.
_dynamic_decay
=
dynamic_decay
self
.
_optimizer
=
optimizer
self
.
_track_trackable
(
self
.
_optimizer
,
'base_optimizer'
)
self
.
_average_weights
=
None
self
.
_model_weights
=
None
def
shadow_copy
(
self
,
model
:
tf
.
keras
.
Model
):
"""Creates shadow variables for the given model weights."""
if
self
.
_trainable_weights_only
:
self
.
_model_weights
=
model
.
trainable_variables
else
:
self
.
_model_weights
=
model
.
variables
for
var
in
self
.
_model_weights
:
self
.
add_slot
(
var
,
'average'
,
initializer
=
'zeros'
)
self
.
_average_weights
=
[
self
.
get_slot
(
var
,
'average'
)
for
var
in
self
.
_model_weights
]
@
property
def
has_shadow_copy
(
self
):
"""Whether this optimizer has created shadow variables."""
return
self
.
_model_weights
is
not
None
and
self
.
_average_weights
is
not
None
def
_create_slots
(
self
,
var_list
):
self
.
_optimizer
.
_create_slots
(
var_list
=
var_list
)
# pylint: disable=protected-access
def
apply_gradients
(
self
,
grads_and_vars
,
name
:
Optional
[
Text
]
=
None
):
result
=
self
.
_optimizer
.
apply_gradients
(
grads_and_vars
,
name
)
self
.
update_average
(
self
.
iterations
)
return
result
@
tf
.
function
def
update_average
(
self
,
step
:
tf
.
Tensor
):
step
=
tf
.
cast
(
step
,
tf
.
float32
)
if
step
<
self
.
_start_step
:
decay
=
tf
.
constant
(
0.
,
tf
.
float32
)
elif
self
.
_dynamic_decay
:
decay
=
step
-
self
.
_start_step
decay
=
tf
.
minimum
(
self
.
_average_decay
,
(
1.
+
decay
)
/
(
10.
+
decay
))
else
:
decay
=
self
.
_average_decay
def
_apply_moving
(
v_moving
,
v_normal
):
diff
=
v_moving
-
v_normal
v_moving
.
assign_sub
(
tf
.
cast
(
1.
-
decay
,
v_moving
.
dtype
)
*
diff
)
return
v_moving
def
_update
(
strategy
,
v_moving_and_v_normal
):
for
v_moving
,
v_normal
in
v_moving_and_v_normal
:
strategy
.
extended
.
update
(
v_moving
,
_apply_moving
,
args
=
(
v_normal
,))
ctx
=
tf
.
distribute
.
get_replica_context
()
return
ctx
.
merge_call
(
_update
,
args
=
(
zip
(
self
.
_average_weights
,
self
.
_model_weights
),))
def
swap_weights
(
self
):
"""Swap the average and moving weights.
This is a convenience method to allow one to evaluate the averaged weights
at test time. Loads the weights stored in `self._average` into the model,
keeping a copy of the original model weights. Swapping twice will return
the original weights.
"""
if
tf
.
distribute
.
in_cross_replica_context
():
strategy
=
tf
.
distribute
.
get_strategy
()
strategy
.
run
(
self
.
_swap_weights
,
args
=
())
else
:
raise
ValueError
(
'Swapping weights must occur under a '
'tf.distribute.Strategy'
)
@
tf
.
function
def
_swap_weights
(
self
):
def
fn_0
(
a
,
b
):
a
.
assign_add
(
b
)
return
a
def
fn_1
(
b
,
a
):
b
.
assign
(
a
-
b
)
return
b
def
fn_2
(
a
,
b
):
a
.
assign_sub
(
b
)
return
a
def
swap
(
strategy
,
a_and_b
):
"""Swap `a` and `b` and mirror to all devices."""
for
a
,
b
in
a_and_b
:
strategy
.
extended
.
update
(
a
,
fn_0
,
args
=
(
b
,))
# a = a + b
strategy
.
extended
.
update
(
b
,
fn_1
,
args
=
(
a
,))
# b = a - b
strategy
.
extended
.
update
(
a
,
fn_2
,
args
=
(
b
,))
# a = a - b
ctx
=
tf
.
distribute
.
get_replica_context
()
return
ctx
.
merge_call
(
swap
,
args
=
(
zip
(
self
.
_average_weights
,
self
.
_model_weights
),))
def
assign_average_vars
(
self
,
var_list
:
List
[
tf
.
Variable
]):
"""Assign variables in var_list with their respective averages.
Args:
var_list: List of model variables to be assigned to their average.
Returns:
assign_op: The op corresponding to the assignment operation of
variables to their average.
"""
assign_op
=
tf
.
group
([
var
.
assign
(
self
.
get_slot
(
var
,
'average'
))
for
var
in
var_list
if
var
.
trainable
])
return
assign_op
def
_create_hypers
(
self
):
self
.
_optimizer
.
_create_hypers
()
# pylint: disable=protected-access
def
_prepare
(
self
,
var_list
):
return
self
.
_optimizer
.
_prepare
(
var_list
=
var_list
)
# pylint: disable=protected-access
@
property
def
iterations
(
self
):
return
self
.
_optimizer
.
iterations
@
iterations
.
setter
def
iterations
(
self
,
variable
):
self
.
_optimizer
.
iterations
=
variable
@
property
def
weights
(
self
):
# return self._weights + self._optimizer.weights
return
self
.
_optimizer
.
weights
def
variables
(
self
):
return
self
.
_weights
+
[
self
.
iterations
]
@
property
def
lr
(
self
):
return
self
.
_optimizer
.
_get_hyper
(
'learning_rate'
)
@
lr
.
setter
def
lr
(
self
,
lr
):
self
.
_optimizer
.
_set_hyper
(
'learning_rate'
,
lr
)
@
property
def
learning_rate
(
self
):
return
self
.
_optimizer
.
_get_hyper
(
'learning_rate'
)
@
learning_rate
.
setter
def
learning_rate
(
self
,
learning_rate
):
# pylint: disable=redefined-outer-name
self
.
_optimizer
.
_set_hyper
(
'learning_rate'
,
learning_rate
)
def
_resource_apply_dense
(
self
,
grad
,
var
):
return
self
.
_optimizer
.
_resource_apply_dense
(
grad
,
var
)
def
_resource_apply_sparse
(
self
,
grad
,
var
,
indices
):
return
self
.
_optimizer
.
_resource_apply_sparse
(
grad
,
var
,
indices
)
def
_resource_apply_sparse_duplicate_indices
(
self
,
grad
,
var
,
indices
):
return
self
.
_optimizer
.
_resource_apply_sparse_duplicate_indices
(
grad
,
var
,
indices
)
def
get_config
(
self
):
config
=
{
'optimizer'
:
tf
.
keras
.
optimizers
.
serialize
(
self
.
_optimizer
),
'average_decay'
:
self
.
_average_decay
,
'start_step'
:
self
.
_start_step
,
'dynamic_decay'
:
self
.
_dynamic_decay
,
}
base_config
=
super
(
ExponentialMovingAverage
,
self
).
get_config
()
return
dict
(
list
(
base_config
.
items
())
+
list
(
config
.
items
()))
@
classmethod
def
from_config
(
cls
,
config
,
custom_objects
=
None
):
optimizer
=
tf
.
keras
.
optimizers
.
deserialize
(
config
.
pop
(
'optimizer'
),
custom_objects
=
custom_objects
,
)
return
cls
(
optimizer
,
**
config
)
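
Pulling the pieces of this class together, here is a hedged sketch of the workflow described in the class docstring: wrap a base optimizer, create the shadow slots, train as usual, then copy the averages into the model for export. The model and training step are placeholders, and a TF 2.x release where this file's tf.keras.optimizers.Optimizer base is the V2 optimizer is assumed:

```python
import tensorflow as tf

from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage

# Placeholder model; only the optimizer wiring matters in this sketch.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

base_opt = tf.keras.optimizers.SGD(learning_rate=0.1)
ema_opt = ExponentialMovingAverage(base_opt, average_decay=0.99)
ema_opt.shadow_copy(model)  # creates the 'average' slot per trainable weight

# ... training loop calling ema_opt.apply_gradients(...) goes here ...

# Before export/eval, load the averaged values into the model variables
# (or use swap_weights() under a tf.distribute.Strategy as shown above).
ema_opt.assign_average_vars(model.trainable_variables)
```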
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lars_optimizer.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Layer-wise adaptive rate scaling optimizer."""
import
re
from
typing
import
Text
,
List
,
Optional
import
tensorflow
as
tf
# pylint: disable=protected-access
class
LARS
(
tf
.
keras
.
optimizers
.
Optimizer
):
"""Layer-wise Adaptive Rate Scaling for large batch training.
Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
"""
def
__init__
(
self
,
learning_rate
:
float
=
0.01
,
momentum
:
float
=
0.9
,
weight_decay_rate
:
float
=
0.0
,
eeta
:
float
=
0.001
,
nesterov
:
bool
=
False
,
classic_momentum
:
bool
=
True
,
exclude_from_weight_decay
:
Optional
[
List
[
Text
]]
=
None
,
exclude_from_layer_adaptation
:
Optional
[
List
[
Text
]]
=
None
,
name
:
Text
=
"LARS"
,
**
kwargs
):
"""Constructs a LARSOptimizer.
Args:
learning_rate: `float` for learning rate. Defaults to 0.01.
momentum: `float` hyperparameter >= 0 that accelerates gradient descent
in the relevant direction and dampens oscillations. Defaults to 0.9.
weight_decay_rate: `float` for weight decay.
eeta: `float` LARS coefficient as used in the paper. Default set to LARS
coefficient from the paper. (eeta / weight_decay) determines the
highest scaling factor in LARS..
nesterov: 'boolean' for whether to use nesterov momentum.
classic_momentum: `boolean` for whether to use classic (or popular)
momentum. The learning rate is applied during momentum update in
classic momentum, but after momentum for popular momentum.
exclude_from_weight_decay: A list of `string` for variable screening, if
any of the string appears in a variable's name, the variable will be
excluded for computing weight decay. For example, one could specify
the list like ['batch_normalization', 'bias'] to exclude BN and bias
from weight decay.
exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
for layer adaptation. If it is None, it will be defaulted the same as
exclude_from_weight_decay.
name: `Text` as optional name for the operations created when applying
gradients. Defaults to "LARS".
**kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
`decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
gradients by value, `decay` is included for backward compatibility to
allow time inverse decay of learning rate. `lr` is included for
backward compatibility, recommended to use `learning_rate` instead.
"""
super
(
LARS
,
self
).
__init__
(
name
,
**
kwargs
)
self
.
_set_hyper
(
"learning_rate"
,
learning_rate
)
self
.
_set_hyper
(
"decay"
,
self
.
_initial_decay
)
self
.
momentum
=
momentum
self
.
weight_decay_rate
=
weight_decay_rate
self
.
eeta
=
eeta
self
.
nesterov
=
nesterov
self
.
classic_momentum
=
classic_momentum
self
.
exclude_from_weight_decay
=
exclude_from_weight_decay
# exclude_from_layer_adaptation is set to exclude_from_weight_decay if the
# arg is None.
if
exclude_from_layer_adaptation
:
self
.
exclude_from_layer_adaptation
=
exclude_from_layer_adaptation
else
:
self
.
exclude_from_layer_adaptation
=
exclude_from_weight_decay
def
_create_slots
(
self
,
var_list
):
for
v
in
var_list
:
self
.
add_slot
(
v
,
"momentum"
)
def
_resource_apply_dense
(
self
,
grad
,
param
,
apply_state
=
None
):
if
grad
is
None
or
param
is
None
:
return
tf
.
no_op
()
var_device
,
var_dtype
=
param
.
device
,
param
.
dtype
.
base_dtype
coefficients
=
((
apply_state
or
{}).
get
((
var_device
,
var_dtype
))
or
self
.
_fallback_apply_state
(
var_device
,
var_dtype
))
learning_rate
=
coefficients
[
"lr_t"
]
param_name
=
param
.
name
v
=
self
.
get_slot
(
param
,
"momentum"
)
if
self
.
_use_weight_decay
(
param_name
):
grad
+=
self
.
weight_decay_rate
*
param
if
self
.
classic_momentum
:
trust_ratio
=
1.0
if
self
.
_do_layer_adaptation
(
param_name
):
w_norm
=
tf
.
norm
(
param
,
ord
=
2
)
g_norm
=
tf
.
norm
(
grad
,
ord
=
2
)
trust_ratio
=
tf
.
where
(
tf
.
greater
(
w_norm
,
0
),
tf
.
where
(
tf
.
greater
(
g_norm
,
0
),
(
self
.
eeta
*
w_norm
/
g_norm
),
1.0
),
1.0
)
scaled_lr
=
learning_rate
*
trust_ratio
next_v
=
tf
.
multiply
(
self
.
momentum
,
v
)
+
scaled_lr
*
grad
if
self
.
nesterov
:
update
=
tf
.
multiply
(
self
.
momentum
,
next_v
)
+
scaled_lr
*
grad
else
:
update
=
next_v
next_param
=
param
-
update
else
:
next_v
=
tf
.
multiply
(
self
.
momentum
,
v
)
+
grad
if
self
.
nesterov
:
update
=
tf
.
multiply
(
self
.
momentum
,
next_v
)
+
grad
else
:
update
=
next_v
trust_ratio
=
1.0
if
self
.
_do_layer_adaptation
(
param_name
):
w_norm
=
tf
.
norm
(
param
,
ord
=
2
)
v_norm
=
tf
.
norm
(
update
,
ord
=
2
)
trust_ratio
=
tf
.
where
(
tf
.
greater
(
w_norm
,
0
),
tf
.
where
(
tf
.
greater
(
v_norm
,
0
),
(
self
.
eeta
*
w_norm
/
v_norm
),
1.0
),
1.0
)
scaled_lr
=
trust_ratio
*
learning_rate
next_param
=
param
-
scaled_lr
*
update
return
tf
.
group
(
*
[
param
.
assign
(
next_param
,
use_locking
=
False
),
v
.
assign
(
next_v
,
use_locking
=
False
)
])
def
_resource_apply_sparse
(
self
,
grad
,
handle
,
indices
,
apply_state
):
raise
NotImplementedError
(
"Applying sparse gradients is not implemented."
)
def
_use_weight_decay
(
self
,
param_name
):
"""Whether to use L2 weight decay for `param_name`."""
if
not
self
.
weight_decay_rate
:
return
False
if
self
.
exclude_from_weight_decay
:
for
r
in
self
.
exclude_from_weight_decay
:
if
re
.
search
(
r
,
param_name
)
is
not
None
:
return
False
return
True
def
_do_layer_adaptation
(
self
,
param_name
):
"""Whether to do layer-wise learning rate adaptation for `param_name`."""
if
self
.
exclude_from_layer_adaptation
:
for
r
in
self
.
exclude_from_layer_adaptation
:
if
re
.
search
(
r
,
param_name
)
is
not
None
:
return
False
return
True
def
get_config
(
self
):
config
=
super
(
LARS
,
self
).
get_config
()
config
.
update
({
"learning_rate"
:
self
.
_serialize_hyperparameter
(
"learning_rate"
),
"decay"
:
self
.
_serialize_hyperparameter
(
"decay"
),
"momentum"
:
self
.
momentum
,
"classic_momentum"
:
self
.
classic_momentum
,
"weight_decay_rate"
:
self
.
weight_decay_rate
,
"eeta"
:
self
.
eeta
,
"nesterov"
:
self
.
nesterov
,
})
return
config
@
classmethod
def
from_config
(
cls
,
config
,
custom_objects
=
None
):
return
cls
(
**
config
)
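
A hedged sketch of driving this LARS implementation directly on a toy variable, assuming a TF 2.x release where tf.keras.optimizers.Optimizer is the V2 base class this file subclasses; the hyperparameters and the variable name are illustrative:

```python
import tensorflow as tf

from official.modeling.optimization.lars_optimizer import LARS

opt = LARS(learning_rate=0.1, momentum=0.9, weight_decay_rate=1e-4,
           exclude_from_weight_decay=['batch_normalization', 'bias'])

w = tf.Variable([[1.0, -2.0]], name='dense_kernel')
with tf.GradientTape() as tape:
  loss = tf.reduce_sum(tf.square(w))
grads = tape.gradient(loss, [w])

# One momentum step whose learning rate is scaled by the layer-wise trust
# ratio eeta * ||w|| / ||g|| computed in _resource_apply_dense above.
opt.apply_gradients(zip(grads, [w]))
print(w.numpy())
```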
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Learning rate schedule classes."""
import
math
from
typing
import
Mapping
,
Any
,
Union
,
Optional
import
tensorflow
as
tf
def
_make_offset_wrapper
(
new_class_name
:
str
,
base_lr_class
):
"""Generates a offset wrapper of learning rate schedule.
It will returns a subclass of the the `base_lr_class`, the subclass takes an
`offset` argument in the constructor. When the new class instance is called,
the behavior is:
new_class_object(step) = base_lr_class_object(step - offset)
Example:
CosineDecayWithOffset = _make_offset_wrapper(
'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
# Use the lr:
lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
decay_steps=1000)
lr(101) # equals to tf.keras.experimental.CosineDecay(...)(101-100)
Args:
new_class_name: the name of the new class.
base_lr_class: the base learning rate schedule class. Should be subclass of
tf.keras.optimizers.schedules.LearningRateSchedule
Returns:
A new class (subclass of the base_lr_class) that can take an offset.
"""
assert
issubclass
(
base_lr_class
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
),
(
"base_lr_class should be subclass of keras "
f
"LearningRateSchedule, got
{
base_lr_class
}
"
)
# pylint: disable=protected-access,pointless-statement
def
offset_learning_rate_init
(
self
,
offset
=
0
,
**
kwargs
):
"""Construct learning rate schedule object.
When this object is called, its behavior is
self.__call__(step) == base_lr_class.__call__(step - offset)
Args:
self: this object.
offset: The offset when computing the learning rate schedule.
**kwargs: Pass through to base learning rate class constructor.
"""
base_lr_class
.
__init__
(
self
,
**
kwargs
)
self
.
_offset
=
offset
def
offset_learning_rate_call
(
self
,
step
):
step
=
tf
.
cast
(
step
-
self
.
_offset
,
tf
.
float32
)
return
base_lr_class
.
__call__
(
self
,
step
)
# pylint: enable=protected-access,pointless-statement
return
type
(
new_class_name
,
(
base_lr_class
,),
{
"base_lr_class"
:
base_lr_class
,
"__init__"
:
offset_learning_rate_init
,
"__call__"
:
offset_learning_rate_call
})
PiecewiseConstantDecayWithOffset
=
_make_offset_wrapper
(
"PiecewiseConstantDecayWithOffset"
,
tf
.
keras
.
optimizers
.
schedules
.
PiecewiseConstantDecay
)
PolynomialDecayWithOffset
=
_make_offset_wrapper
(
"PolynomialDecayWithOffset"
,
tf
.
keras
.
optimizers
.
schedules
.
PolynomialDecay
)
ExponentialDecayWithOffset
=
_make_offset_wrapper
(
"ExponentialDecayWithOffset"
,
tf
.
keras
.
optimizers
.
schedules
.
ExponentialDecay
)
CosineDecayWithOffset
=
_make_offset_wrapper
(
"CosineDecayWithOffset"
,
tf
.
keras
.
experimental
.
CosineDecay
)
class
LinearWarmup
(
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
"""Linear warmup schedule."""
def
__init__
(
self
,
after_warmup_lr_sched
:
Union
[
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
,
float
],
warmup_steps
:
int
,
warmup_learning_rate
:
float
,
name
:
Optional
[
str
]
=
None
):
"""Add linear warmup schedule to a learning rate schedule.
warmup_lr is the initial learning rate, the final learning rate of the
init_warmup period is the initial learning rate of lr_schedule in use.
The learning rate at each step linearly increased according to the following
formula:
learning_rate = warmup_lr + step / warmup_steps
* (final_warmup_lr - warmup_lr).
Using warmup overrides the learning rate schedule by the number of warmup
steps.
Args:
after_warmup_lr_sched: tf.keras.optimizers.schedules .LearningRateSchedule
or a constant.
warmup_steps: Number of the warmup steps.
warmup_learning_rate: Initial learning rate for the warmup.
name: Optional, name of warmup schedule.
"""
super
().
__init__
()
self
.
_name
=
name
self
.
_after_warmup_lr_sched
=
after_warmup_lr_sched
self
.
_warmup_steps
=
warmup_steps
self
.
_init_warmup_lr
=
warmup_learning_rate
if
isinstance
(
after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
self
.
_final_warmup_lr
=
after_warmup_lr_sched
(
warmup_steps
)
else
:
self
.
_final_warmup_lr
=
tf
.
cast
(
after_warmup_lr_sched
,
dtype
=
tf
.
float32
)
def
__call__
(
self
,
step
:
int
):
global_step
=
tf
.
cast
(
step
,
dtype
=
tf
.
float32
)
linear_warmup_lr
=
(
self
.
_init_warmup_lr
+
global_step
/
self
.
_warmup_steps
*
(
self
.
_final_warmup_lr
-
self
.
_init_warmup_lr
))
if
isinstance
(
self
.
_after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
after_warmup_lr
=
self
.
_after_warmup_lr_sched
(
step
)
else
:
after_warmup_lr
=
tf
.
cast
(
self
.
_after_warmup_lr_sched
,
dtype
=
tf
.
float32
)
lr
=
tf
.
cond
(
global_step
<
self
.
_warmup_steps
,
lambda
:
linear_warmup_lr
,
lambda
:
after_warmup_lr
)
return
lr
def
get_config
(
self
)
->
Mapping
[
str
,
Any
]:
if
isinstance
(
self
.
_after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
config
=
{
"after_warmup_lr_sched"
:
self
.
_after_warmup_lr_sched
.
get_config
()}
# pytype: disable=attribute-error
else
:
config
=
{
"after_warmup_lr_sched"
:
self
.
_after_warmup_lr_sched
}
# pytype: disable=attribute-error
config
.
update
({
"warmup_steps"
:
self
.
_warmup_steps
,
"warmup_learning_rate"
:
self
.
_init_warmup_lr
,
"name"
:
self
.
_name
})
return
config
class
PolynomialWarmUp
(
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
"""Applies polynomial warmup schedule on a given learning rate decay schedule."""
def
__init__
(
self
,
after_warmup_lr_sched
:
Union
[
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
,
float
],
warmup_steps
:
int
,
power
:
float
=
1.0
,
name
:
str
=
"PolynomialWarmup"
):
super
().
__init__
()
if
isinstance
(
after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
self
.
_initial_learning_rate = after_warmup_lr_sched(warmup_steps)
    else:
      self._initial_learning_rate = tf.cast(
          after_warmup_lr_sched, dtype=tf.float32)

    self._warmup_steps = warmup_steps
    self._power = power
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PolynomialWarmUp") as name:
      # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
      # learning rate will be `global_step/num_warmup_steps * init_lr`.
      global_step_float = tf.cast(step, tf.float32)
      warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)

      if self._warmup_steps <= 0:
        warmup_percent_done = 1.0
      else:
        # A zero `step` may cause Inf. So make `step` positive.
        step_non_zero = tf.math.maximum(global_step_float, 1.0)
        warmup_percent_done = step_non_zero / warmup_steps_float

      warmup_learning_rate = (
          self._initial_learning_rate *
          tf.math.pow(warmup_percent_done, self._power))

      if isinstance(self._after_warmup_lr_sched,
                    tf.keras.optimizers.schedules.LearningRateSchedule):
        after_warmup_lr = self._after_warmup_lr_sched(step)
      else:
        after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

      return tf.cond(
          global_step_float < warmup_steps_float,
          lambda: warmup_learning_rate,
          lambda: after_warmup_lr,
          name=name)

  def get_config(self) -> Mapping[str, Any]:
    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()
      }  # pytype: disable=attribute-error
    else:
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched
      }  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "power": self._power,
        "name": self._name
    })
    return config


class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule that follows lr * (step)^power."""

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               name: str = "DirectPowerDecay"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "DirectPowerDecay"):
      step = tf.cast(step, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      return learning_rate

  def get_config(self):
    """Gets the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "name": self._name,
    }


class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Power decay multiplied by a linear decay at the end.

  The schedule has the following behavior. Let offset_step = step - offset.

  1) offset_step < 0: the actual learning rate equals initial_learning_rate.
  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction): the
     actual learning rate equals lr * offset_step^power.
  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps: the actual learning rate equals lr * offset_step^power *
     (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) offset_step >= total_decay_steps: the actual learning rate equals zero.
  """

  def __init__(self,
               initial_learning_rate: float,
               total_decay_steps: int,
               power: float = 1.0,
               linear_decay_fraction: float = 0.1,
               offset: int = 0,
               name: str = "PowerAndLinearDecay"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      total_decay_steps: The total number of steps for power + linear decay.
      power: The order of the polynomial.
      linear_decay_fraction: In the last `linear_decay_fraction` steps, the
        learning rate will be multiplied by a linear decay.
      offset: The offset applied to steps.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._total_decay_steps = total_decay_steps
    self._power = power
    self._linear_decay_fraction = linear_decay_fraction
    self._offset = offset
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerAndLinearDecay"):
      step = tf.cast(step - self._offset, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      if self._total_decay_steps * self._linear_decay_fraction > 0:
        learning_rate *= tf.minimum(
            1.0, (self._total_decay_steps - step) /
            (self._total_decay_steps * self._linear_decay_fraction))
        learning_rate = tf.maximum(0.0, learning_rate)
      return learning_rate

  def get_config(self):
    """Gets the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "total_decay_steps": self._total_decay_steps,
        "power": self._power,
        "linear_decay_fraction": self._linear_decay_fraction,
        "offset": self._offset,
        "name": self._name,
    }


class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Power learning rate decay with offset.

  The learning rate equals `pre_offset_learning_rate` if `step` < `offset`.
  Otherwise, the learning rate equals lr * (step - offset)^power.
  """

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               offset: int = 0,
               pre_offset_learning_rate: float = 1.0e6,
               name: str = "PowerDecayWithOffset"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      offset: The offset when computing the power decay.
      pre_offset_learning_rate: The maximum learning rate we'll use.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._offset = offset
    self._pre_offset_lr = pre_offset_learning_rate
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerDecayWithOffset"):
      step = tf.cast(step, tf.float32)
      lr_after_offset = tf.math.pow(
          tf.math.maximum(step - self._offset, 1.0), self._power) * (
              self._initial_learning_rate)

      sign = tf.cast(step > self._offset, tf.float32)
      lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset
      # Power may give infinitely large LR. So cap it with pre_offset_lr.
      return tf.math.minimum(lr_combined, self._pre_offset_lr)

  def get_config(self):
    """Gets the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "offset": self._offset,
        "pre_offset_learning_rate": self._pre_offset_lr,
        "name": self._name,
    }


class StepConsineDecayWithOffset(
    tf.keras.optimizers.schedules.LearningRateSchedule):
  """Stepwise cosine learning rate decay with offset.

  The learning rate is equivalent to one or more cosine decay(s), each starting
  and ending at an interval boundary.

  Example:

  ```python
  boundaries = [100000, 110000]
  values = [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepConsineDecayWithOffset(
          boundaries,
          values))
  ```

  From step 0 to step 100000, the learning rate cosine decays from 1.0 to 0.5.
  From step 100000 to step 110000, it cosine decays from 0.5 to 0.0.
  """

  def __init__(self,
               boundaries,
               values,
               offset: int = 0,
               name: str = "StepConsineDecayWithOffset"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      boundaries: A list of `Tensor`s or `int`s with strictly increasing
        entries, and with all elements having the same type as the optimizer
        step.
      values: A list of `Tensor`s or `float`s that specifies the values for the
        intervals defined by `boundaries`. It should have one more element than
        `boundaries`, and all elements should have the same type.
      offset: The offset when computing the power decay.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self.values = values
    self.boundaries = boundaries
    self.offset = offset
    self.name = name

    if len(self.values) < 1:
      raise ValueError(f"Expect non empty {self.values}")
    if len(self.boundaries) != len(self.values):
      raise ValueError(
          "Boundaries length must equal learning rate levels length "
          f"{len(self.boundaries)} != {len(self.values)}")
    self.total_steps = ([
        boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)
    ] + [0])

  def __call__(self, global_step):
    with tf.name_scope(self.name or "StepConsineDecayWithOffset"):
      global_step = tf.cast(global_step - self.offset, tf.float32)
      lr_levels = self.values
      lr_steps = self.boundaries
      level_total_steps = self.total_steps
      num_levels = len(lr_levels)

      init_lr = lr_levels[0]
      next_init_lr = lr_levels[1] if num_levels > 1 else 0.
      init_total_steps = level_total_steps[0]

      cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
          tf.constant(math.pi) * (global_step) /
          (init_total_steps)) + 1.0) / 2.0 + next_init_lr)
      learning_rate = cosine_learning_rate

      tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                cosine_learning_rate)
      tf.compat.v1.logging.info("DEBUG lr %r next lr %r inittotalstep %r",
                                init_lr, next_init_lr, init_total_steps)
      for i in range(1, num_levels):
        next_init_lr = lr_levels[i]
        next_start_step = lr_steps[i]
        next_total_steps = level_total_steps[i]
        next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.
        tf.compat.v1.logging.info(
            "DEBUG step %r nilr %r nss %r nts %r nnilr %r", global_step,
            next_init_lr, next_start_step, next_total_steps, next_next_init_lr)

        next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
                                     (tf.cos(
                                         tf.constant(math.pi) *
                                         (global_step - next_start_step) /
                                         (next_total_steps)) + 1.0) / 2.0 +
                                     next_next_init_lr)
        learning_rate = tf.where(global_step >= next_start_step,
                                 next_cosine_learning_rate, learning_rate)
        tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                  next_cosine_learning_rate)

      return learning_rate

  def get_config(self):
    return {
        "boundaries": self.boundaries,
        "values": self.values,
        "offset": self.offset,
        "name": self.name
    }
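A minimal eager-mode sketch of how the schedules above can be queried (illustrative only, not part of lr_schedule.py; it uses only the constructors and defaults shown above):

```python
from official.modeling.optimization import lr_schedule

# lr(step) = 1.0 * max(step, 1) ** -0.5
power_lr = lr_schedule.DirectPowerDecay(initial_learning_rate=1.0, power=-0.5)

# Held at 3.0 for the first 10 steps, then decays as 1.0 * (step - 10) ** -1.
offset_lr = lr_schedule.PowerDecayWithOffset(
    initial_learning_rate=1.0,
    power=-1.0,
    offset=10,
    pre_offset_learning_rate=3.0)

for step in (1, 10, 100):
  print(step, float(power_lr(step)), float(offset_lr(step)))
```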
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule_test.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for lr_schedule."""
from absl.testing import parameterized
import tensorflow as tf

from official.modeling.optimization import lr_schedule


class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      dict(
          testcase_name='power_only',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.0,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], [60, 1. / 60],
                    [100, 1. / 100]]),
      dict(
          testcase_name='linear_only',
          init_lr=1.0,
          power=0.0,
          linear_decay_fraction=1.0,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 0.99], [40, 0.6], [60, 0.4], [100, 0.0]]),
      dict(
          testcase_name='general',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.5,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.],
                    [60, 1. / 60. * 0.8], [100, 0.0]]),
      dict(
          testcase_name='offset',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.5,
          total_decay_steps=100,
          offset=90,
          expected=[[0, 1.0], [90, 1.0], [91, 1.0], [130, 1. / 40.],
                    [150, 1. / 60. * 0.8], [190, 0.0], [200, 0.0]]),
  )
  def test_power_linear_lr_schedule(self, init_lr, power,
                                    linear_decay_fraction, total_decay_steps,
                                    offset, expected):
    lr = lr_schedule.PowerAndLinearDecay(
        initial_learning_rate=init_lr,
        power=power,
        linear_decay_fraction=linear_decay_fraction,
        total_decay_steps=total_decay_steps,
        offset=offset)
    for step, value in expected:
      self.assertAlmostEqual(lr(step).numpy(), value)


class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset),
      dict(class_name=lr_schedule.PolynomialDecayWithOffset),
      dict(class_name=lr_schedule.ExponentialDecayWithOffset),
      dict(class_name=lr_schedule.CosineDecayWithOffset),
  )
  def test_generated_docstring(self, class_name):
    self.assertNotEmpty(class_name.__init__.__doc__)

  @parameterized.parameters(
      dict(
          class_name=lr_schedule.PiecewiseConstantDecayWithOffset,
          kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])),
      dict(
          class_name=lr_schedule.PolynomialDecayWithOffset,
          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
      dict(
          class_name=lr_schedule.ExponentialDecayWithOffset,
          kwarg=dict(
              initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)),
      dict(
          class_name=lr_schedule.CosineDecayWithOffset,
          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
  )
  def test_offset(self, class_name, kwarg):
    offset = 10
    offset_lr = class_name(offset=offset, **kwarg)
    base_lr = class_name.base_lr_class(**kwarg)
    self.assertIsInstance(offset_lr, class_name)
    for step in range(10, 101, 10):
      self.assertEqual(offset_lr(step), base_lr(step - offset))


if __name__ == '__main__':
  tf.test.main()
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimizer factory class."""
from typing import Callable, Optional, Union, List, Tuple

import gin
import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers

from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lars_optimizer
from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg
from official.nlp import optimization as nlp_optimization

OPTIMIZERS_CLS = {
    'sgd': tf.keras.optimizers.SGD,
    'adam': tf.keras.optimizers.Adam,
    'adamw': nlp_optimization.AdamWeightDecay,
    'lamb': tfa_optimizers.LAMB,
    'rmsprop': tf.keras.optimizers.RMSprop,
    'lars': lars_optimizer.LARS,
    'adagrad': tf.keras.optimizers.Adagrad,
    'slide': slide_optimizer.SLIDE,
    'adafactor': adafactor_optimizer.Adafactor,
}

LR_CLS = {
    'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
    'polynomial': lr_schedule.PolynomialDecayWithOffset,
    'exponential': lr_schedule.ExponentialDecayWithOffset,
    'cosine': lr_schedule.CosineDecayWithOffset,
    'power': lr_schedule.DirectPowerDecay,
    'power_linear': lr_schedule.PowerAndLinearDecay,
    'power_with_offset': lr_schedule.PowerDecayWithOffset,
    'step_cosine_with_offset': lr_schedule.StepConsineDecayWithOffset,
}

WARMUP_CLS = {
    'linear': lr_schedule.LinearWarmup,
    'polynomial': lr_schedule.PolynomialWarmUp
}


def register_optimizer_cls(key: str,
                           optimizer_config_cls: tf.keras.optimizers.Optimizer):
  """Registers a customized optimizer class.

  The user will still need to subclass data classes in
  configs.optimization_config to be used with OptimizerFactory.

  Args:
    key: A string to which the optimizer_config_cls is registered.
    optimizer_config_cls: A class which inherits tf.keras.optimizers.Optimizer.
  """
  if key in OPTIMIZERS_CLS:
    raise ValueError('%s already registered in OPTIMIZER_CLS.' % key)
  OPTIMIZERS_CLS[key] = optimizer_config_cls


class OptimizerFactory:
  """Optimizer factory class.

  This class builds a learning rate and an optimizer based on an optimization
  config. To use this class, you need to do the following:
  (1) Define an optimization config; this includes the optimizer and the
      learning rate schedule.
  (2) Initialize the class using the optimization config.
  (3) Build the learning rate.
  (4) Build the optimizer.

  This is a typical example of using this class:

  params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {'boundaries': [10000, 20000],
                         'values': [0.1, 0.01, 0.001]}
        },
        'warmup': {
            'type': 'linear',
            'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01}
        }
    }
  opt_config = OptimizationConfig(params)
  opt_factory = OptimizerFactory(opt_config)
  lr = opt_factory.build_learning_rate()
  optimizer = opt_factory.build_optimizer(lr)
  """

  def __init__(self, config: opt_cfg.OptimizationConfig):
    """Initializes OptimizerFactory.

    Args:
      config: An OptimizationConfig instance containing the optimization
        config.
    """
    self._config = config
    self._optimizer_config = config.optimizer.get()
    self._optimizer_type = config.optimizer.type

    self._use_ema = config.ema is not None
    self._ema_config = config.ema

    if self._optimizer_config is None:
      raise ValueError('Optimizer type must be specified')

    self._lr_config = config.learning_rate.get()
    self._lr_type = config.learning_rate.type

    if self._lr_type is None:
      raise ValueError('Learning rate type must be specified')

    self._warmup_config = config.warmup.get()
    self._warmup_type = config.warmup.type

  def build_learning_rate(self):
    """Builds the learning rate.

    Builds the learning rate from the config. The learning rate schedule is
    built according to the learning rate config. If the learning rate type is
    constant, lr_config.learning_rate is returned.

    Returns:
      A tf.keras.optimizers.schedules.LearningRateSchedule instance. If the
      learning rate type is constant, lr_config.learning_rate is returned.
    """
    if self._lr_type == 'constant':
      lr = self._lr_config.learning_rate
    else:
      lr = LR_CLS[self._lr_type](**self._lr_config.as_dict())

    if self._warmup_config:
      lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict())

    return lr

  @gin.configurable
  def build_optimizer(
      self,
      lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float],
      gradient_transformers: Optional[List[Callable[
          [List[Tuple[tf.Tensor, tf.Tensor]]],
          List[Tuple[tf.Tensor, tf.Tensor]]]]] = None,
      postprocessor: Optional[Callable[[tf.keras.optimizers.Optimizer],
                                       tf.keras.optimizers.Optimizer]] = None):
    """Builds the optimizer.

    Builds the optimizer from the config. It takes the learning rate as input,
    and builds the optimizer according to the optimizer config. Typically, the
    learning rate built using self.build_learning_rate() is passed as an
    argument to this method.

    Args:
      lr: A floating point value, or a
        tf.keras.optimizers.schedules.LearningRateSchedule instance.
      gradient_transformers: Optional list of functions to use to transform
        gradients before applying updates to Variables. The functions are
        applied after gradient_aggregator. The functions should accept and
        return a list of (gradient, variable) tuples. clipvalue, clipnorm, and
        global_clipnorm should not be set when gradient_transformers is passed.
      postprocessor: An optional function for postprocessing the optimizer. It
        takes an optimizer and returns an optimizer.

    Returns:
      A tf.keras.optimizers.Optimizer instance.
    """
    optimizer_dict = self._optimizer_config.as_dict()
    ## Delete clipnorm, clipvalue, global_clipnorm if None.
    if optimizer_dict['clipnorm'] is None:
      del optimizer_dict['clipnorm']
    if optimizer_dict['clipvalue'] is None:
      del optimizer_dict['clipvalue']
    if optimizer_dict['global_clipnorm'] is None:
      del optimizer_dict['global_clipnorm']

    optimizer_dict['learning_rate'] = lr
    if gradient_transformers is not None:
      optimizer_dict['gradient_transformers'] = gradient_transformers

    optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)

    if self._use_ema:
      optimizer = ema_optimizer.ExponentialMovingAverage(
          optimizer, **self._ema_config.as_dict())
    if postprocessor:
      optimizer = postprocessor(optimizer)
    assert isinstance(optimizer, tf.keras.optimizers.Optimizer), (
        'OptimizerFactory.build_optimizer returning a non-optimizer object: '
        '{}'.format(optimizer))

    return optimizer
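As a rough illustration of the `gradient_transformers` hook documented in `build_optimizer` above (a sketch only; it assumes the SGD and constant learning-rate config keys behave exactly as in the factory docstring and tests):

```python
from official.modeling.optimization import optimizer_factory
from official.modeling.optimization.configs import optimization_config

params = {
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'learning_rate': {'type': 'constant', 'constant': {'learning_rate': 0.1}},
}
opt_factory = optimizer_factory.OptimizerFactory(
    optimization_config.OptimizationConfig(params))
lr = opt_factory.build_learning_rate()


def halve_gradients(grads_and_vars):
  # Scales every gradient by 0.5 before the optimizer applies the update.
  return [(g * 0.5, v) for g, v in grads_and_vars]


optimizer = opt_factory.build_optimizer(
    lr, gradient_transformers=[halve_gradients])
```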
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory_test.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for optimizer_factory.py."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.modeling.optimization import optimizer_factory
from official.modeling.optimization.configs import optimization_config


class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'),
                            ('lamb'), ('lars'), ('adagrad'))
  def test_optimizers(self, optimizer_type):
    params = {
        'optimizer': {
            'type': optimizer_type
        },
        'learning_rate': {
            'type': 'constant',
            'constant': {
                'learning_rate': 0.1
            }
        }
    }
    optimizer_cls = optimizer_factory.OPTIMIZERS_CLS[optimizer_type]
    expected_optimizer_config = optimizer_cls().get_config()
    expected_optimizer_config['learning_rate'] = 0.1

    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr, postprocessor=lambda x: x)

    self.assertIsInstance(optimizer, optimizer_cls)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())

  @parameterized.parameters((None, None), (1.0, None), (None, 1.0))
  def test_gradient_clipping(self, clipnorm, clipvalue):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'clipnorm': clipnorm,
                'clipvalue': clipvalue
            }
        },
        'learning_rate': {
            'type': 'constant',
            'constant': {
                'learning_rate': 1.0
            }
        }
    }

    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr)

    var0 = tf.Variable([1.0, 2.0])
    var1 = tf.Variable([3.0, 4.0])

    grads0 = tf.constant([0.1, 0.1])
    grads1 = tf.constant([2.0, 3.0])

    grads_and_vars = list(zip([grads0, grads1], [var0, var1]))
    optimizer.apply_gradients(grads_and_vars)

    self.assertAllClose(np.array([0.9, 1.9]), var0.numpy())
    if clipvalue is not None:
      self.assertAllClose(np.array([2.0, 3.0]), var1.numpy())
    elif clipnorm is not None:
      self.assertAllClose(np.array([2.4452999, 3.1679497]), var1.numpy())
    else:
      self.assertAllClose(np.array([1.0, 1.0]), var1.numpy())

  def test_missing_types(self):
    params = {'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}}}
    with self.assertRaises(ValueError):
      optimizer_factory.OptimizerFactory(
          optimization_config.OptimizationConfig(params))
    params = {
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        }
    }
    with self.assertRaises(ValueError):
      optimizer_factory.OptimizerFactory(
          optimization_config.OptimizationConfig(params))

  # TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`.
  def test_stepwise_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        }
    }
    expected_lr_step_values = [[0, 0.1], [5000, 0.1], [10000, 0.1],
                               [10001, 0.01], [20000, 0.01], [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_stepwise_lr_with_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        },
        'warmup': {
            'type': 'linear',
            'linear': {
                'warmup_steps': 500,
                'warmup_learning_rate': 0.01
            }
        }
    }
    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1],
                               [5500, 0.1], [10000, 0.1], [10001, 0.01],
                               [20000, 0.01], [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_exponential_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'exponential',
            'exponential': {
                'initial_learning_rate': 0.1,
                'decay_steps': 1000,
                'decay_rate': 0.96,
                'staircase': True
            }
        }
    }
    expected_lr_step_values = [
        [0, 0.1],
        [999, 0.1],
        [1000, 0.096],
        [1999, 0.096],
        [2000, 0.09216],
    ]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_polynomial_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'polynomial',
            'polynomial': {
                'initial_learning_rate': 0.1,
                'decay_steps': 1000,
                'end_learning_rate': 0.001
            }
        }
    }

    expected_lr_step_values = [[0, 0.1], [500, 0.0505], [1000, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_cosine_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'cosine',
            'cosine': {
                'initial_learning_rate': 0.1,
                'decay_steps': 1000
            }
        }
    }
    expected_lr_step_values = [[0, 0.1], [250, 0.08535534], [500, 0.04999999],
                               [750, 0.01464466], [1000, 0]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_constant_lr_with_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'constant',
            'constant': {
                'learning_rate': 0.1
            }
        },
        'warmup': {
            'type': 'linear',
            'linear': {
                'warmup_steps': 500,
                'warmup_learning_rate': 0.01
            }
        }
    }

    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1],
                               [5000, 0.1], [10000, 0.1], [20000, 0.1]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_stepwise_lr_with_polynomial_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        },
        'warmup': {
            'type': 'polynomial',
            'polynomial': {
                'warmup_steps': 500,
                'power': 2.
            }
        }
    }
    expected_lr_step_values = [[0, 0.0], [250, 0.025], [500, 0.1], [5500, 0.1],
                               [10000, 0.1], [10001, 0.01], [20000, 0.01],
                               [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value, places=6)

  def test_power_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'power',
            'power': {
                'initial_learning_rate': 1.0,
                'power': -1.0
            }
        }
    }
    expected_lr_step_values = [[0, 1.0], [1, 1.0], [250, 1. / 250.]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_power_linear_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'power_linear',
            'power_linear': {
                'initial_learning_rate': 1.0,
                'power': -1.0,
                'linear_decay_fraction': 0.5,
                'total_decay_steps': 100,
                'offset': 0,
            }
        }
    }
    expected_lr_step_values = [[0, 1.0], [1, 1.0], [40, 1. / 40.],
                               [60, 1. / 60. * 0.8]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_power_with_offset_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'power_with_offset',
            'power_with_offset': {
                'initial_learning_rate': 1.0,
                'power': -1.0,
                'offset': 10,
                'pre_offset_learning_rate': 3.0,
            }
        }
    }
    expected_lr_step_values = [[1, 3.0], [10, 3.0], [20, 1. / 10.]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_step_cosine_lr_schedule_with_warmup(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'step_cosine_with_offset',
            'step_cosine_with_offset': {
                'values': (0.0001, 0.00005),
                'boundaries': (0, 500000),
                'offset': 10000,
            }
        },
        'warmup': {
            'type': 'linear',
            'linear': {
                'warmup_steps': 10000,
                'warmup_learning_rate': 0.0
            }
        }
    }
    expected_lr_step_values = [[0, 0.0], [5000, 1e-4 / 2.0], [10000, 1e-4],
                               [20000, 9.994863e-05], [499999, 5e-05]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)


class OptimizerFactoryRegistryTest(tf.test.TestCase):

  def test_registry(self):

    class MyClass():
      pass

    optimizer_factory.register_optimizer_cls('test', MyClass)
    self.assertIn('test', optimizer_factory.OPTIMIZERS_CLS)
    with self.assertRaisesRegex(ValueError, 'test already registered.*'):
      optimizer_factory.register_optimizer_cls('test', MyClass)


if __name__ == '__main__':
  tf.test.main()
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/slide_optimizer.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SLIDE optimizer.
A new optimizer that will be open sourced soon.
"""
SLIDE = "Unimplemented"
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/performance.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to training performance."""
import tensorflow as tf


def configure_optimizer(optimizer,
                        use_float16=False,
                        use_graph_rewrite=False,
                        loss_scale=None):
  """Configures optimizer object with performance options."""
  if use_float16:
    if loss_scale in (None, 'dynamic'):
      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
    else:
      # loss_scale is a number. We interpret that as a fixed loss scale.
      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
          optimizer, dynamic=False, initial_scale=loss_scale)
  if use_graph_rewrite:
    # Note: the model dtype must be 'float32', which will ensure
    # tf.keras.mixed_precision and enable_mixed_precision_graph_rewrite do not
    # double up.
    optimizer = (
        tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
            optimizer))
  return optimizer


def set_mixed_precision_policy(dtype, loss_scale=None):
  """Sets the global `tf.keras.mixed_precision.Policy`."""
  # TODO(b/191894773): Remove loss_scale argument
  assert loss_scale is None, (
      'The loss_scale argument must be None. The argument exists for '
      'historical reasons and will be removed soon.')
  if dtype == tf.float16:
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
  elif dtype == tf.bfloat16:
    tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
  elif dtype == tf.float32:
    tf.keras.mixed_precision.set_global_policy('float32')
  else:
    raise ValueError('Unexpected dtype: %s' % dtype)
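A short usage sketch of the two helpers above (illustrative only; it assumes the module is importable as `official.modeling.performance`, its path in this tree):

```python
import tensorflow as tf

from official.modeling import performance

# Run the model under mixed float16 and wrap the optimizer so the loss is
# scaled dynamically, which avoids float16 underflow in the gradients.
performance.set_mixed_precision_policy(tf.float16)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
optimizer = performance.configure_optimizer(optimizer, use_float16=True)
```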
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/tf_utils.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common TF utilities."""
import six
import tensorflow as tf

from tensorflow.python.util import deprecation
from official.modeling import activations


@deprecation.deprecated(
    None,
    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
    "input tensors. pack/unpack inputs to override __call__ is no longer "
    "needed.")
def pack_inputs(inputs):
  """Packs a list of `inputs` tensors to a tuple.

  Args:
    inputs: a list of tensors.

  Returns:
    a tuple of tensors. If any input is None, it is replaced with a special
    constant tensor.
  """
  inputs = tf.nest.flatten(inputs)
  outputs = []
  for x in inputs:
    if x is None:
      outputs.append(tf.constant(0, shape=[], dtype=tf.int32))
    else:
      outputs.append(x)
  return tuple(outputs)


@deprecation.deprecated(
    None,
    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
    "input tensors. pack/unpack inputs to override __call__ is no longer "
    "needed.")
def unpack_inputs(inputs):
  """Unpacks a tuple of `inputs` tensors to a tuple.

  Args:
    inputs: a list of tensors.

  Returns:
    a tuple of tensors. If any input is a special constant tensor, it is
    replaced with None.
  """
  inputs = tf.nest.flatten(inputs)
  outputs = []
  for x in inputs:
    if is_special_none_tensor(x):
      outputs.append(None)
    else:
      outputs.append(x)
  x = tuple(outputs)

  # To trick the very pointless 'unbalanced-tuple-unpacking' pylint check
  # from triggering.
  if len(x) == 1:
    return x[0]
  return tuple(outputs)


def is_special_none_tensor(tensor):
  """Checks if a tensor is a special None Tensor."""
  return tensor.shape.ndims == 0 and tensor.dtype == tf.int32


def get_activation(identifier, use_keras_layer=False):
  """Maps an identifier to a Python function, e.g., "relu" => `tf.nn.relu`.

  It checks the string first, and if it is one of the customized activations
  not in TF, the corresponding activation will be returned. For non-customized
  activation names and callable identifiers, it always falls back to
  tf.keras.activations.get. It prefers Keras layers when use_keras_layer=True;
  currently this only supports 'relu', 'linear', 'identity', 'swish'.

  Args:
    identifier: String name of the activation function or callable.
    use_keras_layer: If True, use a Keras layer if the identifier is
      allow-listed.

  Returns:
    A Python function corresponding to the activation function or a Keras
    activation layer when use_keras_layer=True.
  """
  if isinstance(identifier, six.string_types):
    identifier = str(identifier).lower()
    if use_keras_layer:
      keras_layer_allowlist = {
          "relu": "relu",
          "linear": "linear",
          "identity": "linear",
          "swish": "swish",
          "sigmoid": "sigmoid",
          "relu6": tf.nn.relu6,
      }
      if identifier in keras_layer_allowlist:
        return tf.keras.layers.Activation(keras_layer_allowlist[identifier])
    name_to_fn = {
        "gelu": activations.gelu,
        "simple_swish": activations.simple_swish,
        "hard_swish": activations.hard_swish,
        "relu6": activations.relu6,
        "hard_sigmoid": activations.hard_sigmoid,
        "identity": activations.identity,
    }
    if identifier in name_to_fn:
      return tf.keras.activations.get(name_to_fn[identifier])
  return tf.keras.activations.get(identifier)


def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape


def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):
    expected_rank_dict[expected_rank] = True
  else:
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    raise ValueError(
        "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not "
        "equal to the expected tensor rank `%s`" %
        (name, actual_rank, str(tensor.shape), str(expected_rank)))


def safe_mean(losses):
  """Computes a safe mean of the losses.

  Args:
    losses: `Tensor` whose elements contain individual loss measurements.

  Returns:
    A scalar representing the mean of `losses`. If `num_present` is zero,
    then zero is returned.
  """
  total = tf.reduce_sum(losses)
  num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
  return tf.math.divide_no_nan(total, num_elements)
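A small sketch of the shape and loss helpers defined above (illustrative only, not part of tf_utils.py):

```python
import tensorflow as tf

from official.modeling import tf_utils

# Static dimensions come back as plain Python ints.
x = tf.ones([2, 128, 768])
print(tf_utils.get_shape_list(x, expected_rank=3))  # [2, 128, 768]

# safe_mean avoids NaN on an empty loss tensor by returning zero.
print(float(tf_utils.safe_mean(tf.constant([1.0, 2.0, 3.0]))))       # 2.0
print(float(tf_utils.safe_mean(tf.constant([], dtype=tf.float32))))  # 0.0
```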
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/README.md
# TensorFlow NLP Modelling Toolkit

This codebase provides a Natural Language Processing modeling toolkit written in
[TF2](https://www.tensorflow.org/guide/effective_tf2). It allows researchers and
developers to reproduce state-of-the-art model results and train custom models
to experiment with new research ideas.

## Features

* Reusable and modularized modeling building blocks
* Reproducible state-of-the-art results
* Easy to customize and extend
* End-to-end training
* Distributed training on both GPUs and TPUs

## Major components

### Libraries

We provide a modeling library to allow users to train custom models for new
research ideas. Detailed instructions can be found in the READMEs in each folder.

* [modeling/](modeling): modeling library that provides building blocks
  (e.g., Layers, Networks, and Models) that can be assembled into
  transformer-based architectures.
* [data/](data): binaries and utils for input preprocessing, tokenization,
  etc.

### State-of-the-Art models and examples

We provide SoTA model implementations, pre-trained models, training and
evaluation examples, and command lines. Detailed instructions can be found in
the READMEs for specific papers.

1. [BERT](bert): [BERT: Pre-training of Deep Bidirectional Transformers for
   Language Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al.,
   2018
2. [ALBERT](albert): [A Lite BERT for Self-supervised Learning of Language
   Representations](https://arxiv.org/abs/1909.11942) by Lan et al., 2019
3. [XLNet](xlnet): [XLNet: Generalized Autoregressive Pretraining for Language
   Understanding](https://arxiv.org/abs/1906.08237) by Yang et al., 2019
4. [Transformer for translation](transformer): [Attention Is All You
   Need](https://arxiv.org/abs/1706.03762) by Vaswani et al., 2017

### Common Training Driver

We provide a single common driver [train.py](train.py) to train the above SoTA
models on popular tasks. Please see [docs/train.md](docs/train.md) for more
details.

### Pre-trained models with checkpoints and TF-Hub

We provide a large collection of baselines and checkpoints for NLP pre-trained
models. Please see [docs/pretrained_models.md](docs/pretrained_models.md) for
more details.
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/__init__.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/README.md
# ALBERT (ALBERT: A Lite BERT for Self-supervised Learning of Language Representations)

**WARNING**: We are on the way to deprecating this directory.
We will add documentation in `nlp/docs` on using the new code in `nlp/modeling`.

The academic paper which describes ALBERT in detail and provides full results on
a number of tasks can be found here: https://arxiv.org/abs/1909.11942.

This repository contains a TensorFlow 2.x implementation of ALBERT.

## Contents

*   [Contents](#contents)
*   [Pre-trained Models](#pre-trained-models)
*   [Restoring from Checkpoints](#restoring-from-checkpoints)
*   [Set Up](#set-up)
*   [Process Datasets](#process-datasets)
*   [Fine-tuning with BERT](#fine-tuning-with-bert)
*   [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
*   [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
*   [SQuAD 1.1](#squad-1.1)

## Pre-trained Models

We released both checkpoints and tf.hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and are converted from the ALBERT v2
checkpoints released in the TF 1.x official ALBERT repository
[google-research/albert](https://github.com/google-research/albert)
in order to stay consistent with the ALBERT paper.

Our currently released checkpoints are exactly the same as in the TF 1.x
official ALBERT repository.

### Access to Pretrained Checkpoints

Pretrained checkpoints can be found at the following links:

**Note: We implemented ALBERT using Keras functional-style networks in
[nlp/modeling](../modeling).
ALBERT V2 models compatible with TF 2.x checkpoints are:**

*   **[`ALBERT V2 Base`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base.tar.gz)**:
    12-layer, 768-hidden, 12-heads, 12M parameters
*   **[`ALBERT V2 Large`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_large.tar.gz)**:
    24-layer, 1024-hidden, 16-heads, 18M parameters
*   **[`ALBERT V2 XLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xlarge.tar.gz)**:
    24-layer, 2048-hidden, 32-heads, 60M parameters
*   **[`ALBERT V2 XXLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xxlarge.tar.gz)**:
    12-layer, 4096-hidden, 64-heads, 235M parameters

We recommend hosting checkpoints on Google Cloud Storage buckets when you use
Cloud GPU/TPU.

### Restoring from Checkpoints

`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
weights from the provided pre-trained checkpoints, you can use the following
code:

```python
init_checkpoint = 'the pretrained model checkpoint path.'
model = tf.keras.Model()  # Bert pre-trained model as feature extractor.
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
```

Checkpoints featuring native serialized Keras models
(i.e. model.load()/load_weights()) will be available soon.

### Access to Pretrained hub modules.

Pretrained tf.hub modules in TF 2.x SavedModel format can be found at the
following links:

*   **[`ALBERT V2 Base`](https://tfhub.dev/tensorflow/albert_en_base/1)**:
    12-layer, 768-hidden, 12-heads, 12M parameters
*   **[`ALBERT V2 Large`](https://tfhub.dev/tensorflow/albert_en_large/1)**:
    24-layer, 1024-hidden, 16-heads, 18M parameters
*   **[`ALBERT V2 XLarge`](https://tfhub.dev/tensorflow/albert_en_xlarge/1)**:
    24-layer, 2048-hidden, 32-heads, 60M parameters
*   **[`ALBERT V2 XXLarge`](https://tfhub.dev/tensorflow/albert_en_xxlarge/1)**:
    12-layer, 4096-hidden, 64-heads, 235M parameters

## Set Up

```shell
export PYTHONPATH="$PYTHONPATH:/path/to/models"
```

Install `tf-nightly` to get the latest updates:

```shell
pip install tf-nightly-gpu
```

With a TPU, GPU support is not necessary. First, you need to create a
`tf-nightly` TPU with the
[ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):

```shell
ctpu up -name <instance name> --tf-version="nightly"
```

Second, you need to install TF 2 `tf-nightly` on your VM:

```shell
pip install tf-nightly
```

Warning: More detailed TPU-specific set-up instructions and a tutorial should
come along with the official TF 2.x release for TPU. Note that this repo is not
officially supported by the Google Cloud TPU team until TF 2.1 is released.

## Process Datasets

### Pre-training

Pre-training ALBERT using TF 2.x will come soon.
For now, please use the
[ALBERT research repo](https://github.com/google-research/ALBERT)
to pretrain the model and convert the checkpoint to TF 2.x compatible ones using
[tf2_albert_encoder_checkpoint_converter.py](tf2_albert_encoder_checkpoint_converter.py).

### Fine-tuning

To prepare the fine-tuning data for final model training, use the
[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py)
script.

Note that, unlike BERT models which use a word piece tokenizer, ALBERT models
employ a sentence piece tokenizer. So the FLAG tokenizer_impl has to be set to
'sentence_piece'.

The resulting datasets in `tf_record` format and the training meta data should
later be passed to the training or evaluation scripts. The task-specific
arguments are described in the following sections:

*   GLUE

Users can download the [GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.

```shell
export GLUE_DIR=~/glue
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base

export TASK_NAME=MNLI
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
 --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
 --sp_model_file=${ALBERT_DIR}/30k-clean.model \
 --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
 --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
 --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
 --fine_tuning_task_type=classification --max_seq_length=128 \
 --classification_task_name=${TASK_NAME} \
 --tokenization=SentencePiece
```

*   SQUAD

The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation.

The necessary files can be found here:

*   [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
*   [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
*   [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
*   [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
*   [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
*   [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)

```shell
export SQUAD_DIR=~/squad
export SQUAD_VERSION=v1.1
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export OUTPUT_DIR=gs://some_bucket/datasets

python ../data/create_finetuning_data.py \
 --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
 --sp_model_file=${ALBERT_DIR}/30k-clean.model \
 --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
 --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
 --fine_tuning_task_type=squad --max_seq_length=384 \
 --tokenization=SentencePiece
```

## Fine-tuning with ALBERT

### Cloud GPUs and TPUs

*   Cloud Storage

The unzipped pre-trained model files can also be found in the Google Cloud
Storage folder `gs://cloud-tpu-checkpoints/albert/checkpoints`. For example:

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export MODEL_DIR=gs://some_bucket/my_output_dir
```

Currently, users are able to access `tf-nightly` TPUs, and the following TPU
script should run with `tf-nightly`.

*   GPU -> TPU

Just add the following flags to `run_classifier.py` or `run_squad.py`:

```shell
  --distribution_strategy=tpu
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

### Sentence and Sentence-pair Classification Tasks

This example code fine-tunes `albert_v2_base` on the Microsoft Research
Paraphrase Corpus (MRPC) corpus, which only contains 3,600 examples and can
fine-tune in a few minutes on most GPUs.

We use `albert_v2_base` as an example throughout the workflow.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC

python run_classifier.py \
  --mode='train_and_eval' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=${ALBERT_DIR}/albert_config.json \
  --init_checkpoint=${ALBERT_DIR}/bert_model.ckpt \
  --train_batch_size=4 \
  --eval_batch_size=4 \
  --steps_per_loop=1 \
  --learning_rate=2e-5 \
  --num_train_epochs=3 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```

Alternatively, instead of specifying `init_checkpoint`, you can specify
`hub_module_url` to employ a pretrained BERT hub module, e.g.,
` --hub_module_url=https://tfhub.dev/tensorflow/albert_en_base/1`.

To use a TPU, you only need to switch the distribution strategy type to `tpu`
with TPU information and use remote storage for model checkpoints.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets

python run_classifier.py \
  --mode='train_and_eval' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=$ALBERT_DIR/albert_config.json \
  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
  --train_batch_size=32 \
  --eval_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=tpu \
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

### SQuAD 1.1

The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. See more on the
[SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/).

We use `albert_v2_base` as an example throughout the workflow.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export SQUAD_DIR=gs://some_bucket/datasets
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_VERSION=v1.1

python run_squad.py \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
  --bert_config_file=$ALBERT_DIR/albert_config.json \
  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
  --train_batch_size=4 \
  --predict_batch_size=4 \
  --learning_rate=8e-5 \
  --num_train_epochs=2 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```

Similarly, you can replace the `init_checkpoint` FLAG with `hub_module_url` to
specify a hub module path.

To use a TPU, you need to switch the distribution strategy type to `tpu` with
TPU information.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_DIR=gs://some_bucket/datasets
export SQUAD_VERSION=v1.1

python run_squad.py \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
  --bert_config_file=$ALBERT_DIR/albert_config.json \
  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
  --train_batch_size=32 \
  --learning_rate=8e-5 \
  --num_train_epochs=2 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=tpu \
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

The dev set predictions will be saved into a file called predictions.json in
the model_dir:

```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/__init__.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/configs.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The ALBERT configurations."""
import
six
from
official.nlp.bert
import
configs
class
AlbertConfig
(
configs
.
BertConfig
):
"""Configuration for `ALBERT`."""
def
__init__
(
self
,
num_hidden_groups
=
1
,
inner_group_num
=
1
,
**
kwargs
):
"""Constructs AlbertConfig.
Args:
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared. Note that this value and also the following
'inner_group_num' has to be 1 for now, because all released ALBERT
models set them to 1. We may support arbitary valid values in future.
inner_group_num: Number of inner repetition of attention and ffn.
**kwargs: The remaining arguments are the same as above 'BertConfig'.
"""
super
(
AlbertConfig
,
self
).
__init__
(
**
kwargs
)
# TODO(chendouble): 'inner_group_num' and 'num_hidden_groups' are always 1
# in the released ALBERT. Support other values in AlbertEncoder if needed.
if
inner_group_num
!=
1
or
num_hidden_groups
!=
1
:
raise
ValueError
(
"We only support 'inner_group_num' and "
"'num_hidden_groups' as 1."
)
@
classmethod
def
from_dict
(
cls
,
json_object
):
"""Constructs a `AlbertConfig` from a Python dictionary of parameters."""
config
=
AlbertConfig
(
vocab_size
=
None
)
for
(
key
,
value
)
in
six
.
iteritems
(
json_object
):
config
.
__dict__
[
key
]
=
value
return
config
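For reference, a minimal usage sketch of this class is shown below. The field
values and the config file path are illustrative placeholders, not values from
a released ALBERT config; `from_json_file` is inherited from `BertConfig` and
is the same entry point used by `run_classifier.py` further down.

```python
# Minimal sketch: build an AlbertConfig from a parameter dictionary and from a
# JSON file. Values and path below are illustrative only.
from official.nlp.albert import configs as albert_configs

# from_dict copies every key of the dictionary onto the config object.
config = albert_configs.AlbertConfig.from_dict({
    'vocab_size': 30000,
    'hidden_size': 768,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
})
print(config.hidden_size)  # 768

# from_json_file reads the same fields from disk, e.g. the albert_config.json
# shipped with a pretrained checkpoint (hypothetical local path).
config = albert_configs.AlbertConfig.from_json_file('/path/to/albert_config.json')
```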
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/run_classifier.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ALBERT classification finetuning runner in tf2.x."""
import
json
import
os
# Import libraries
from
absl
import
app
from
absl
import
flags
from
absl
import
logging
import
tensorflow
as
tf
from
official.common
import
distribute_utils
from
official.nlp.albert
import
configs
as
albert_configs
from
official.nlp.bert
import
bert_models
from
official.nlp.bert
import
run_classifier
as
run_classifier_bert
FLAGS
=
flags
.
FLAGS
def
predict
(
strategy
,
albert_config
,
input_meta_data
,
predict_input_fn
):
"""Function outputs both the ground truth predictions as .tsv files."""
with
strategy
.
scope
():
classifier_model
=
bert_models
.
classifier_model
(
albert_config
,
input_meta_data
[
'num_labels'
])[
0
]
checkpoint
=
tf
.
train
.
Checkpoint
(
model
=
classifier_model
)
latest_checkpoint_file
=
(
FLAGS
.
predict_checkpoint_path
or
tf
.
train
.
latest_checkpoint
(
FLAGS
.
model_dir
))
assert
latest_checkpoint_file
logging
.
info
(
'Checkpoint file %s found and restoring from '
'checkpoint'
,
latest_checkpoint_file
)
checkpoint
.
restore
(
latest_checkpoint_file
).
assert_existing_objects_matched
()
preds
,
ground_truth
=
run_classifier_bert
.
get_predictions_and_labels
(
strategy
,
classifier_model
,
predict_input_fn
,
return_probs
=
True
)
output_predict_file
=
os
.
path
.
join
(
FLAGS
.
model_dir
,
'test_results.tsv'
)
with
tf
.
io
.
gfile
.
GFile
(
output_predict_file
,
'w'
)
as
writer
:
logging
.
info
(
'***** Predict results *****'
)
for
probabilities
in
preds
:
output_line
=
'
\t
'
.
join
(
str
(
class_probability
)
for
class_probability
in
probabilities
)
+
'
\n
'
writer
.
write
(
output_line
)
ground_truth_labels_file
=
os
.
path
.
join
(
FLAGS
.
model_dir
,
'output_labels.tsv'
)
with
tf
.
io
.
gfile
.
GFile
(
ground_truth_labels_file
,
'w'
)
as
writer
:
logging
.
info
(
'***** Ground truth results *****'
)
for
label
in
ground_truth
:
output_line
=
'
\t
'
.
join
(
str
(
label
))
+
'
\n
'
writer
.
write
(
output_line
)
return
def
main
(
_
):
with
tf
.
io
.
gfile
.
GFile
(
FLAGS
.
input_meta_data_path
,
'rb'
)
as
reader
:
input_meta_data
=
json
.
loads
(
reader
.
read
().
decode
(
'utf-8'
))
if
not
FLAGS
.
model_dir
:
FLAGS
.
model_dir
=
'/tmp/bert20/'
strategy
=
distribute_utils
.
get_distribution_strategy
(
distribution_strategy
=
FLAGS
.
distribution_strategy
,
num_gpus
=
FLAGS
.
num_gpus
,
tpu_address
=
FLAGS
.
tpu
)
max_seq_length
=
input_meta_data
[
'max_seq_length'
]
train_input_fn
=
run_classifier_bert
.
get_dataset_fn
(
FLAGS
.
train_data_path
,
max_seq_length
,
FLAGS
.
train_batch_size
,
is_training
=
True
)
eval_input_fn
=
run_classifier_bert
.
get_dataset_fn
(
FLAGS
.
eval_data_path
,
max_seq_length
,
FLAGS
.
eval_batch_size
,
is_training
=
False
)
albert_config
=
albert_configs
.
AlbertConfig
.
from_json_file
(
FLAGS
.
bert_config_file
)
if
FLAGS
.
mode
==
'train_and_eval'
:
run_classifier_bert
.
run_bert
(
strategy
,
input_meta_data
,
albert_config
,
train_input_fn
,
eval_input_fn
)
elif
FLAGS
.
mode
==
'predict'
:
predict
(
strategy
,
albert_config
,
input_meta_data
,
eval_input_fn
)
else
:
raise
ValueError
(
'Unsupported mode is specified: %s'
%
FLAGS
.
mode
)
return
if
__name__
==
'__main__'
:
flags
.
mark_flag_as_required
(
'bert_config_file'
)
flags
.
mark_flag_as_required
(
'input_meta_data_path'
)
flags
.
mark_flag_as_required
(
'model_dir'
)
app
.
run
(
main
)