Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adam optimizer with weight decay that exactly matches the original BERT."""
import re
from absl import logging
import tensorflow as tf
class AdamWeightDecay(tf.keras.optimizers.legacy.Adam):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
[Warning!]: The Keras optimizer supports gradient clipping and has an AdamW
implementation. Please consider evaluating that option in the Keras package.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
"""
def __init__(self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
gradient_clip_norm=1.0,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2,
epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self.gradient_clip_norm = gradient_clip_norm
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
logging.info('AdamWeightDecay gradient_clip_norm=%f', gradient_clip_norm)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, # pytype: disable=attribute-error # typed-keras
apply_state)
apply_state[(var_device, var_dtype)]['weight_decay_rate'] = tf.constant(
self.weight_decay_rate, name='adam_weight_decay_rate')
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var *
apply_state[(var.device, var.dtype.base_dtype)]['weight_decay_rate'],
use_locking=self._use_locking)
return tf.no_op()
def apply_gradients(self,
grads_and_vars,
name=None,
experimental_aggregate_gradients=True):
grads, tvars = list(zip(*grads_and_vars))
if experimental_aggregate_gradients and self.gradient_clip_norm > 0.0:
# When experimental_aggregate_gradients = False, apply_gradients() no
# longer implicitly allreduces gradients; users manually allreduce the
# gradients and pass the allreduced grads_and_vars. For now,
# clip_by_global_norm is applied before the explicit allreduce to
# keep the math the same as the TF 1 and pre-TF 2.2 implementations.
(grads, _) = tf.clip_by_global_norm(
grads, clip_norm=self.gradient_clip_norm)
return super(AdamWeightDecay, self).apply_gradients(
zip(grads, tvars),
name=name,
experimental_aggregate_gradients=experimental_aggregate_gradients)
def _get_lr(self, var_device, var_dtype, apply_state):
"""Retrieves the learning rate with the given state."""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay,
self)._resource_apply_dense(grad, var, **kwargs) # pytype: disable=attribute-error # typed-keras
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay,
self)._resource_apply_sparse(grad, var, indices, **kwargs) # pytype: disable=attribute-error # typed-keras
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
config.update({
'weight_decay_rate': self.weight_decay_rate,
})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
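For reference, a minimal usage sketch follows; the exclusion patterns are illustrative assumptions that mirror the common BERT setup, which keeps LayerNorm and bias variables out of weight decay.

```python
# Minimal sketch (illustrative): decoupled weight decay with BERT-style
# exclusions. The regex patterns below are assumptions, not part of this file.
optimizer = AdamWeightDecay(
    learning_rate=1e-4,
    weight_decay_rate=0.01,
    gradient_clip_norm=1.0,
    exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
```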
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -23,7 +23,7 @@ import tensorflow as tf
def _make_offset_wrapper(new_class_name: str, base_lr_class):
"""Generates a offset wrapper of learning rate schedule.
It returns a subclass of the `base_lr_class`; the subclass takes an
`offset` argument in the constructor. When the new class instance is called,
the behavior is:
new_class_object(step) = base_lr_class_object(step - offset)
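As a rough sketch of the described behavior, the generated subclass is equivalent to something like the following, using `tf.keras.optimizers.schedules.PolynomialDecay` as an assumed base class:

```python
import tensorflow as tf

# Sketch of the wrapper's behavior: evaluate the base schedule at
# (step - offset). The concrete generated subclass may differ in details.
class PolynomialDecayWithOffset(tf.keras.optimizers.schedules.PolynomialDecay):

  def __init__(self, offset=0, **kwargs):
    super().__init__(**kwargs)
    self.offset = offset

  def __call__(self, step):
    return super().__call__(step - self.offset)

lr = PolynomialDecayWithOffset(
    offset=1000, initial_learning_rate=1e-3, decay_steps=10000)
lr(1000)  # Equals the base schedule evaluated at step 0.
```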
......@@ -386,11 +386,11 @@ class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
}
class StepCosineDecayWithOffset(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Stepwise cosine learning rate decay with offset.
Learning rate is equivalent to one or more cosine decay(s) starting and
ending at each interval.
Example:
......@@ -399,7 +399,7 @@ class StepConsineDecayWithOffset(
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepCosineDecayWithOffset(
boundaries,
values))
```
......@@ -412,7 +412,7 @@ class StepConsineDecayWithOffset(
boundaries,
values,
offset: int = 0,
name: str = "StepCosineDecayWithOffset"):
"""Initialize configuration of the learning rate schedule.
Args:
......@@ -444,7 +444,7 @@ class StepConsineDecayWithOffset(
] + [0])
def __call__(self, global_step):
with tf.name_scope(self.name or "StepCosineDecayWithOffset"):
global_step = tf.cast(global_step - self.offset, tf.float32)
lr_levels = self.values
lr_steps = self.boundaries
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -23,22 +23,38 @@ from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lars_optimizer
from official.modeling.optimization import legacy_adamw
from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg
# Optimizer CLS to be used in both legacy and new path.
SHARED_OPTIMIZERS = {
'sgd_experimental': tf.keras.optimizers.experimental.SGD,
'adam_experimental': tf.keras.optimizers.experimental.Adam,
'adamw': legacy_adamw.AdamWeightDecay,
'adamw_experimental': tf.keras.optimizers.experimental.AdamW,
'lamb': tfa_optimizers.LAMB,
'rmsprop': tf.keras.optimizers.RMSprop,
'lars': lars_optimizer.LARS,
'adagrad': tf.keras.optimizers.Adagrad,
'slide': slide_optimizer.SLIDE,
'adafactor': adafactor_optimizer.Adafactor,
}
LEGACY_OPTIMIZERS_CLS = {
'sgd': tf.keras.optimizers.legacy.SGD,
'adam': tf.keras.optimizers.legacy.Adam,
'rmsprop': tf.keras.optimizers.legacy.RMSprop,
'adagrad': tf.keras.optimizers.legacy.Adagrad,
}
LEGACY_OPTIMIZERS_CLS.update(SHARED_OPTIMIZERS)
NEW_OPTIMIZERS_CLS = {
'sgd': tf.keras.optimizers.experimental.SGD,
'adam': tf.keras.optimizers.experimental.Adam,
'rmsprop': tf.keras.optimizers.experimental.RMSprop,
'adagrad': tf.keras.optimizers.experimental.Adagrad,
}
NEW_OPTIMIZERS_CLS.update(SHARED_OPTIMIZERS)
LR_CLS = {
'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
'polynomial': lr_schedule.PolynomialDecayWithOffset,
......@@ -47,7 +63,7 @@ LR_CLS = {
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
'power_with_offset': lr_schedule.PowerDecayWithOffset,
'step_cosine_with_offset': lr_schedule.StepConsineDecayWithOffset,
'step_cosine_with_offset': lr_schedule.StepCosineDecayWithOffset,
}
WARMUP_CLS = {
......@@ -56,8 +72,13 @@ WARMUP_CLS = {
}
def register_optimizer_cls(key: str,
optimizer_config_cls: Union[
tf.keras.optimizers.Optimizer,
tf.keras.optimizers.legacy.Optimizer,
tf.keras.optimizers.experimental.Optimizer
],
use_legacy_optimizer: bool = True):
"""Register customize optimizer cls.
The user will still need to subclass data classes in
......@@ -66,10 +87,16 @@ def register_optimizer_cls(
Args:
key: A string to that the optimizer_config_cls is registered with.
optimizer_config_cls: A class which inherits tf.keras.optimizers.Optimizer.
use_legacy_optimizer: A boolean that indicates if using legacy optimizers.
"""
if use_legacy_optimizer:
if key in LEGACY_OPTIMIZERS_CLS:
raise ValueError('%s already registered in LEGACY_OPTIMIZERS_CLS.' % key)
LEGACY_OPTIMIZERS_CLS[key] = optimizer_config_cls
else:
if key in NEW_OPTIMIZERS_CLS:
raise ValueError('%s already registered in NEW_OPTIMIZERS_CLS.' % key)
NEW_OPTIMIZERS_CLS[key] = optimizer_config_cls
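A short sketch of registering a custom optimizer; the key and subclass below are hypothetical:

```python
# Sketch: register a hypothetical optimizer subclass under a new key.
class MySGD(tf.keras.optimizers.legacy.SGD):
  pass

register_optimizer_cls('my_sgd', MySGD, use_legacy_optimizer=True)
assert 'my_sgd' in LEGACY_OPTIMIZERS_CLS
```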
class OptimizerFactory:
......@@ -84,6 +111,8 @@ class OptimizerFactory:
(4) Build optimizer.
This is a typical example for using this class:
```
params = {
'optimizer': {
'type': 'sgd',
......@@ -103,6 +132,7 @@ class OptimizerFactory:
opt_factory = OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(lr)
```
"""
def __init__(self, config: opt_cfg.OptimizationConfig):
......@@ -155,11 +185,15 @@ class OptimizerFactory:
def build_optimizer(
self,
lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float],
gradient_aggregator: Optional[Callable[
    [List[Tuple[tf.Tensor, tf.Tensor]]],
    List[Tuple[tf.Tensor, tf.Tensor]]]] = None,
gradient_transformers: Optional[List[Callable[
    [List[Tuple[tf.Tensor, tf.Tensor]]],
    List[Tuple[tf.Tensor, tf.Tensor]]]]] = None,
postprocessor: Optional[Callable[[tf.keras.optimizers.Optimizer],
                                 tf.keras.optimizers.Optimizer]] = None,
use_legacy_optimizer: bool = True):
"""Build optimizer.
Builds optimizer from config. It takes learning rate as input, and builds
......@@ -169,6 +203,7 @@ class OptimizerFactory:
Args:
lr: A floating point value, or a
tf.keras.optimizers.schedules.LearningRateSchedule instance.
gradient_aggregator: Optional function to overwrite gradient aggregation.
gradient_transformers: Optional list of functions to use to transform
gradients before applying updates to Variables. The functions are
applied after gradient_aggregator. The functions should accept and
......@@ -176,9 +211,11 @@ class OptimizerFactory:
global_clipnorm should not be set when gradient_transformers is passed.
postprocessor: An optional function for postprocessing the optimizer. It
takes an optimizer and returns an optimizer.
use_legacy_optimizer: A boolean that indicates if using legacy optimizers.
Returns:
`tf.keras.optimizers.legacy.Optimizer` or
`tf.keras.optimizers.experimental.Optimizer` instance.
"""
optimizer_dict = self._optimizer_config.as_dict()
......@@ -191,18 +228,39 @@ class OptimizerFactory:
del optimizer_dict['global_clipnorm']
optimizer_dict['learning_rate'] = lr
if gradient_aggregator is not None:
optimizer_dict['gradient_aggregator'] = gradient_aggregator
if gradient_transformers is not None:
optimizer_dict['gradient_transformers'] = gradient_transformers
if use_legacy_optimizer:
optimizer = LEGACY_OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
else:
if 'decay' in optimizer_dict:
raise ValueError(
'`decay` is deprecated in new Keras optimizer, please reflect the '
'decay logic in `lr` or set `use_legacy_optimizer=True` to use the '
'legacy optimizer.')
optimizer = NEW_OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
if self._use_ema:
if not use_legacy_optimizer:
raise ValueError(
'EMA can only work with the legacy optimizer, please set '
'`use_legacy_optimizer=True`.')
optimizer = ema_optimizer.ExponentialMovingAverage(
optimizer, **self._ema_config.as_dict())
if postprocessor:
optimizer = postprocessor(optimizer)
if isinstance(optimizer, tf.keras.optimizers.Optimizer):
return optimizer
# The following check makes sure the function won't break in older TF
# version because of missing the experimental/legacy package.
if hasattr(tf.keras.optimizers, 'experimental'):
if isinstance(optimizer, tf.keras.optimizers.experimental.Optimizer):
return optimizer
if hasattr(tf.keras.optimizers, 'legacy'):
if isinstance(optimizer, tf.keras.optimizers.legacy.Optimizer):
return optimizer
raise TypeError('OptimizerFactory.build_optimizer returning a '
'non-optimizer object: {}'.format(optimizer))
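An end-to-end sketch of the factory with the new flag; the config values are illustrative, and `adamw_experimental` assumes the corresponding field exists in `OptimizationConfig`:

```python
# Sketch (illustrative): build a new-style Keras optimizer via the factory.
params = {
    'optimizer': {'type': 'adamw_experimental'},
    'learning_rate': {'type': 'constant', 'constant': {'learning_rate': 1e-4}},
}
opt_config = opt_cfg.OptimizationConfig(params)
opt_factory = OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(lr, use_legacy_optimizer=False)
```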
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -37,7 +37,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
}
}
}
optimizer_cls = optimizer_factory.LEGACY_OPTIMIZERS_CLS[optimizer_type]
expected_optimizer_config = optimizer_cls().get_config()
expected_optimizer_config['learning_rate'] = 0.1
......@@ -49,6 +49,72 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
self.assertIsInstance(optimizer, optimizer_cls)
self.assertEqual(expected_optimizer_config, optimizer.get_config())
@parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'), ('lamb'),
('lars'), ('adagrad'))
def test_new_optimizers(self, optimizer_type):
params = {
'optimizer': {
'type': optimizer_type
},
'learning_rate': {
'type': 'constant',
'constant': {
'learning_rate': 0.1
}
}
}
optimizer_cls = optimizer_factory.NEW_OPTIMIZERS_CLS[optimizer_type]
expected_optimizer_config = optimizer_cls().get_config()
expected_optimizer_config['learning_rate'] = 0.1
opt_config = optimization_config.OptimizationConfig(params)
if optimizer_type == 'sgd':
# Delete unsupported arg `decay` from SGDConfig.
delattr(opt_config.optimizer.sgd, 'decay')
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(
lr, postprocessor=lambda x: x, use_legacy_optimizer=False)
self.assertIsInstance(optimizer, optimizer_cls)
self.assertEqual(expected_optimizer_config, optimizer.get_config())
def test_gradient_aggregator(self):
params = {
'optimizer': {
'type': 'adam',
},
'learning_rate': {
'type': 'constant',
'constant': {
'learning_rate': 1.0
}
}
}
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
# Dummy function to zero out gradients.
zero_grads = lambda gv: [(tf.zeros_like(g), v) for g, v in gv]
optimizer = opt_factory.build_optimizer(lr, gradient_aggregator=zero_grads)
if isinstance(optimizer, tf.keras.optimizers.experimental.Optimizer):
self.skipTest('New Keras optimizer does not support '
'`gradient_aggregator` arg.')
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0, 4.0])
grads0 = tf.constant([1.0, 1.0])
grads1 = tf.constant([1.0, 1.0])
grads_and_vars = list(zip([grads0, grads1], [var0, var1]))
optimizer.apply_gradients(grads_and_vars)
self.assertAllClose(np.array([1.0, 2.0]), var0.numpy())
self.assertAllClose(np.array([3.0, 4.0]), var1.numpy())
@parameterized.parameters((None, None), (1.0, None), (None, 1.0))
def test_gradient_clipping(self, clipnorm, clipvalue):
params = {
......@@ -107,6 +173,25 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
optimizer_factory.OptimizerFactory(
optimization_config.OptimizationConfig(params))
def test_wrong_return_type(self):
optimizer_type = 'sgd'
params = {
'optimizer': {
'type': optimizer_type
},
'learning_rate': {
'type': 'constant',
'constant': {
'learning_rate': 0.1
}
}
}
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
with self.assertRaises(TypeError):
_ = opt_factory.build_optimizer(0.1, postprocessor=lambda x: None)
# TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`.
......@@ -418,7 +503,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
}
}
}
expected_lr_step_values = [[0, 0.0], [5000, 1e-4 / 2.0], [10000, 1e-4],
[20000, 9.994863e-05], [499999, 5e-05]]
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
......@@ -434,10 +519,12 @@ class OptimizerFactoryRegistryTest(tf.test.TestCase):
class MyClass():
pass
optimizer_factory.register_optimizer_cls('test', MyClass)
self.assertIn('test', optimizer_factory.LEGACY_OPTIMIZERS_CLS)
with self.assertRaisesRegex(ValueError, 'test already registered.*'):
optimizer_factory.register_optimizer_cls('test', MyClass)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Configs for differential privacy."""
import dataclasses
from official.modeling.hyperparams import base_config
@dataclasses.dataclass
class DifferentialPrivacyConfig(base_config.Config):
# Applied to the gradients
# Setting to a large number so nothing is clipped.
clipping_norm: float = 100000000.0  # 10^8
noise_multiplier: float = 0.0
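A small sketch of constructing the config with non-default values; in DP-SGD the noise stddev is commonly derived as `clipping_norm * noise_multiplier`, though that wiring is an assumption about the surrounding trainer, not something this file defines:

```python
# Sketch (illustrative values): a config that actually clips and adds noise.
dp_config = DifferentialPrivacyConfig(clipping_norm=1.0, noise_multiplier=1.1)
# Assumed wiring: the stddev passed to the noise transform.
noise_stddev = dp_config.clipping_norm * dp_config.noise_multiplier
```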
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for configs."""
import tensorflow as tf
from official.modeling.privacy import configs
class ConfigsTest(tf.test.TestCase):
def test_clipping_norm_default(self):
clipping_norm = configs.DifferentialPrivacyConfig().clipping_norm
self.assertEqual(100000000.0, clipping_norm)
def test_noise_multiplier_default(self):
noise_multiplier = configs.DifferentialPrivacyConfig().noise_multiplier
self.assertEqual(0.0, noise_multiplier)
def test_config(self):
dp_config = configs.DifferentialPrivacyConfig(
clipping_norm=1.0,
noise_multiplier=1.0,
)
self.assertEqual(1.0, dp_config.clipping_norm)
self.assertEqual(1.0, dp_config.noise_multiplier)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ops for differential privacy (gradient) transforms."""
from typing import List, Tuple
import tensorflow as tf
def clip_l2_norm(grads_vars: List[Tuple[tf.Tensor, tf.Tensor]],
l2_norm_clip: float) -> List[Tuple[tf.Tensor, tf.Tensor]]:
"""Clip gradients by global norm."""
gradients = []
variables = []
for (g, v) in grads_vars:
gradients.append(g)
variables.append(v)
clipped_gradients = tf.clip_by_global_norm(gradients, l2_norm_clip)[0]
return list(zip(clipped_gradients, variables))
def add_noise(grads_vars: List[Tuple[tf.Tensor, tf.Tensor]],
noise_stddev: float) -> List[Tuple[tf.Tensor, tf.Tensor]]:
"""Add noise to gradients."""
ret = []
for (g, v) in grads_vars:
noise = tf.random.normal(tf.shape(g), stddev=noise_stddev)
ret.append((g + noise, v))
return ret
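A brief sketch chaining the two transforms in the usual DP-SGD order, clip then noise; the values are illustrative:

```python
import tensorflow as tf

# Sketch: clip-then-noise over a single gradient/variable pair.
v = tf.Variable([3.0, 4.0])
grads_vars = [(tf.constant([3.0, 4.0]), v)]
clipped = clip_l2_norm(grads_vars, l2_norm_clip=1.0)  # global norm 5.0 -> 1.0
noised = add_noise(clipped, noise_stddev=1.1)  # e.g. clipping_norm * multiplier
```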
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for ops."""
from unittest import mock
import tensorflow as tf
from official.modeling.privacy import ops
class OpsTest(tf.test.TestCase):
def test_clip_l2_norm(self):
x = tf.constant([4.0, 3.0])
y = tf.constant([[12.0]])
tensors = [(x, x), (y, y)]
clipped = ops.clip_l2_norm(tensors, 1.0)
for a, b in zip(clipped, tensors):
self.assertAllClose(a[0], b[0] / 13.0)  # sqrt(4^2 + 3^2 + 12^2) = 13
self.assertAllClose(a[1], b[1])
@mock.patch.object(tf.random,
'normal',
autospec=True)
def test_add_noise(self, mock_random):
x = tf.constant([0.0, 0.0])
y = tf.constant([[0.0]])
tensors = [(x, x), (y, y)]
mock_random.side_effect = [tf.constant([1.0, 1.0]), tf.constant([[1.0]])]
added = ops.add_noise(tensors, 10.0)
for a, b in zip(added, tensors):
self.assertAllClose(a[0], b[0] + 1.0)
self.assertAllClose(a[1], b[1])
_, kwargs = mock_random.call_args
self.assertEqual(kwargs['stddev'], 10.0)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,6 +14,7 @@
"""Common TF utilities."""
import functools
import six
import tensorflow as tf
......@@ -82,19 +83,22 @@ def is_special_none_tensor(tensor):
return tensor.shape.ndims == 0 and tensor.dtype == tf.int32
def get_activation(identifier, use_keras_layer=False, **kwargs):
"""Maps an identifier to a Python function, e.g., "relu" => `tf.nn.relu`.
It checks the string first, and if it is one of the customized activations not
in TF, the corresponding activation will be returned. For non-customized
activation names and callable identifiers, it always falls back to
tf.keras.activations.get.
Prefers using keras layers when use_keras_layer=True. Now it only supports
'relu', 'linear', 'identity', 'swish', 'mish', 'leaky_relu', and 'gelu'.
Args:
identifier: String name of the activation function or callable.
use_keras_layer: If True, use keras layer if identifier is allow-listed.
**kwargs: Keyword arguments to use to instantiate an activation function.
Available only for 'leaky_relu' and 'gelu' when using keras layers.
For example: get_activation('leaky_relu', use_keras_layer=True, alpha=0.1)
Returns:
A Python function corresponding to the activation function or a keras
......@@ -110,8 +114,11 @@ def get_activation(identifier, use_keras_layer=False):
"swish": "swish",
"sigmoid": "sigmoid",
"relu6": tf.nn.relu6,
"leaky_relu": functools.partial(tf.nn.leaky_relu, **kwargs),
"hard_swish": activations.hard_swish,
"hard_sigmoid": activations.hard_sigmoid,
"mish": activations.mish,
"gelu": functools.partial(tf.nn.gelu, **kwargs),
}
if identifier in keras_layer_allowlist:
return tf.keras.layers.Activation(keras_layer_allowlist[identifier])
......@@ -122,6 +129,7 @@ def get_activation(identifier, use_keras_layer=False):
"relu6": activations.relu6,
"hard_sigmoid": activations.hard_sigmoid,
"identity": activations.identity,
"mish": activations.mish,
}
if identifier in name_to_fn:
return tf.keras.activations.get(name_to_fn[identifier])
......@@ -201,3 +209,85 @@ def safe_mean(losses):
total = tf.reduce_sum(losses)
num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
return tf.math.divide_no_nan(total, num_elements)
def get_replica_id():
"""Gets replica id depending on the environment."""
context = tf.distribute.get_replica_context()
if context is not None:
return context.replica_id_in_sync_group
else:
raise RuntimeError("Unknown replica context. The `get_replica_id` method "
"relies on TF 2.x tf.distribute API.")
def cross_replica_concat(value, axis, name="cross_replica_concat"):
"""Concatenates the given `value` across (GPU/TPU) cores, along `axis`.
In general, each core ("replica") will pass a
replica-specific value as `value` (corresponding to some element of a
data-parallel computation taking place across replicas).
The resulting concatenated `Tensor` will have the same shape as `value` for
all dimensions except `axis`, where it will be larger by a factor of the
number of replicas. It will also have the same `dtype` as `value`.
The position of a given replica's `value` within the resulting concatenation
is determined by that replica's replica ID. For
example:
With `value` for replica 0 given as
0 0 0
0 0 0
and `value` for replica 1 given as
1 1 1
1 1 1
the resulting concatenation along axis 0 will be
0 0 0
0 0 0
1 1 1
1 1 1
and this result will be identical across all replicas.
Note that this API only works in TF2 with `tf.distribute`.
Args:
value: The `Tensor` to concatenate across replicas. Each replica will have a
different value for this `Tensor`, and these replica-specific values will
be concatenated.
axis: The axis along which to perform the concatenation as a Python integer
(not a `Tensor`). E.g., `axis=0` to concatenate along the batch dimension.
name: A name for the operation (used to create a name scope).
Returns:
The result of concatenating `value` along `axis` across replicas.
Raises:
RuntimeError: when the batch (0-th) dimension is None.
"""
with tf.name_scope(name):
context = tf.distribute.get_replica_context()
# Typically this is hit only if the tensor is derived from a dataset
# with finite epochs and drop_remainder=False, where the last batch may
# have a different batch size, making dim 0 dynamic.
if value.shape.as_list()[0] is None:
raise RuntimeError(f"{value} has unknown batch.")
return context.all_gather(value, axis=axis)
def clone_initializer(initializer):
# Keras initializers are going to be stateless, which means reusing the
# same initializer will produce the same init value when the shapes are
# the same.
if isinstance(initializer, tf.keras.initializers.Initializer):
return initializer.__class__.from_config(initializer.get_config())
# When the input is a string/dict or other serialized config, the caller
# will create a new Keras initializer instance based on it, so we don't
# need to do anything.
return initializer
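A quick sketch of the intended use: give each layer its own clone so that, once Keras initializers become stateless, same-shaped weights do not end up with identical values (layer sizes here are illustrative):

```python
# Sketch: clone per layer instead of sharing one initializer instance.
init = tf.keras.initializers.GlorotUniform()
dense_a = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(init))
dense_b = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(init))
```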
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tf_utils."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.modeling import tf_utils
def all_strategy_combinations():
return combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.mirrored_strategy_with_two_gpus,
],
mode='eager',
)
class TFUtilsTest(tf.test.TestCase, parameterized.TestCase):
@combinations.generate(all_strategy_combinations())
def test_cross_replica_concat(self, strategy):
num_cores = strategy.num_replicas_in_sync
shape = (2, 3, 4)
def concat(axis):
@tf.function
def function():
replica_value = tf.fill(shape, tf_utils.get_replica_id())
return tf_utils.cross_replica_concat(replica_value, axis=axis)
return function
def expected(axis):
values = [np.full(shape, i) for i in range(num_cores)]
return np.concatenate(values, axis=axis)
per_replica_results = strategy.run(concat(axis=0))
replica_0_result = per_replica_results.values[0].numpy()
for value in per_replica_results.values[1:]:
self.assertAllClose(value.numpy(), replica_0_result)
self.assertAllClose(replica_0_result, expected(axis=0))
replica_0_result = strategy.run(concat(axis=1)).values[0].numpy()
self.assertAllClose(replica_0_result, expected(axis=1))
replica_0_result = strategy.run(concat(axis=2)).values[0].numpy()
self.assertAllClose(replica_0_result, expected(axis=2))
@combinations.generate(all_strategy_combinations())
def test_cross_replica_concat_gradient(self, strategy):
num_cores = strategy.num_replicas_in_sync
shape = (10, 5)
@tf.function
def function():
replica_value = tf.random.normal(shape)
with tf.GradientTape() as tape:
tape.watch(replica_value)
concat_value = tf_utils.cross_replica_concat(replica_value, axis=0)
output = tf.reduce_sum(concat_value)
return tape.gradient(output, replica_value)
per_replica_gradients = strategy.run(function)
for gradient in per_replica_gradients.values:
self.assertAllClose(gradient, num_cores * tf.ones(shape))
@parameterized.parameters(('relu', True), ('relu', False),
('leaky_relu', False), ('leaky_relu', True),
('mish', True), ('mish', False), ('gelu', True))
def test_get_activations(self, name, use_keras_layer):
fn = tf_utils.get_activation(name, use_keras_layer)
self.assertIsNotNone(fn)
@combinations.generate(all_strategy_combinations())
def test_get_leaky_relu_layer(self, strategy):
@tf.function
def forward(x):
fn = tf_utils.get_activation(
'leaky_relu', use_keras_layer=True, alpha=0.1)
return strategy.run(fn, args=(x,)).values[0]
got = forward(tf.constant([-1]))
self.assertAllClose(got, tf.constant([-0.1]))
if __name__ == '__main__':
tf.test.main()
six
google-api-python-client>=1.6.7
kaggle>=1.3.9
numpy>=1.20
oauth2client
pandas>=0.22.0
psutil>=5.4.3
py-cpuinfo>=3.3.0
scipy>=0.19.1
tensorflow-hub>=0.6.0
tensorflow-model-optimization>=0.4.1
tensorflow-datasets
tfa-nightly
gin-config
tf_slim>=1.1.0
Cython
matplotlib
# Loader becomes a required positional argument of yaml.load in PyYAML 6.0
pyyaml>=5.1,<6.0
# CV related dependencies
opencv-python-headless==4.5.2.52
Pillow
pycocotools
# NLP related dependencies
seqeval
sentencepiece
sacrebleu
# Projects/vit dependencies
immutabledict
......@@ -2,53 +2,69 @@
## Introduction
The TF-NLP library provides a collection of scripts for training and
evaluating transformer-based models on various tasks such as sentence
classification, question answering, and translation. Additionally, we provide
checkpoints of pretrained models which can be finetuned on downstream tasks.
### How to Train Models
Model Garden can be easily installed with
`pip install tf-models-nightly`. After installation, check out
[this instruction](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
on how to train models with this codebase.
By default, the experiment runs on GPUs. To run on TPUs, one should overwrite
`runtime.distribution_strategy` and set the TPU address. See [RuntimeConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py) for details.
In general, the experiments can be run with the following command by setting
the corresponding `${EXPERIMENT}`, `${TASK_CONFIG}`, and `${MODEL_CONFIG}`.
```
EXPERIMENT=???
TASK_CONFIG=???
MODEL_CONFIG=???
EXTRA_PARAMS=???
MODEL_DIR=??? # a-folder-to-hold-checkpoints-and-logs
python3 train.py \
--experiment=${EXPERIMENT} \
--mode=train_and_eval \
--model_dir=${MODEL_DIR} \
--config_file=${TASK_CONFIG} \
--config_file=${MODEL_CONFIG} \
--params_override=${EXTRA_PARAMS}
```
* `EXPERIMENT` can be found under `configs/`
* `TASK_CONFIG` can be found under `configs/experiments/`
* `MODEL_CONFIG` can be found under `configs/models/`
#### Order of params override:
1. `train.py` looks up the registered `ExperimentConfig` with `${EXPERIMENT}`
2. Overrides params in `TaskConfig` in `${TASK_CONFIG}`
3. Overrides params `model` in `TaskConfig` with `${MODEL_CONFIG}`
4. Overrides any params in `ExperimentConfig` with `${EXTRA_PARAMS}`
Note that
1. `${TASK_CONFIG}`, `${MODEL_CONFIG}`, and `${EXTRA_PARAMS}` can be omitted when the `${EXPERIMENT}` defaults are enough.
2. `${TASK_CONFIG}`, `${MODEL_CONFIG}`, and `${EXTRA_PARAMS}` are only guaranteed to be compatible with the `${EXPERIMENT}` that defines them.
## Experiments
| NAME | EXPERIMENT | TASK_CONFIG | MODEL_CONFIG | EXTRA_PARAMS |
| ----------------- | ------------------------ | ------- | -------- | ----------- |
| BERT-base GLUE/MNLI-matched finetune | [bert/sentence_prediction](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [glue_mnli_matched.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/glue_mnli_matched.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4 </details> |
| BERT-base GLUE/MNLI-matched finetune | [bert/sentence_prediction](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [glue_mnli_matched.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/glue_mnli_matched.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base ckpt init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.init_checkpoint=gs://tf_model_garden/nlp/bert/uncased_L-12_H-768_A-12/bert_model.ckpt </details> |
| BERT-base SQuAD v1.1 finetune | [bert/squad](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [squad_v1.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/squad_v1.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4 </details> |
|ALBERT-base SQuAD v1.1 finetune | [bert/squad](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [squad_v1.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/squad_v1.yaml) | [albert_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/albert_base.yaml)| <details> <summary>data and albert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/albert_en_base/3 </details>|
| Transformer-large WMT14/en-de scratch |[wmt_transformer/large](https://github.com/tensorflow/models/blob/master/official/nlp/configs/wmt_transformer_experiments.py)| | | <details> <summary>ende-32k sentencepiece</summary>task.sentencepiece_model_path='gs://tf_model_garden/nlp/transformer_wmt/ende_bpe_32k.model'</details> |
## Useful links
[How to Train Models](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
[List of Pretrained Models for finetuning](https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md)
[How to Publish Models](https://github.com/tensorflow/models/blob/master/official/nlp/docs/tfhub.md)
......
# TF-NLP Model Garden
⚠️ Disclaimer: All datasets hyperlinked from this page are not owned or
distributed by Google. The datasets are made available by third parties. Please
review the terms and conditions made available by the third parties before using
the data.
This codebase provides a Natural Language Processing modeling toolkit written in
[TF2](https://www.tensorflow.org/guide/effective_tf2). It allows researchers and
developers to reproduce state-of-the-art model results and train custom models
to experiment with new research ideas.
## Features
* Reusable and modularized modeling building blocks
* State-of-the-art reproducible
* Easy to customize and extend
* End-to-end training
* Distributed trainable on both GPUs and TPUs
## Major components
### Libraries
We provide a modeling library to allow users to train custom models for new
research ideas. Detailed instructions can be found in READMEs in each folder.
* [modeling/](modeling): modeling library that provides building blocks
(e.g., Layers, Networks, and Models) that can be assembled into
transformer-based architectures.
* [data/](data): binaries and utils for input preprocessing, tokenization,
etc.
......@@ -30,27 +35,29 @@ research ideas. Detailed intructions can be found in READMEs in each folder.
We provide SoTA model implementations, pre-trained models, training and
evaluation examples, and command lines. Detailed instructions can be found in the
READMEs for specific papers. Below are some papers implemented in the
repository; more NLP projects can be found in the
[`projects`](https://github.com/tensorflow/models/tree/master/official/projects)
folder:
1. [BERT](MODEL_GARDEN.md#available-model-configs): [BERT: Pre-training of Deep
Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al., 2018
2. [ALBERT](MODEL_GARDEN.md#available-model-configs):
[A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942)
by Lan et al., 2019
3. [XLNet](MODEL_GARDEN.md):
[XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237)
by Yang et al., 2019
4. [Transformer for translation](MODEL_GARDEN.md#available-model-configs):
[Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Vaswani et
al., 2017
### Common Training Driver
We provide a single common driver [train.py](train.py) to train the above SoTA
models on popular tasks. Please see [docs/train.md](docs/train.md) for more
details.
### Pre-trained models with checkpoints and TF-Hub
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# BERT (Bidirectional Encoder Representations from Transformers)
**WARNING**: We are in the process of deprecating most of the code in this directory.
Please see
[this link](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
for the new tutorial and use the new code in `nlp/modeling`. This README is
still correct for this legacy implementation.
The academic paper which describes BERT in detail and provides full results on a
number of tasks can be found here: https://arxiv.org/abs/1810.04805.
This repository contains TensorFlow 2.x implementation for BERT.
## Contents
* [Contents](#contents)
* [Pre-trained Models](#pre-trained-models)
* [Restoring from Checkpoints](#restoring-from-checkpoints)
* [Set Up](#set-up)
* [Process Datasets](#process-datasets)
* [Fine-tuning with BERT](#fine-tuning-with-bert)
* [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
* [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
* [SQuAD 1.1](#squad-1.1)
## Pre-trained Models
We released both checkpoints and tf.hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and are converted from the checkpoints
released in the TF 1.x official BERT repository
[google-research/bert](https://github.com/google-research/bert)
in order to stay consistent with the BERT paper.
### Access to Pretrained Checkpoints
Pretrained checkpoints can be found in the following links:
**Note: We have switched the BERT implementation
to use Keras functional-style networks in [nlp/modeling](../modeling).
The new checkpoints are:**
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
We recommend hosting checkpoints in Google Cloud Storage buckets when you use
Cloud GPUs/TPUs.
### Restoring from Checkpoints
`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
weights from provided pre-trained checkpoints, you can use the following code:
```python
init_checkpoint='the pretrained model checkpoint path.'
model=tf.keras.Model() # Bert pre-trained model as feature extractor.
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
```
Checkpoints featuring native serialized Keras models
(i.e. model.load()/load_weights()) will be available soon.
### Access to Pretrained Hub Modules
Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
following links:
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads,
110M parameters
## Set Up
```shell
export PYTHONPATH="$PYTHONPATH:/path/to/models"
```
Install `tf-nightly` to get latest updates:
```shell
pip install tf-nightly-gpu
```
When using a TPU, GPU support is not necessary. First, you need to create a `tf-nightly`
TPU with [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
```shell
ctpu up -name <instance name> --tf-version="nightly"
```
Second, you need to install TF 2 `tf-nightly` on your VM:
```shell
pip install tf-nightly
```
## Process Datasets
### Pre-training
Generating pre-training data is unchanged. Please use the script
[`../data/create_pretraining_data.py`](../data/create_pretraining_data.py),
which is essentially branched from the
[BERT research repo](https://github.com/google-research/bert)
and adapted to TF2 symbols and Python 3 compatibility.
Running the pre-training script requires an input and output directory, as well
as a vocab file. Note that `max_seq_length` will need to match the sequence
length parameter you specify when you run pre-training.

Example shell script to call `create_pretraining_data.py`:
```
export WORKING_DIR='local disk or cloud location'
export BERT_DIR='local disk or cloud location'
python models/official/nlp/data/create_pretraining_data.py \
--input_file=$WORKING_DIR/input/input.txt \
--output_file=$WORKING_DIR/output/tf_examples.tfrecord \
--vocab_file=$BERT_DIR/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5
```
### Fine-tuning
To prepare the fine-tuning data for final model training, use the
[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script.
Resulting datasets in `tf_record` format and training metadata should later be
passed to training or evaluation scripts. The task-specific arguments are
described in the following sections:
* GLUE
Users can download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
Also, users can download a [pretrained checkpoint](#access-to-pretrained-checkpoints) and place it in some directory `$BERT_DIR` instead of using checkpoints on Google Cloud Storage.
```shell
export GLUE_DIR=~/glue
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TASK_NAME=MNLI
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
--eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
--fine_tuning_task_type=classification --max_seq_length=128 \
--classification_task_name=${TASK_NAME}
```
* SQUAD
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation.
The necessary files can be found here:
* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
```shell
export SQUAD_DIR=~/squad
export SQUAD_VERSION=v1.1
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
--fine_tuning_task_type=squad --max_seq_length=384
```
Note: To create fine-tuning data with SQuAD 2.0, you need to add the flag `--version_2_with_negative=True`.
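For example, a v2.0 variant of the command above (the same flags as the v1.1 example, plus the negative-examples switch) might look like:
```shell
export SQUAD_VERSION=v2.0
python ../data/create_finetuning_data.py \
  --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --fine_tuning_task_type=squad --max_seq_length=384 \
  --version_2_with_negative=True
```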
## Fine-tuning with BERT
### Cloud GPUs and TPUs
* Cloud Storage
The unzipped pre-trained model files can also be found in the Google Cloud
Storage folder `gs://cloud-tpu-checkpoints/bert/keras_bert`. For example:
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
```
Currently, users are able to access `tf-nightly` TPUs, and the following TPU
script should run with `tf-nightly`.
* GPU -> TPU
Just add the following flags to `run_classifier.py` or `run_squad.py`:
```shell
--distribution_strategy=tpu
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
### Sentence and Sentence-pair Classification Tasks
This example code fine-tunes `BERT-Large` on the Microsoft Research Paraphrase
Corpus (MRPC), which contains only 3,600 examples and can fine-tune in a few
minutes on most GPUs.
We use `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
If your GPU has 16GB of memory or less, you may try `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--eval_batch_size=4 \
--steps_per_loop=1 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
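To monitor fine-tuning progress, you can point TensorBoard at the model directory (summaries are written under `MODEL_DIR` by the training loop; the exact subdirectory layout may vary across versions):
```shell
# Inspect training curves written to the model directory.
tensorboard --logdir=${MODEL_DIR}
```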
Alternatively, instead of specifying `init_checkpoint`, you can specify
`hub_module_url` to employ a pretrained BERT hub module, e.g.,
`--hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1`.
After training a model, to get predictions from the classifier, you can set
`--mode=predict` and supply the test set tfrecords via `--eval_data_path`.
The output will be written to a file called `test_results.tsv` in the output
folder; each line contains the output for one sample, with the columns being
the class probabilities (see the parsing sketch after the command below).
```shell
python run_classifier.py \
--mode='predict' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--eval_batch_size=4 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
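To turn the per-class probabilities in `test_results.tsv` into predicted labels, a small sketch (this assumes one whitespace-separated probability column per class, and that the file lands in `MODEL_DIR` as the output folder):
```shell
# Sketch: print each sample's index and the argmax column as its class id.
awk '{best=1; for (i=2; i<=NF; i++) if ($i > $best) best=i; print NR-1, best-1}' \
  ${MODEL_DIR}/test_results.tsv
```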
To use a TPU, you only need to switch the distribution strategy type to `tpu`,
provide the TPU information, and use remote storage for model checkpoints.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--eval_batch_size=32 \
--steps_per_loop=1000 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
Note that we specify `steps_per_loop=1000` for TPU because running a loop of
training steps inside a `tf.function` can significantly increase TPU
utilization; callbacks will not be called inside the loop.
### SQuAD 1.1
The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. See the [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) for more details.
We use `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
If your GPU has 16GB of memory or less, you may try `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export SQUAD_DIR=gs://some_bucket/datasets
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--predict_batch_size=4 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
Similarly, you can replace the `init_checkpoint` flag with `hub_module_url` to
specify a hub module path.
`run_squad.py` writes the predictions for `--predict_file` by default. If you
set `--mode=predict` and supply the SQuAD test data, the script will generate
the prediction JSON file, as sketched below.
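A minimal sketch of such a predict-only invocation, reusing the variables above (the exact flag set may vary by version):
```shell
# Sketch: generate predictions only, from a previously trained model_dir.
python run_squad.py \
  --mode=predict \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```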
To use a TPU, you only need to switch the distribution strategy type to `tpu`
and provide the TPU information.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_DIR=gs://some_bucket/datasets
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
The dev set predictions will be saved into a file called `predictions.json` in
the `model_dir`:
```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```
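If `MODEL_DIR` is on Cloud Storage, you may first need to copy the predictions locally before running the evaluation script above (a sketch using `gsutil`; the local paths are illustrative):
```shell
# Sketch: fetch predictions.json from the remote model directory.
mkdir -p ./squad
gsutil cp ${MODEL_DIR}/predictions.json ./squad/predictions.json
```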