Merge pull request #1 from tensorflow/master

new pull

Merge pull request #1 from tensorflow/master
new pull
f16a7b5b · vedanshu · GitHub · 8e9296ff · 8f58f396 · f16a7b5b
Unverified Commit f16a7b5b authored May 04, 2021 by vedanshu Committed by GitHub May 04, 2021
20 changed files
--- a/official/nlp/bert/configs.py
+++ b/official/nlp/bert/configs.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,15 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""The main BERT model and related functions."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""The main BERT model and related functions."""

 import copy
 import json
+
 import six
 import tensorflow as tf

@@ -105,4 +102,3 @@ class BertConfig(object):
  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
--- a/official/nlp/bert/export_tfhub.py
+++ b/official/nlp/bert/export_tfhub.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,18 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""A script to export the BERT core model as a TF-Hub SavedModel."""
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function

+"""A script to export BERT as a TF-Hub SavedModel.
+
+This script is **DEPRECATED** for exporting BERT encoder models;
+see the error message logged by main() for details.
+"""
+
+from typing import Text
+
+# Import libraries
 from absl import app
 from absl import flags
 from absl import logging
 import tensorflow as tf
-from typing import Text
 from official.nlp.bert import bert_models
 from official.nlp.bert import configs

@@ -35,9 +37,12 @@ flags.DEFINE_string("model_checkpoint_path", None,
 flags.DEFINE_string("export_path", None, "TF-Hub SavedModel destination path.")
 flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")
-flags.DEFINE_bool("do_lower_case", None, "Whether to lowercase. If None, "
-                  "do_lower_case will be enabled if 'uncased' appears in the "
-                  "name of --vocab_file")
+flags.DEFINE_bool(
+    "do_lower_case", None, "Whether to lowercase. If None, "
+    "do_lower_case will be enabled if 'uncased' appears in the "
+    "name of --vocab_file")
+flags.DEFINE_enum("model_type", "encoder", ["encoder", "squad"],
+                  "What kind of BERT model to export.")


 def create_bert_model(bert_config: configs.BertConfig) -> tf.keras.Model:
@@ -68,8 +73,10 @@ def create_bert_model(bert_config: configs.BertConfig) -> tf.keras.Model:


 def export_bert_tfhub(bert_config: configs.BertConfig,
-                      model_checkpoint_path: Text, hub_destination: Text,
-                      vocab_file: Text, do_lower_case: bool = None):
+                      model_checkpoint_path: Text,
+                      hub_destination: Text,
+                      vocab_file: Text,
+                      do_lower_case: bool = None):
  """Restores a tf.keras.Model and saves for TF-Hub."""
  # If do_lower_case is not explicit, default to checking whether "uncased" is
  # in the vocab file name
@@ -78,17 +85,54 @@ def export_bert_tfhub(bert_config: configs.BertConfig,
    logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
                 do_lower_case, vocab_file)
  core_model, encoder = create_bert_model(bert_config)
-  checkpoint = tf.train.Checkpoint(model=encoder)
+  checkpoint = tf.train.Checkpoint(
+      model=encoder,  # Legacy checkpoints.
+      encoder=encoder)
  checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
  core_model.vocab_file = tf.saved_model.Asset(vocab_file)
  core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
  core_model.save(hub_destination, include_optimizer=False, save_format="tf")


+def export_bert_squad_tfhub(bert_config: configs.BertConfig,
+                            model_checkpoint_path: Text,
+                            hub_destination: Text,
+                            vocab_file: Text,
+                            do_lower_case: bool = None):
+  """Restores a tf.keras.Model for BERT with SQuAD and saves for TF-Hub."""
+  # If do_lower_case is not explicit, default to checking whether "uncased" is
+  # in the vocab file name
+  if do_lower_case is None:
+    do_lower_case = "uncased" in vocab_file
+    logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
+                 do_lower_case, vocab_file)
+  span_labeling, _ = bert_models.squad_model(bert_config, max_seq_length=None)
+  checkpoint = tf.train.Checkpoint(model=span_labeling)
+  checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
+  span_labeling.vocab_file = tf.saved_model.Asset(vocab_file)
+  span_labeling.do_lower_case = tf.Variable(do_lower_case, trainable=False)
+  span_labeling.save(hub_destination, include_optimizer=False, save_format="tf")
+
+
 def main(_):
  bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
-  export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path, FLAGS.export_path,
-                    FLAGS.vocab_file, FLAGS.do_lower_case)
+  if FLAGS.model_type == "encoder":
+    deprecation_note = (
+        "nlp/bert/export_tfhub is **DEPRECATED** for exporting BERT encoder "
+        "models. Please switch to nlp/tools/export_tfhub for exporting BERT "
+        "(and other) encoders with dict inputs/outputs conforming to "
+        "https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders"
+    )
+    logging.error(deprecation_note)
+    print("\n\nNOTICE:", deprecation_note, "\n")
+    export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path,
+                      FLAGS.export_path, FLAGS.vocab_file, FLAGS.do_lower_case)
+  elif FLAGS.model_type == "squad":
+    export_bert_squad_tfhub(bert_config, FLAGS.model_checkpoint_path,
+                            FLAGS.export_path, FLAGS.vocab_file,
+                            FLAGS.do_lower_case)
+  else:
+    raise ValueError("Unsupported model_type %s." % FLAGS.model_type)


 if __name__ == "__main__":

--- a/official/nlp/bert/export_tfhub_test.py
+++ b/official/nlp/bert/export_tfhub_test.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,26 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Tests official.nlp.bert.export_tfhub."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Tests official.nlp.bert.export_tfhub."""

 import os

+from absl.testing import parameterized
 import numpy as np
-
 import tensorflow as tf
 import tensorflow_hub as hub
+
 from official.nlp.bert import configs
 from official.nlp.bert import export_tfhub


-class ExportTfhubTest(tf.test.TestCase):
+class ExportTfhubTest(tf.test.TestCase, parameterized.TestCase):

-  def test_export_tfhub(self):
+  @parameterized.parameters("model", "encoder")
+  def test_export_tfhub(self, ckpt_key_name):
    # Exports a savedmodel for TF-Hub
    hidden_size = 16
    bert_config = configs.BertConfig(
@@ -42,7 +40,7 @@ class ExportTfhubTest(tf.test.TestCase):
        num_hidden_layers=1)
    bert_model, encoder = export_tfhub.create_bert_model(bert_config)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
-    checkpoint = tf.train.Checkpoint(model=encoder)
+    checkpoint = tf.train.Checkpoint(**{ckpt_key_name: encoder})
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

@@ -91,6 +89,7 @@ class ExportTfhubTest(tf.test.TestCase):
      outputs = np.concatenate(
          [hub_layer(inputs, training=training)[0] for _ in range(num_runs)])
      return np.mean(np.std(outputs, axis=0))
+
    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)


--- a/official/nlp/bert/input_pipeline.py
+++ b/official/nlp/bert/input_pipeline.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""BERT model input pipelines."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""BERT model input pipelines."""

 import tensorflow as tf

@@ -36,11 +32,13 @@ def decode_record(record, name_to_features):
  return example


-def single_file_dataset(input_file, name_to_features):
+def single_file_dataset(input_file, name_to_features, num_samples=None):
  """Creates a single-file dataset to be passed for BERT custom training."""
  # For training, we want a lot of parallel reading and shuffling.
  # For eval, we want no shuffling and parallel reading doesn't matter.
  d = tf.data.TFRecordDataset(input_file)
+  if num_samples:
+    d = d.take(num_samples)
  d = d.map(
      lambda record: decode_record(record, name_to_features),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
@@ -156,7 +154,8 @@ def create_classifier_dataset(file_path,
                              is_training=True,
                              input_pipeline_context=None,
                              label_type=tf.int64,
-                              include_sample_weights=False):
+                              include_sample_weights=False,
+                              num_samples=None):
  """Creates input dataset from (tf)records files for train/eval."""
  name_to_features = {
      'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
@@ -166,7 +165,8 @@ def create_classifier_dataset(file_path,
  }
  if include_sample_weights:
    name_to_features['weight'] = tf.io.FixedLenFeature([], tf.float32)
-  dataset = single_file_dataset(file_path, name_to_features)
+  dataset = single_file_dataset(file_path, name_to_features,
+                                num_samples=num_samples)

  # The dataset is always sharded by number of hosts.
  # num_input_pipelines is the number of hosts rather than number of cores.
@@ -258,7 +258,7 @@ def create_retrieval_dataset(file_path,
      'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
      'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
      'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
-      'int_iden': tf.io.FixedLenFeature([1], tf.int64),
+      'example_id': tf.io.FixedLenFeature([1], tf.int64),
  }
  dataset = single_file_dataset(file_path, name_to_features)

@@ -274,12 +274,29 @@ def create_retrieval_dataset(file_path,
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
-    y = record['int_iden']
+    y = record['example_id']
    return (x, y)

  dataset = dataset.map(
      _select_data_from_record,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=False)
+
+  def _pad_to_batch(x, y):
+    cur_size = tf.shape(y)[0]
+    pad_size = batch_size - cur_size
+
+    pad_ids = tf.zeros(shape=[pad_size, seq_length], dtype=tf.int32)
+    for key in ('input_word_ids', 'input_mask', 'input_type_ids'):
+      x[key] = tf.concat([x[key], pad_ids], axis=0)
+
+    pad_labels = -tf.ones(shape=[pad_size, 1], dtype=tf.int32)
+    y = tf.concat([y, pad_labels], axis=0)
+    return x, y
+
+  dataset = dataset.map(
+      _pad_to_batch,
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset
--- a/official/nlp/bert/model_saving_utils.py
+++ b/official/nlp/bert/model_saving_utils.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,13 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Utilities to save models."""

-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
+"""Utilities to save models."""

 import os

@@ -32,19 +27,19 @@ def export_bert_model(model_export_path: typing.Text,
                      restore_model_using_load_weights: bool = False) -> None:
  """Export BERT model for serving which does not include the optimizer.

-  Arguments:
+  Args:
      model_export_path: Path to which exported model will be saved.
      model: Keras model object to export.
      checkpoint_dir: Path from which model weights will be loaded, if
        specified.
      restore_model_using_load_weights: Whether to use checkpoint.restore() API
-        for custom checkpoint or to use model.load_weights() API.
-        There are 2 different ways to save checkpoints. One is using
-        tf.train.Checkpoint and another is using Keras model.save_weights().
-        Custom training loop implementation uses tf.train.Checkpoint API
-        and Keras ModelCheckpoint callback internally uses model.save_weights()
-        API. Since these two API's cannot be used toghether, model loading logic
-        must be take into account how model checkpoint was saved.
+        for custom checkpoint or to use model.load_weights() API. There are 2
+        different ways to save checkpoints. One is using tf.train.Checkpoint and
+        another is using Keras model.save_weights(). Custom training loop
+        implementation uses tf.train.Checkpoint API and Keras ModelCheckpoint
+        callback internally uses model.save_weights() API. Since these two APIs
+        cannot be used together, model loading logic must take into account how
+        the model checkpoint was saved.

  Raises:
    ValueError when either model_export_path or model is not specified.
@@ -55,14 +50,10 @@ def export_bert_model(model_export_path: typing.Text,
    raise ValueError('model must be a tf.keras.Model object.')

  if checkpoint_dir:
-    # Keras compile/fit() was used to save checkpoint using
-    # model.save_weights().
    if restore_model_using_load_weights:
      model_weight_path = os.path.join(checkpoint_dir, 'checkpoint')
      assert tf.io.gfile.exists(model_weight_path)
      model.load_weights(model_weight_path)
-
-    # tf.train.Checkpoint API was used via custom training loop logic.
    else:
      checkpoint = tf.train.Checkpoint(model=model)


--- a/official/nlp/bert/model_training_utils.py
+++ b/official/nlp/bert/model_training_utils.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""A light weight utilities to train NLP models."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Lightweight utilities to train NLP models."""

 import json
 import os
@@ -25,8 +21,8 @@ import tempfile
 from absl import logging
 import tensorflow as tf
 from tensorflow.python.util import deprecation
+from official.common import distribute_utils
 from official.staging.training import grad_utils
-from official.utils.misc import distribution_utils

 _SUMMARY_TXT = 'training_summary.txt'
 _MIN_SUMMARY_STEPS = 10
@@ -65,8 +61,7 @@ def _get_input_iterator(input_fn, strategy):
  # pass callable that returns a dataset.
  if not callable(input_fn):
    raise ValueError('`input_fn` should be a closure that returns a dataset.')
-  iterator = iter(
-      strategy.experimental_distribute_datasets_from_function(input_fn))
+  iterator = iter(strategy.distribute_datasets_from_function(input_fn))
  return iterator


@@ -75,6 +70,13 @@ def _float_metric_value(metric):
  return metric.result().numpy().astype(float)


+def clip_by_global_norm_callback(grads_and_vars):
+  """Performs gradient clipping."""
+  grads, variables = zip(*grads_and_vars)
+  (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+  return zip(clipped_grads, variables)
+
+
 def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
  """Calculates steps to run on device."""
  if steps_per_loop <= 0:
@@ -126,10 +128,11 @@ def run_customized_training_loop(
    explicit_allreduce=False,
    pre_allreduce_callbacks=None,
    post_allreduce_callbacks=None,
-    train_summary_interval=0):
+    train_summary_interval=0,
+    allreduce_bytes_per_pack=0):
  """Run BERT pretrain model training using low-level API.

-  Arguments:
+  Args:
      _sentinel: Used to prevent positional parameters. Internal, do not use.
      strategy: Distribution strategy on which to run low level training loop.
      model_fn: Function that returns a tuple (model, sub_model). Caller of this
@@ -156,16 +159,16 @@ def run_customized_training_loop(
        evaluation is skipped.
      eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
        is not none.
-      metric_fn: A metrics function that returns a Keras Metric object to record
-        evaluation result using evaluation dataset or with training dataset
-        after every epoch.
+      metric_fn: A metrics function that returns either a Keras Metric object or
+        a list of Keras Metric objects to record evaluation result using
+        evaluation dataset or with training dataset after every epoch.
      init_checkpoint: Optional checkpoint to load to `sub_model` returned by
        `model_fn`.
      custom_callbacks: A list of Keras Callbacks objects to run during
        training. More specifically, `on_train_begin(), on_train_end(),
        on_batch_begin()`, `on_batch_end()`, `on_epoch_begin()`,
-        `on_epoch_end()` methods are invoked during training.
-        Note that some metrics may be missing from `logs`.
+        `on_epoch_end()` methods are invoked during training. Note that some
+        metrics may be missing from `logs`.
      run_eagerly: Whether to run model training in pure eager execution. This
        should be disable for TPUStrategy.
      sub_model_export_name: If not None, will export `sub_model` returned by
@@ -194,6 +197,11 @@ def run_customized_training_loop(
        when explicit_allreduce=True.
      train_summary_interval: Step interval for training summaries. If the value
        is a negative number, then training summaries are not enabled.
+      allreduce_bytes_per_pack: A non-negative integer. Breaks collective
+        operations into packs of certain size. If it's zero, all gradients are
+        in one pack. Breaking gradient into packs could enable overlap between
+        allreduce and backprop computation. This flag only takes effect when
+        explicit_allreduce is set to True.

  Returns:
      Trained model.
@@ -237,7 +245,9 @@ def run_customized_training_loop(
  assert tf.executing_eagerly()

  if run_eagerly:
-    if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
+    if isinstance(
+        strategy,
+        (tf.distribute.TPUStrategy, tf.distribute.experimental.TPUStrategy)):
      raise ValueError(
          'TPUStrategy should not run eagerly as it heavily relies on graph'
          ' optimization for the distributed system.')
@@ -253,7 +263,7 @@ def run_customized_training_loop(
  train_iterator = _get_input_iterator(train_input_fn, strategy)
  eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)

-  with distribution_utils.get_strategy_scope(strategy):
+  with distribute_utils.get_strategy_scope(strategy):
    # To correctly place the model weights on accelerators,
    # model and optimizer should be created in scope.
    model, sub_model = model_fn()
@@ -273,12 +283,14 @@ def run_customized_training_loop(
      logging.info(
          'Checkpoint file %s found and restoring from '
          'initial checkpoint for core model.', init_checkpoint)
-      checkpoint = tf.train.Checkpoint(model=sub_model)
-      checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
+      checkpoint = tf.train.Checkpoint(model=sub_model, encoder=sub_model)
+      checkpoint.read(init_checkpoint).assert_existing_objects_matched()
      logging.info('Loading from checkpoint file completed')

    train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-    eval_metrics = [metric_fn()] if metric_fn else []
+    eval_metrics = metric_fn() if metric_fn else []
+    if not isinstance(eval_metrics, list):
+      eval_metrics = [eval_metrics]
    # If evaluation is required, make a copy of metric as it will be used by
    # both train and evaluation.
    train_metrics = [
@@ -325,10 +337,10 @@ def run_customized_training_loop(
        grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
                                                     training_vars,
                                                     pre_allreduce_callbacks,
-                                                     post_allreduce_callbacks)
+                                                     post_allreduce_callbacks,
+                                                     allreduce_bytes_per_pack)
      else:
-        if isinstance(optimizer,
-                      tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+        if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
          with tape:
            scaled_loss = optimizer.get_scaled_loss(loss)
          scaled_grads = tape.gradient(scaled_loss, training_vars)
@@ -458,8 +470,7 @@ def run_customized_training_loop(
    callback_list.on_train_begin()
    while current_step < total_training_steps and not model.stop_training:
      if current_step % steps_per_epoch == 0:
-        callback_list.on_epoch_begin(
-            int(current_step / steps_per_epoch) + 1)
+        callback_list.on_epoch_begin(int(current_step / steps_per_epoch) + 1)

      # Training loss/metric are taking average over steps inside micro
      # training loop. We reset the their values before each round.
@@ -524,13 +535,14 @@ def run_customized_training_loop(
          _save_checkpoint(strategy, checkpoint, model_dir,
                           checkpoint_name.format(step=current_step))
          if eval_input_fn:
-            logging.info('Running evaluation after step: %s.', current_step)
-            logs = _run_evaluation(current_step,
-                                   _get_input_iterator(eval_input_fn, strategy))
            # Re-initialize evaluation metric.
            eval_loss_metric.reset_states()
            for metric in eval_metrics + model.metrics:
              metric.reset_states()
+
+            logging.info('Running evaluation after step: %s.', current_step)
+            logs = _run_evaluation(current_step,
+                                   _get_input_iterator(eval_input_fn, strategy))
        # We add train_loss here rather than call on_batch_end twice to make
        # sure that no duplicated values are generated.
        logs['loss'] = train_loss
@@ -548,6 +560,11 @@ def run_customized_training_loop(
    _save_checkpoint(strategy, checkpoint, model_dir,
                     checkpoint_name.format(step=current_step))
    if eval_input_fn:
+      # Re-initialize evaluation metric.
+      eval_loss_metric.reset_states()
+      for metric in eval_metrics + model.metrics:
+        metric.reset_states()
+
      logging.info('Running final evaluation after training is complete.')
      logs = _run_evaluation(current_step,
                             _get_input_iterator(eval_input_fn, strategy))

--- a/official/nlp/bert/model_training_utils_test.py
+++ b/official/nlp/bert/model_training_utils_test.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,16 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Tests for official.modeling.training.model_training_utils."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Tests for official.modeling.training.model_training_utils."""

 import os

 from absl import logging
+from absl.testing import flagsaver
 from absl.testing import parameterized
 from absl.testing.absltest import mock
 import numpy as np
@@ -28,20 +25,22 @@ import tensorflow as tf

 from tensorflow.python.distribute import combinations
 from tensorflow.python.distribute import strategy_combinations
+from official.nlp.bert import common_flags
 from official.nlp.bert import model_training_utils


+common_flags.define_common_bert_flags()
+
+
 def eager_strategy_combinations():
  return combinations.combine(
      distribution=[
          strategy_combinations.default_strategy,
-          strategy_combinations.tpu_strategy,
+          strategy_combinations.cloud_tpu_strategy,
          strategy_combinations.one_device_strategy_gpu,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
-      ],
-      mode='eager',
-  )
+      ],)


 def eager_gpu_strategy_combinations():
@@ -51,9 +50,7 @@ def eager_gpu_strategy_combinations():
          strategy_combinations.one_device_strategy_gpu,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
-      ],
-      mode='eager',
-  )
+      ],)


 def create_fake_data_input_fn(batch_size, features_shape, num_classes):
@@ -106,9 +103,8 @@ def create_model_fn(input_shape, num_classes, use_float16=False):
        tf.reduce_mean(input_layer), name='mean_input', aggregation='mean')
    model.optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
    if use_float16:
-      model.optimizer = (
-          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-              model.optimizer, loss_scale='dynamic'))
+      model.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
+          model.optimizer)
    return model, sub_model

  return _model_fn
@@ -139,9 +135,9 @@ class RecordingCallback(tf.keras.callbacks.Callback):

  def __init__(self):
    self.batch_begin = []  # (batch, logs)
-    self.batch_end = []    # (batch, logs)
+    self.batch_end = []  # (batch, logs)
    self.epoch_begin = []  # (epoch, logs)
-    self.epoch_end = []    # (epoch, logs)
+    self.epoch_end = []  # (epoch, logs)

  def on_batch_begin(self, batch, logs=None):
    self.batch_begin.append((batch, logs))
@@ -162,6 +158,7 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
    super(ModelTrainingUtilsTest, self).setUp()
    self._model_fn = create_model_fn(input_shape=[128], num_classes=3)

+  @flagsaver.flagsaver
  def run_training(self, strategy, model_dir, steps_per_loop, run_eagerly):
    input_fn = create_fake_data_input_fn(
        batch_size=8, features_shape=[128], num_classes=3)
@@ -184,8 +181,10 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):

  @combinations.generate(eager_strategy_combinations())
  def test_train_eager_single_step(self, distribution):
-    model_dir = self.get_temp_dir()
-    if isinstance(distribution, tf.distribute.experimental.TPUStrategy):
+    model_dir = self.create_tempdir().full_path
+    if isinstance(
+        distribution,
+        (tf.distribute.TPUStrategy, tf.distribute.experimental.TPUStrategy)):
      with self.assertRaises(ValueError):
        self.run_training(
            distribution, model_dir, steps_per_loop=1, run_eagerly=True)
@@ -195,9 +194,8 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):

  @combinations.generate(eager_gpu_strategy_combinations())
  def test_train_eager_mixed_precision(self, distribution):
-    model_dir = self.get_temp_dir()
-    policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
-    tf.keras.mixed_precision.experimental.set_policy(policy)
+    model_dir = self.create_tempdir().full_path
+    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    self._model_fn = create_model_fn(
        input_shape=[128], num_classes=3, use_float16=True)
    self.run_training(
@@ -205,24 +203,26 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):

  @combinations.generate(eager_strategy_combinations())
  def test_train_check_artifacts(self, distribution):
-    model_dir = self.get_temp_dir()
+    model_dir = self.create_tempdir().full_path
    self.run_training(
        distribution, model_dir, steps_per_loop=10, run_eagerly=False)

    # Two checkpoints should be saved after two epochs.
    files = map(os.path.basename,
                tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*index')))
-    self.assertCountEqual(['ctl_step_20.ckpt-1.index',
-                           'ctl_step_40.ckpt-2.index'], files)
+    self.assertCountEqual(
+        ['ctl_step_20.ckpt-1.index', 'ctl_step_40.ckpt-2.index'], files)

    # Three submodel checkpoints should be saved after two epochs (one after
    # each epoch plus one final).
-    files = map(os.path.basename,
-                tf.io.gfile.glob(os.path.join(model_dir,
-                                              'my_submodel_name*index')))
-    self.assertCountEqual(['my_submodel_name.ckpt-3.index',
-                           'my_submodel_name_step_20.ckpt-1.index',
-                           'my_submodel_name_step_40.ckpt-2.index'], files)
+    files = map(
+        os.path.basename,
+        tf.io.gfile.glob(os.path.join(model_dir, 'my_submodel_name*index')))
+    self.assertCountEqual([
+        'my_submodel_name.ckpt-3.index',
+        'my_submodel_name_step_20.ckpt-1.index',
+        'my_submodel_name_step_40.ckpt-2.index'
+    ], files)

    self.assertNotEmpty(
        tf.io.gfile.glob(
@@ -247,7 +247,7 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):

  @combinations.generate(eager_strategy_combinations())
  def test_train_check_callbacks(self, distribution):
-    model_dir = self.get_temp_dir()
+    model_dir = self.create_tempdir().full_path
    callback = RecordingCallback()
    callbacks = [callback]
    input_fn = create_fake_data_input_fn(
@@ -286,9 +286,7 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
      combinations.combine(
          distribution=[
              strategy_combinations.one_device_strategy_gpu,
-          ],
-          mode='eager',
-      ))
+          ],))
  def test_train_check_artifacts_non_chief(self, distribution):
    # We shouldn't export artifacts on non-chief workers. Since there's no easy
    # way to test with real MultiWorkerMirroredStrategy, we patch the strategy
@@ -298,7 +296,7 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
                           new_callable=mock.PropertyMock, return_value=False), \
         mock.patch.object(extended.__class__, 'should_save_summary',
                           new_callable=mock.PropertyMock, return_value=False):
-      model_dir = self.get_temp_dir()
+      model_dir = self.create_tempdir().full_path
      self.run_training(
          distribution, model_dir, steps_per_loop=10, run_eagerly=False)
      self.assertEmpty(tf.io.gfile.listdir(model_dir))

--- a/official/nlp/bert/run_classifier.py
+++ b/official/nlp/bert/run_classifier.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,22 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """BERT classification or regression finetuning runner in TF 2.x."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

 import functools
 import json
 import math
 import os

+# Import libraries
 from absl import app
 from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
@@ -34,7 +33,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_saving_utils
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils

 flags.DEFINE_enum(
@@ -52,6 +50,9 @@ flags.DEFINE_string(
    'input_meta_data_path', None,
    'Path to file that contains meta data about input '
    'to be used for training and evaluation.')
+flags.DEFINE_integer('train_data_size', None, 'Number of training samples '
+                     'to use. If None, uses the full train data. '
+                     '(default: None).')
 flags.DEFINE_string('predict_checkpoint_path', None,
                    'Path to the checkpoint for predictions.')
 flags.DEFINE_integer(
@@ -91,7 +92,8 @@ def get_dataset_fn(input_file_pattern,
                   global_batch_size,
                   is_training,
                   label_type=tf.int64,
-                   include_sample_weights=False):
+                   include_sample_weights=False,
+                   num_samples=None):
  """Gets a closure to create a dataset."""

  def _dataset_fn(ctx=None):
@@ -105,7 +107,8 @@ def get_dataset_fn(input_file_pattern,
        is_training=is_training,
        input_pipeline_context=ctx,
        label_type=label_type,
-        include_sample_weights=include_sample_weights)
+        include_sample_weights=include_sample_weights,
+        num_samples=num_samples)
    return dataset

  return _dataset_fn
@@ -216,8 +219,8 @@ def run_keras_compile_fit(model_dir,
    optimizer = bert_model.optimizer

    if init_checkpoint:
-      checkpoint = tf.train.Checkpoint(model=sub_model)
-      checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
+      checkpoint = tf.train.Checkpoint(model=sub_model, encoder=sub_model)
+      checkpoint.read(init_checkpoint).assert_existing_objects_matched()

    if not isinstance(metric_fn, (list, tuple)):
      metric_fn = [metric_fn]
@@ -225,7 +228,7 @@ def run_keras_compile_fit(model_dir,
        optimizer=optimizer,
        loss=loss_fn,
        metrics=[fn() for fn in metric_fn],
-        experimental_steps_per_execution=steps_per_loop)
+        steps_per_execution=steps_per_loop)

    summary_dir = os.path.join(model_dir, 'summaries')
    summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
@@ -262,6 +265,7 @@ def run_keras_compile_fit(model_dir,
 def get_predictions_and_labels(strategy,
                               trained_model,
                               eval_input_fn,
+                               is_regression=False,
                               return_probs=False):
  """Obtains predictions of trained model on evaluation data.

@@ -272,6 +276,7 @@ def get_predictions_and_labels(strategy,
    strategy: Distribution strategy.
    trained_model: Trained model with preloaded weights.
    eval_input_fn: Input function for evaluation data.
+    is_regression: Whether it is a regression task.
    return_probs: Whether to return probabilities of classes.

  Returns:
@@ -287,8 +292,11 @@ def get_predictions_and_labels(strategy,
      """Replicated predictions."""
      inputs, labels = inputs
      logits = trained_model(inputs, training=False)
-      probabilities = tf.nn.softmax(logits)
-      return probabilities, labels
+      if not is_regression:
+        probabilities = tf.nn.softmax(logits)
+        return probabilities, labels
+      else:
+        return logits, labels

    outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),))
    # outputs: current batch logits as a tuple of shard logits
@@ -314,8 +322,7 @@ def get_predictions_and_labels(strategy,
      tf.experimental.async_clear_error()
    return preds, golds

-  test_iter = iter(
-      strategy.experimental_distribute_datasets_from_function(eval_input_fn))
+  test_iter = iter(strategy.distribute_datasets_from_function(eval_input_fn))
  predictions, labels = _run_evaluation(test_iter)

  return predictions, labels
@@ -341,9 +348,12 @@ def export_classifier(model_export_path, input_meta_data, bert_config,
    raise ValueError('Export path is not specified: %s' % model_dir)

  # Export uses float32 for now, even if training uses mixed precision.
-  tf.keras.mixed_precision.experimental.set_policy('float32')
+  tf.keras.mixed_precision.set_global_policy('float32')
  classifier_model = bert_models.classifier_model(
-      bert_config, input_meta_data.get('num_labels', 1))[0]
+      bert_config,
+      input_meta_data.get('num_labels', 1),
+      hub_module_url=FLAGS.hub_module_url,
+      hub_module_trainable=False)[0]

  model_saving_utils.export_bert_model(
      model_export_path, model=classifier_model, checkpoint_dir=model_dir)
@@ -365,6 +375,9 @@ def run_bert(strategy,
  epochs = FLAGS.num_train_epochs * FLAGS.num_eval_per_epoch
  train_data_size = (
      input_meta_data['train_data_size'] // FLAGS.num_eval_per_epoch)
+  if FLAGS.train_data_size:
+    train_data_size = min(train_data_size, FLAGS.train_data_size)
+    logging.info('Updated train_data_size: %s', train_data_size)
  steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
  eval_steps = int(
@@ -430,7 +443,7 @@ def custom_main(custom_callbacks=None, custom_metrics=None):
                      FLAGS.model_dir)
    return

-  strategy = distribution_utils.get_distribution_strategy(
+  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)
@@ -443,9 +456,10 @@ def custom_main(custom_callbacks=None, custom_metrics=None):
      include_sample_weights=include_sample_weights)

  if FLAGS.mode == 'predict':
+    num_labels = input_meta_data.get('num_labels', 1)
    with strategy.scope():
      classifier_model = bert_models.classifier_model(
-          bert_config, input_meta_data['num_labels'])[0]
+          bert_config, num_labels)[0]
      checkpoint = tf.train.Checkpoint(model=classifier_model)
      latest_checkpoint_file = (
          FLAGS.predict_checkpoint_path or
@@ -456,7 +470,11 @@ def custom_main(custom_callbacks=None, custom_metrics=None):
      checkpoint.restore(
          latest_checkpoint_file).assert_existing_objects_matched()
      preds, _ = get_predictions_and_labels(
-          strategy, classifier_model, eval_input_fn, return_probs=True)
+          strategy,
+          classifier_model,
+          eval_input_fn,
+          is_regression=(num_labels == 1),
+          return_probs=True)
    output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv')
    with tf.io.gfile.GFile(output_predict_file, 'w') as writer:
      logging.info('***** Predict results *****')
@@ -475,7 +493,8 @@ def custom_main(custom_callbacks=None, custom_metrics=None):
      FLAGS.train_batch_size,
      is_training=True,
      label_type=label_type,
-      include_sample_weights=include_sample_weights)
+      include_sample_weights=include_sample_weights,
+      num_samples=FLAGS.train_data_size)
  run_bert(
      strategy,
      input_meta_data,

--- a/official/nlp/bert/run_pretraining.py
+++ b/official/nlp/bert/run_pretraining.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,17 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Run masked LM/next sentence pre-training for BERT in TF 2.x."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

+# Import libraries
 from absl import app
 from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
@@ -29,7 +28,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_training_utils
-from official.utils.misc import distribution_utils


 flags.DEFINE_string('input_files', None,
@@ -105,7 +103,11 @@ def run_customized_training(strategy,
                            train_batch_size,
                            use_next_sentence_label=True,
                            train_summary_interval=0,
-                            custom_callbacks=None):
+                            custom_callbacks=None,
+                            explicit_allreduce=False,
+                            pre_allreduce_callbacks=None,
+                            post_allreduce_callbacks=None,
+                            allreduce_bytes_per_pack=0):
  """Run BERT pretrain model training using low-level API."""

  train_input_fn = get_pretrain_dataset_fn(input_files, max_seq_length,
@@ -139,6 +141,10 @@ def run_customized_training(strategy,
      steps_per_loop=steps_per_loop,
      epochs=epochs,
      sub_model_export_name='pretrained/bert_model',
+      explicit_allreduce=explicit_allreduce,
+      pre_allreduce_callbacks=pre_allreduce_callbacks,
+      post_allreduce_callbacks=post_allreduce_callbacks,
+      allreduce_bytes_per_pack=allreduce_bytes_per_pack,
      train_summary_interval=train_summary_interval,
      custom_callbacks=custom_callbacks)

@@ -158,6 +164,12 @@ def run_bert_pretrain(strategy, custom_callbacks=None):

  performance.set_mixed_precision_policy(common_flags.dtype())

+  # post_allreduce_callbacks and allreduce_bytes_per_pack only take effect when
+  # explicit_allreduce = True. In that mode optimizer.apply_gradients() no
+  # longer allreduces gradients implicitly; gradients are allreduced manually
+  # and the allreduced grads_and_vars are passed to apply_gradients().
+  # clip_by_global_norm is passed as a pre-allreduce callback, so with
+  # explicit_allreduce = True clipping is applied before allreduce.
  return run_customized_training(
      strategy,
      bert_config,
@@ -176,16 +188,25 @@ def run_bert_pretrain(strategy, custom_callbacks=None):
      FLAGS.train_batch_size,
      FLAGS.use_next_sentence_label,
      FLAGS.train_summary_interval,
-      custom_callbacks=custom_callbacks)
+      custom_callbacks=custom_callbacks,
+      explicit_allreduce=FLAGS.explicit_allreduce,
+      pre_allreduce_callbacks=[
+          model_training_utils.clip_by_global_norm_callback
+      ],
+      allreduce_bytes_per_pack=FLAGS.allreduce_bytes_per_pack)


 def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
  if not FLAGS.model_dir:
    FLAGS.model_dir = '/tmp/bert20/'
-  strategy = distribution_utils.get_distribution_strategy(
+  # Configures cluster spec for multi-worker distribution strategy.
+  if FLAGS.num_gpus > 0:
+    _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
+      all_reduce_alg=FLAGS.all_reduce_alg,
      tpu_address=FLAGS.tpu)
  if strategy:
    print('***** Number of cores used : ', strategy.num_replicas_in_sync)

--- a/official/nlp/bert/run_squad.py
+++ b/official/nlp/bert/run_squad.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,28 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Run BERT on SQuAD 1.1 and SQuAD 2.0 in TF 2.x."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Run BERT on SQuAD 1.1 and SQuAD 2.0 in TF 2.x."""

 import json
 import os
 import time

+# Import libraries
 from absl import app
 from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
-
+from official.common import distribute_utils
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import run_squad_helper
 from official.nlp.bert import tokenization
 from official.nlp.data import squad_lib as squad_lib_wp
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils


@@ -104,9 +100,8 @@ def main(_):

  # Configures cluster spec for multi-worker distribution strategy.
  if FLAGS.num_gpus > 0:
-    _ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                             FLAGS.task_index)
-  strategy = distribution_utils.get_distribution_strategy(
+    _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      all_reduce_alg=FLAGS.all_reduce_alg,

--- a/official/nlp/bert/run_squad_helper.py
+++ b/official/nlp/bert/run_squad_helper.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,15 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Library for running BERT family models on SQuAD 1.1/2.0 in TF 2.x."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

 import collections
 import json
 import os
+
 from absl import flags
 from absl import logging
 import tensorflow as tf
@@ -39,10 +37,10 @@ from official.utils.misc import keras_utils
 def define_common_squad_flags():
  """Defines common flags used by SQuAD tasks."""
  flags.DEFINE_enum(
-      'mode', 'train_and_eval',
-      ['train_and_eval', 'train_and_predict',
-       'train', 'eval', 'predict', 'export_only'],
-      'One of {"train_and_eval", "train_and_predict", '
+      'mode', 'train_and_eval', [
+          'train_and_eval', 'train_and_predict', 'train', 'eval', 'predict',
+          'export_only'
+      ], 'One of {"train_and_eval", "train_and_predict", '
      '"train", "eval", "predict", "export_only"}. '
      '`train_and_eval`: train & predict to json files & compute eval metrics. '
      '`train_and_predict`: train & predict to json files. '
@@ -60,12 +58,12 @@ def define_common_squad_flags():
  # Model training specific flags.
  flags.DEFINE_integer('train_batch_size', 32, 'Total batch size for training.')
  # Predict processing related.
-  flags.DEFINE_string('predict_file', None,
-                      'SQuAD prediction json file path. '
-                      '`predict` mode supports multiple files: one can use '
-                      'wildcard to specify multiple files and it can also be '
-                      'multiple file patterns separated by comma. Note that '
-                      '`eval` mode only supports a single predict file.')
+  flags.DEFINE_string(
+      'predict_file', None, 'SQuAD prediction json file path. '
+      '`predict` mode supports multiple files: one can use '
+      'wildcard to specify multiple files and it can also be '
+      'multiple file patterns separated by comma. Note that '
+      '`eval` mode only supports a single predict file.')
  flags.DEFINE_bool(
      'do_lower_case', True,
      'Whether to lower case the input text. Should be True for uncased '
@@ -97,10 +95,7 @@ def define_common_squad_flags():
 FLAGS = flags.FLAGS


-def squad_loss_fn(start_positions,
-                  end_positions,
-                  start_logits,
-                  end_logits):
+def squad_loss_fn(start_positions, end_positions, start_logits, end_logits):
  """Returns sparse categorical crossentropy for start/end logits."""
  start_loss = tf.keras.losses.sparse_categorical_crossentropy(
      start_positions, start_logits, from_logits=True)
@@ -118,11 +113,8 @@ def get_loss_fn():
    start_positions = labels['start_positions']
    end_positions = labels['end_positions']
    start_logits, end_logits = model_outputs
-    return squad_loss_fn(
-        start_positions,
-        end_positions,
-        start_logits,
-        end_logits)
+    return squad_loss_fn(start_positions, end_positions, start_logits,
+                         end_logits)

  return _loss_fn

@@ -168,7 +160,7 @@ def get_squad_model_to_predict(strategy, bert_config, checkpoint_path,
  """Gets a squad model to make predictions."""
  with strategy.scope():
    # Prediction always uses float32, even if training uses mixed precision.
-    tf.keras.mixed_precision.experimental.set_policy('float32')
+    tf.keras.mixed_precision.set_global_policy('float32')
    squad_model, _ = bert_models.squad_model(
        bert_config,
        input_meta_data['max_seq_length'],
@@ -182,11 +174,8 @@ def get_squad_model_to_predict(strategy, bert_config, checkpoint_path,
  return squad_model


-def predict_squad_customized(strategy,
-                             input_meta_data,
-                             predict_tfrecord_path,
-                             num_steps,
-                             squad_model):
+def predict_squad_customized(strategy, input_meta_data, predict_tfrecord_path,
+                             num_steps, squad_model):
  """Make predictions using a Bert-based squad model."""
  predict_dataset_fn = get_dataset_fn(
      predict_tfrecord_path,
@@ -194,8 +183,7 @@ def predict_squad_customized(strategy,
      FLAGS.predict_batch_size,
      is_training=False)
  predict_iterator = iter(
-      strategy.experimental_distribute_datasets_from_function(
-          predict_dataset_fn))
+      strategy.distribute_datasets_from_function(predict_dataset_fn))

  @tf.function
  def predict_step(iterator):
@@ -259,8 +247,7 @@ def train_squad(strategy,
        hub_module_trainable=FLAGS.hub_module_trainable)
    optimizer = optimization.create_optimizer(FLAGS.learning_rate,
                                              steps_per_epoch * epochs,
-                                              warmup_steps,
-                                              FLAGS.end_lr,
+                                              warmup_steps, FLAGS.end_lr,
                                              FLAGS.optimizer_type)

    squad_model.optimizer = performance.configure_optimizer(
@@ -269,15 +256,12 @@ def train_squad(strategy,
        use_graph_rewrite=common_flags.use_graph_rewrite())
    return squad_model, core_model

-  # If explicit_allreduce = True, apply_gradients() no longer implicitly
-  # allreduce gradients, users manually allreduce gradient and pass the
-  # allreduced grads_and_vars to apply_gradients(). clip_by_global_norm will be
-  # applied to allreduced gradients.
-  def clip_by_global_norm_callback(grads_and_vars):
-    grads, variables = zip(*grads_and_vars)
-    (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
-    return zip(clipped_grads, variables)
-
+  # post_allreduce_callbacks and allreduce_bytes_per_pack only take effect when
+  # explicit_allreduce = True. In that mode optimizer.apply_gradients() no
+  # longer allreduces gradients implicitly; gradients are allreduced manually
+  # and the allreduced grads_and_vars are passed to apply_gradients().
+  # clip_by_global_norm is passed as a pre-allreduce callback, so with
+  # explicit_allreduce = True clipping is applied before allreduce.
  model_training_utils.run_customized_training_loop(
      strategy=strategy,
      model_fn=_get_squad_model,
@@ -291,8 +275,11 @@ def train_squad(strategy,
      sub_model_export_name=sub_model_export_name,
      run_eagerly=run_eagerly,
      custom_callbacks=custom_callbacks,
-      explicit_allreduce=False,
-      post_allreduce_callbacks=[clip_by_global_norm_callback])
+      explicit_allreduce=FLAGS.explicit_allreduce,
+      pre_allreduce_callbacks=[
+          model_training_utils.clip_by_global_norm_callback
+      ],
+      allreduce_bytes_per_pack=FLAGS.allreduce_bytes_per_pack)


 def prediction_output_squad(strategy, input_meta_data, tokenizer, squad_lib,
@@ -344,8 +331,9 @@ def prediction_output_squad(strategy, input_meta_data, tokenizer, squad_lib,
  logging.info('  Batch size = %d', FLAGS.predict_batch_size)

  num_steps = int(dataset_size / FLAGS.predict_batch_size)
-  all_results = predict_squad_customized(
-      strategy, input_meta_data, eval_writer.filename, num_steps, squad_model)
+  all_results = predict_squad_customized(strategy, input_meta_data,
+                                         eval_writer.filename, num_steps,
+                                         squad_model)

  all_predictions, all_nbest_json, scores_diff_json = (
      squad_lib.postprocess_output(
@@ -362,8 +350,12 @@ def prediction_output_squad(strategy, input_meta_data, tokenizer, squad_lib,
  return all_predictions, all_nbest_json, scores_diff_json


-def dump_to_files(all_predictions, all_nbest_json, scores_diff_json,
-                  squad_lib, version_2_with_negative, file_prefix=''):
+def dump_to_files(all_predictions,
+                  all_nbest_json,
+                  scores_diff_json,
+                  squad_lib,
+                  version_2_with_negative,
+                  file_prefix=''):
  """Save output to json files."""
  output_prediction_file = os.path.join(FLAGS.model_dir,
                                        '%spredictions.json' % file_prefix)
@@ -452,8 +444,7 @@ def eval_squad(strategy,
    dataset_json = json.load(reader)
    pred_dataset = dataset_json['data']
  if input_meta_data.get('version_2_with_negative', False):
-    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset,
-                                                all_predictions,
+    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions,
                                                scores_diff_json)
  else:
    eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
@@ -474,7 +465,7 @@ def export_squad(model_export_path, input_meta_data, bert_config):
  if not model_export_path:
    raise ValueError('Export path is not specified: %s' % model_export_path)
  # Export uses float32 for now, even if training uses mixed precision.
-  tf.keras.mixed_precision.experimental.set_policy('float32')
+  tf.keras.mixed_precision.set_global_policy('float32')
  squad_model, _ = bert_models.squad_model(bert_config,
                                           input_meta_data['max_seq_length'])
  model_saving_utils.export_bert_model(

--- a/official/nlp/bert/serving.py
+++ b/official/nlp/bert/serving.py
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Examples of SavedModel export for tf-serving."""

 from absl import app
@@ -22,11 +21,11 @@ import tensorflow as tf
 from official.nlp.bert import bert_models
 from official.nlp.bert import configs

-flags.DEFINE_integer("sequence_length", None,
-                     "Sequence length to parse the tf.Example. If "
-                     "sequence_length > 0, add a signature for serialized "
-                     "tf.Example and define the parsing specification by the "
-                     "sequence_length.")
+flags.DEFINE_integer(
+    "sequence_length", None, "Sequence length to parse the tf.Example. If "
+    "sequence_length > 0, add a signature for serialized "
+    "tf.Example and define the parsing specification by the "
+    "sequence_length.")
 flags.DEFINE_string("bert_config_file", None,
                    "Bert configuration file to define core bert layers.")
 flags.DEFINE_string("model_checkpoint_path", None,

--- a/official/nlp/bert/squad_evaluate_v1_1.py
+++ b/official/nlp/bert/squad_evaluate_v1_1.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -10,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Evaluation of SQuAD predictions (version 1.1).

 The functions are copied from
@@ -22,15 +23,12 @@ Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, Percy Liang
 https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import collections
 import re
 import string

 # pylint: disable=g-bad-import-order
+
 from absl import logging
 # pylint: enable=g-bad-import-order


--- a/official/nlp/bert/squad_evaluate_v2_0.py
+++ b/official/nlp/bert/squad_evaluate_v2_0.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -10,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Evaluation script for SQuAD version 2.0.

 The functions are copied and modified from
@@ -22,10 +23,6 @@ This file is expected to map question ID's to the model's predicted probability
 that a question is unanswerable.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import collections
 import re
 import string

--- a/official/nlp/bert/tf1_checkpoint_converter_lib.py
+++ b/official/nlp/bert/tf1_checkpoint_converter_lib.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,11 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 r"""Convert checkpoints created by Estimator (tf1) to be Keras compatible."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

 import numpy as np
 import tensorflow.compat.v1 as tf  # TF 1.x
@@ -53,6 +50,7 @@ BERT_V2_NAME_REPLACEMENTS = (
    ("output/dense", "output"),
    ("output/LayerNorm", "output_layer_norm"),
    ("pooler/dense", "pooler_transform"),
+    ("cls/predictions", "bert/cls/predictions"),
    ("cls/predictions/output_bias", "cls/predictions/output_bias/bias"),
    ("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"),
    ("cls/seq_relationship/output_weights",
@@ -111,11 +109,20 @@ def _get_new_shape(name, shape, num_heads):
  return None


-def create_v2_checkpoint(model, src_checkpoint, output_path):
+def create_v2_checkpoint(model,
+                         src_checkpoint,
+                         output_path,
+                         checkpoint_model_name="model"):
  """Converts a name-based matched TF V1 checkpoint to TF V2 checkpoint."""
  # Uses streaming-restore in eager model to read V1 name-based checkpoints.
  model.load_weights(src_checkpoint).assert_existing_objects_matched()
-  checkpoint = tf.train.Checkpoint(model=model)
+  if hasattr(model, "checkpoint_items"):
+    checkpoint_items = model.checkpoint_items
+  else:
+    checkpoint_items = {}
+
+  checkpoint_items[checkpoint_model_name] = model
+  checkpoint = tf.train.Checkpoint(**checkpoint_items)
  checkpoint.save(output_path)


@@ -164,7 +171,6 @@ def convert(checkpoint_from_path,
        new_shape = _get_new_shape(new_var_name, tensor.shape, num_heads)
      if new_shape:
        tf.logging.info("Veriable %s has a shape change from %s to %s",
-
                        var_name, tensor.shape, new_shape)
        tensor = np.reshape(tensor, new_shape)


--- a/official/nlp/bert/tf2_encoder_checkpoint_converter.py
+++ b/official/nlp/bert/tf2_encoder_checkpoint_converter.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,15 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """A converter from a V1 BERT encoder checkpoint to a V2 encoder checkpoint.

 The conversion will yield an object-oriented checkpoint that can be used
-to restore a TransformerEncoder object.
+to restore a BertEncoder or BertPretrainerV2 object (see the `converted_model`
+FLAG below).
 """
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

 import os

@@ -27,9 +25,10 @@ from absl import app
 from absl import flags

 import tensorflow as tf
-from official.modeling import activations
+from official.modeling import tf_utils
 from official.nlp.bert import configs
 from official.nlp.bert import tf1_checkpoint_converter_lib
+from official.nlp.modeling import models
 from official.nlp.modeling import networks

 FLAGS = flags.FLAGS
@@ -42,6 +41,14 @@ flags.DEFINE_string(
    "BertModel, with no task heads.)")
 flags.DEFINE_string("converted_checkpoint_path", None,
                    "Name for the created object-based V2 checkpoint.")
+flags.DEFINE_string("checkpoint_model_name", "encoder",
+                    "The name of the model when saving the checkpoint, i.e., "
+                    "the checkpoint will be saved using: "
+                    "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).")
+flags.DEFINE_enum(
+    "converted_model", "encoder", ["encoder", "pretrainer"],
+    "Whether to convert the checkpoint to a `BertEncoder` model or a "
+    "`BertPretrainerV2` model (with mlm but without classification heads).")


 def _create_bert_model(cfg):
@@ -49,19 +56,20 @@ def _create_bert_model(cfg):

  Args:
    cfg: A `BertConfig` to create the core model.
+
  Returns:
-    A TransformerEncoder netowork.
+    A BertEncoder network.
  """
-  bert_encoder = networks.TransformerEncoder(
+  bert_encoder = networks.BertEncoder(
      vocab_size=cfg.vocab_size,
      hidden_size=cfg.hidden_size,
      num_layers=cfg.num_hidden_layers,
      num_attention_heads=cfg.num_attention_heads,
      intermediate_size=cfg.intermediate_size,
-      activation=activations.gelu,
+      activation=tf_utils.get_activation(cfg.hidden_act),
      dropout_rate=cfg.hidden_dropout_prob,
      attention_dropout_rate=cfg.attention_probs_dropout_prob,
-      sequence_length=cfg.max_position_embeddings,
+      max_sequence_length=cfg.max_position_embeddings,
      type_vocab_size=cfg.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=cfg.initializer_range),
@@ -70,13 +78,39 @@ def _create_bert_model(cfg):
  return bert_encoder


-def convert_checkpoint(bert_config, output_path, v1_checkpoint):
+def _create_bert_pretrainer_model(cfg):
+  """Creates a BertPretrainerV2 Keras model from a BERT configuration.
+
+  Args:
+    cfg: A `BertConfig` for the encoder and the MLM activation/initializer.
+
+  Returns:
+    A BertPretrainerV2 model built on a newly created BERT encoder.
+  """
+  bert_encoder = _create_bert_model(cfg)
+  pretrainer = models.BertPretrainerV2(
+      encoder_network=bert_encoder,
+      mlm_activation=tf_utils.get_activation(cfg.hidden_act),
+      mlm_initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=cfg.initializer_range))
+  # Calls the model on its own inputs to make sure its variables are created.
+  _ = pretrainer(pretrainer.inputs)
+  return pretrainer
+
+
+def convert_checkpoint(bert_config,
+                       output_path,
+                       v1_checkpoint,
+                       checkpoint_model_name="model",
+                       converted_model="encoder"):
  """Converts a V1 checkpoint into an OO V2 checkpoint."""
  output_dir, _ = os.path.split(output_path)
+  tf.io.gfile.makedirs(output_dir)

  # Create a temporary V1 name-converted checkpoint in the output directory.
  temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1")
  temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt")
+
  tf1_checkpoint_converter_lib.convert(
      checkpoint_from_path=v1_checkpoint,
      checkpoint_to_path=temporary_checkpoint,
@@ -85,10 +119,17 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint):
      permutations=tf1_checkpoint_converter_lib.BERT_V2_PERMUTATIONS,
      exclude_patterns=["adam", "Adam"])

+  if converted_model == "encoder":
+    model = _create_bert_model(bert_config)
+  elif converted_model == "pretrainer":
+    model = _create_bert_pretrainer_model(bert_config)
+  else:
+    raise ValueError("Unsupported converted_model: %s" % converted_model)
+
  # Create a V2 checkpoint from the temporary checkpoint.
-  model = _create_bert_model(bert_config)
  tf1_checkpoint_converter_lib.create_v2_checkpoint(model, temporary_checkpoint,
-                                                    output_path)
+                                                    output_path,
+                                                    checkpoint_model_name)

  # Clean up the temporary checkpoint, if it exists.
  try:
@@ -98,11 +139,21 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint):
    pass


-def main(_):
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
+
  output_path = FLAGS.converted_checkpoint_path
  v1_checkpoint = FLAGS.checkpoint_to_convert
+  checkpoint_model_name = FLAGS.checkpoint_model_name
+  converted_model = FLAGS.converted_model
  bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
-  convert_checkpoint(bert_config, output_path, v1_checkpoint)
+  convert_checkpoint(
+      bert_config=bert_config,
+      output_path=output_path,
+      v1_checkpoint=v1_checkpoint,
+      checkpoint_model_name=checkpoint_model_name,
+      converted_model=converted_model)


 if __name__ == "__main__":

--- a/official/nlp/bert/tokenization.py
+++ b/official/nlp/bert/tokenization.py
-# coding=utf-8
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,17 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
+# coding=utf-8
 """Tokenization classes implementation.

 The file is forked from:
 https://github.com/google-research/bert/blob/master/tokenization.py.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import collections
 import re
 import unicodedata
@@ -421,7 +417,7 @@ def preprocess_text(inputs, remove_space=True, lower=False):
  """Preprocesses data by removing extra space and normalize data.

  This method is used together with sentence piece tokenizer and is forked from:
-  https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py

  Args:
    inputs: The input text.
@@ -454,7 +450,7 @@ def encode_pieces(sp_model, text, sample=False):
  """Segements text into pieces.

  This method is used together with sentence piece tokenizer and is forked from:
-  https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py


  Args:
@@ -496,7 +492,7 @@ def encode_ids(sp_model, text, sample=False):
  """Segments text and return token ids.

  This method is used together with sentence piece tokenizer and is forked from:
-  https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.

--- a/official/nlp/bert/tokenization_test.py
+++ b/official/nlp/bert/tokenization_test.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,10 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

 import os
 import tempfile

--- a/official/nlp/configs/__init__.py
+++ b/official/nlp/configs/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

--- a/official/nlp/configs/bert.py
+++ b/official/nlp/configs/bert.py
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Multi-head BERT encoder network with classification heads.

 Includes configurations and instantiation methods.
@@ -20,13 +19,9 @@ Includes configurations and instantiation methods.
 from typing import List, Optional, Text

 import dataclasses
-import tensorflow as tf

-from official.modeling import tf_utils
 from official.modeling.hyperparams import base_config
 from official.nlp.configs import encoders
-from official.nlp.modeling import layers
-from official.nlp.modeling.models import bert_pretrainer


 @dataclasses.dataclass
@@ -40,32 +35,9 @@ class ClsHeadConfig(base_config.Config):


 @dataclasses.dataclass
-class BertPretrainerConfig(base_config.Config):
-  """BERT encoder configuration."""
-  encoder: encoders.TransformerEncoderConfig = (
-      encoders.TransformerEncoderConfig())
+class PretrainerConfig(base_config.Config):
+  """Pretrainer configuration."""
+  encoder: encoders.EncoderConfig = encoders.EncoderConfig()
  cls_heads: List[ClsHeadConfig] = dataclasses.field(default_factory=list)
-
-
-def instantiate_classification_heads_from_cfgs(
-    cls_head_configs: List[ClsHeadConfig]) -> List[layers.ClassificationHead]:
-  return [
-      layers.ClassificationHead(**cfg.as_dict()) for cfg in cls_head_configs
-    ] if cls_head_configs else []
-
-
-def instantiate_pretrainer_from_cfg(
-    config: BertPretrainerConfig,
-    encoder_network: Optional[tf.keras.Model] = None
-) -> bert_pretrainer.BertPretrainerV2:
-  """Instantiates a BertPretrainer from the config."""
-  encoder_cfg = config.encoder
-  if encoder_network is None:
-    encoder_network = encoders.instantiate_encoder_from_cfg(encoder_cfg)
-  return bert_pretrainer.BertPretrainerV2(
-      mlm_activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
-      mlm_initializer=tf.keras.initializers.TruncatedNormal(
-          stddev=encoder_cfg.initializer_range),
-      encoder_network=encoder_network,
-      classification_heads=instantiate_classification_heads_from_cfgs(
-          config.cls_heads))
+  mlm_activation: str = "gelu"
+  mlm_initializer_range: float = 0.02