Merge pull request #1 from tensorflow/master

new pull

Merge pull request #1 from tensorflow/master
new pull
f16a7b5b · vedanshu · GitHub · 8e9296ff · 8f58f396 · 8e9296ff
Unverified Commit f16a7b5b authored May 04, 2021 by vedanshu Committed by GitHub May 04, 2021
20 changed files
--- a/official/benchmark/transformer_benchmark.py
+++ b/official/benchmark/transformer_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes Transformer w/Keras benchmark and accuracy tests."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
-
-from absl import flags
-import tensorflow as tf
-from official.benchmark import benchmark_wrappers
-from official.benchmark import owner_utils
-from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
-from official.nlp.transformer import misc
-from official.nlp.transformer import transformer_main as transformer_main
-from official.utils.flags import core as flags_core
-
-TRANSFORMER_EN2DE_DATA_DIR_NAME = 'wmt32k-en2de-official'
-EN2DE_2014_BLEU_DATA_DIR_NAME = 'newstest2014'
-FLAGS = flags.FLAGS
-TMP_DIR = os.getenv('TMPDIR')
-
-
-class TransformerBenchmark(PerfZeroBenchmark):
-  """Methods common to executing transformer w/keras tests.
-
-     Code under test for the Transformer Keras models report the same data and
-     require the same FLAG setup.
-  """
-
-  def __init__(self, output_dir=None, default_flags=None, root_data_dir=None,
-               flag_methods=None, tpu=None):
-    root_data_dir = root_data_dir if root_data_dir else ''
-
-    self.train_data_dir = os.path.join(root_data_dir,
-                                       TRANSFORMER_EN2DE_DATA_DIR_NAME)
-
-    self.vocab_file = os.path.join(root_data_dir,
-                                   TRANSFORMER_EN2DE_DATA_DIR_NAME,
-                                   'vocab.ende.32768')
-
-    self.bleu_source = os.path.join(root_data_dir,
-                                    EN2DE_2014_BLEU_DATA_DIR_NAME,
-                                    'newstest2014.en')
-
-    self.bleu_ref = os.path.join(root_data_dir,
-                                 EN2DE_2014_BLEU_DATA_DIR_NAME,
-                                 'newstest2014.de')
-
-    if default_flags is None:
-      default_flags = {}
-    default_flags['data_dir'] = self.train_data_dir
-    default_flags['vocab_file'] = self.vocab_file
-
-    super(TransformerBenchmark, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        flag_methods=flag_methods,
-        tpu=tpu)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                bleu_max=None,
-                                bleu_min=None,
-                                log_steps=None,
-                                total_batch_size=None,
-                                warmup=1):
-    """Report benchmark results by writing to local protobuf file.
-
-    Args:
-      bleu_max: highest passing level for bleu score.
-      bleu_min: lowest passing level for bleu score.
-      log_steps: How often the log was created for stats['step_timestamp_log'].
-      total_batch_size: Global batch-size.
-      warmup: number of entries in stats['step_timestamp_log'] to ignore.
-    """
-    start_time_sec = time.time()
-    task = transformer_main.TransformerTask(FLAGS)
-    stats = task.train()
-    wall_time_sec = time.time() - start_time_sec
-
-    metrics = []
-    if 'bleu_uncased' in stats:
-      if 'bleu_uncased_history' in stats:
-        bleu_uncased_best = max(stats['bleu_uncased_history'],
-                                key=lambda x: x[1])
-        metrics.append({'name': 'bleu_uncased',
-                        'value': bleu_uncased_best[1],
-                        'min_value': bleu_min,
-                        'max_value': bleu_max})
-        metrics.append({'name': 'bleu_best_score_iteration',
-                        'value': bleu_uncased_best[0]})
-        metrics.append({'name': 'bleu_uncased_last',
-                        'value': stats['bleu_uncased']})
-      else:
-        metrics.append({'name': 'bleu_uncased',
-                        'value': stats['bleu_uncased'],
-                        'min_value': bleu_min,
-                        'max_value': bleu_max})
-
-    if (warmup and 'step_timestamp_log' in stats and
-        len(stats['step_timestamp_log']) > warmup + 1):
-      # first entry in the time_log is start of step 1. The rest of the
-      # entries are the end of each step recorded
-      time_log = stats['step_timestamp_log']
-      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
-      num_examples = (
-          total_batch_size * log_steps * (len(time_log) - warmup - 1))
-      examples_per_sec = num_examples / elapsed
-      metrics.append({'name': 'exp_per_second',
-                      'value': examples_per_sec})
-
-    if 'avg_exp_per_second' in stats:
-      metrics.append({'name': 'avg_exp_per_second',
-                      'value': stats['avg_exp_per_second']})
-
-    if 'step_timestamp_log' in stats:
-      time_log = stats['step_timestamp_log']
-      metrics.append({'name': 'startup_time',
-                      'value': time_log[0].timestamp - start_time_sec})
-
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics,
-                          extras={'flags': flags_str})
-
-
-class TransformerBaseKerasAccuracy(TransformerBenchmark):
-  """Benchmark accuracy tests for Transformer Base model w/ Keras."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Benchmark accuracy tests for Transformer Base model w/ Keras.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-    flag_methods = [misc.define_transformer_flags]
-
-    super(TransformerBaseKerasAccuracy, self).__init__(
-        output_dir=output_dir, root_data_dir=root_data_dir,
-        flag_methods=flag_methods)
-
-  def benchmark_1_gpu(self):
-    """Benchmark 1 gpu.
-
-      The paper uses 8 GPUs and a much larger effective batch size, this is will
-      not converge to the 27.3 BLEU (uncased) SOTA.
-    """
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'base'
-    FLAGS.batch_size = 2048
-    FLAGS.train_steps = 1000
-    FLAGS.steps_between_evals = 500
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
-    # These bleu scores are based on test runs after at this limited
-    # number of steps and batch size after verifying SOTA at 8xV100s.
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=25.3,
-                                   bleu_max=26)
-
-  def benchmark_1_gpu_static_batch(self):
-    """Benchmark 1 gpu with static_batch.
-
-      The paper uses 8 GPUs and a much larger effective batch size, this is will
-      not converge to the 27.3 BLEU (uncased) SOTA.
-    """
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'base'
-    FLAGS.batch_size = 4096
-    FLAGS.train_steps = 100000
-    FLAGS.steps_between_evals = 5000
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
-    # These bleu scores are based on test runs after at this limited
-    # number of steps and batch size after verifying SOTA at 8xV100s.
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=25.3,
-                                   bleu_max=26)
-
-  def benchmark_8_gpu(self):
-    """Benchmark 8 gpu.
-
-      Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'base'
-    FLAGS.batch_size = 4096*8
-    FLAGS.train_steps = 100000
-    FLAGS.steps_between_evals = 20000
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=27,
-                                   bleu_max=28)
-
-  def benchmark_8_gpu_static_batch(self):
-    """Benchmark 8 gpu.
-
-      Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'base'
-    FLAGS.batch_size = 4096*8
-    FLAGS.train_steps = 100000
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.steps_between_evals = 5000
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=27,
-                                   bleu_max=28)
-
-
-class TransformerBigKerasAccuracy(TransformerBenchmark):
-  """Benchmark accuracy tests for Transformer Big model w/ Keras."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Benchmark accuracy tests for Transformer Big model w/ Keras.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-    flag_methods = [misc.define_transformer_flags]
-
-    super(TransformerBigKerasAccuracy, self).__init__(
-        output_dir=output_dir, root_data_dir=root_data_dir,
-        flag_methods=flag_methods)
-
-  def benchmark_8_gpu(self):
-    """Benchmark 8 gpu.
-
-    Over 6 runs with eval every 20K steps the average highest value was 28.195
-    (bleu uncased). 28.424 was the highest and 27.96 the lowest. The values are
-    the highest value seen during a run and occurred at a median of iteration 9.
-    Iterations are not epochs, an iteration is a number of steps between evals.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'big'
-    FLAGS.batch_size = 3072*8
-    FLAGS.train_steps = 20000 * 12
-    FLAGS.steps_between_evals = 20000
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=27.9,
-                                   bleu_max=29.2)
-
-  def benchmark_8_gpu_static_batch(self):
-    """Benchmark 8 gpu.
-
-    Should converge to 28.4 BLEU (uncased). This has not be verified yet."
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'big'
-    FLAGS.batch_size = 3072*8
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.train_steps = 20000 * 12
-    FLAGS.steps_between_evals = 20000
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=28,
-                                   bleu_max=29.2)
-
-  def benchmark_8_gpu_fp16(self):
-    """Benchmark 8 gpu with dynamic batch and fp16.
-
-    Over 6 runs with eval every 20K steps the average highest value was 28.247
-    (bleu uncased). 28.424 was the highest and 28.09 the lowest. The values are
-    the highest value seen during a run and occurred at a median of iteration
-    11. While this could be interpreted as worse than FP32, if looking at the
-    first iteration at which 28 is passed FP16 performs equal and possibly
-    better. Although not part of the initial test runs, the highest value
-    recorded with the arguments below was 28.9 at iteration 12. Iterations are
-    not epochs, an iteration is a number of steps between evals.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'big'
-    FLAGS.batch_size = 3072*8
-    FLAGS.train_steps = 20000 * 12
-    FLAGS.steps_between_evals = 20000
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=28,
-                                   bleu_max=29.2)
-
-  def benchmark_8_gpu_fp16_amp(self):
-    """Benchmark 8 gpu with dynamic batch and fp16 with automatic mixed precision.
-
-      Should converge to 28.4 BLEU (uncased). This has not be verified yet."
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'big'
-    FLAGS.batch_size = 3072*8
-    FLAGS.train_steps = 20000 * 12
-    FLAGS.steps_between_evals = 20000
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_amp')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=28,
-                                   bleu_max=29)
-
-  def benchmark_8_gpu_static_batch_fp16(self):
-    """Benchmark 8 gpu with static batch and fp16.
-
-      Should converge to 28.4 BLEU (uncased). This has not be verified yet."
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'big'
-    FLAGS.batch_size = 3072*8
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.train_steps = 400000
-    FLAGS.steps_between_evals = 20000
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch_fp16')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=28,
-                                   bleu_max=29.2)
-
-  def benchmark_xla_8_gpu_static_batch_fp16(self):
-    """Benchmark 8 gpu with static batch, XLA, and FP16.
-
-      Should converge to 28.4 BLEU (uncased). This has not be verified yet."
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_xla = True
-    FLAGS.data_dir = self.train_data_dir
-    FLAGS.vocab_file = self.vocab_file
-    # Sets values directly to avoid validation check.
-    FLAGS['bleu_source'].value = self.bleu_source
-    FLAGS['bleu_ref'].value = self.bleu_ref
-    FLAGS.param_set = 'big'
-    FLAGS.batch_size = 3072*8
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.train_steps = 400000
-    FLAGS.steps_between_evals = 20000
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_xla_8_gpu_static_batch_fp16')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=28,
-                                   bleu_max=29.2)
-
-
-class TransformerKerasBenchmark(TransformerBenchmark):
-  """Benchmarks for Transformer (Base and Big) using Keras."""
-
-  def __init__(self, output_dir=None, default_flags=None,
-               root_data_dir=None, batch_per_gpu=4096, tpu=None):
-    """Initialize.
-
-    Args:
-      output_dir: Based directory for saving artifacts, e.g. checkpoints.
-      default_flags: default flags to use for all tests.
-      root_data_dir: root directory for data, e.g. training.
-      batch_per_gpu: batch size to use per gpu.
-      tpu: Target TPU to use.
-    """
-    flag_methods = [misc.define_transformer_flags]
-    self.batch_per_gpu = batch_per_gpu
-
-    super(TransformerKerasBenchmark, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        root_data_dir=root_data_dir,
-        flag_methods=flag_methods,
-        tpu=tpu)
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Benchmark 1 gpu without distribution strategy."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_1_gpu_no_dist_strat_static_batch(self):
-    """Benchmark 1 gpu without distribution strategy with static batch."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_ds_sb')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_1_gpu(self):
-    """Benchmark 1 gpu."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_1_gpu_fp16(self):
-    """Benchmark 1 gpu FP16."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
-    FLAGS.dtype = 'fp16'
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_1_gpu(self):
-    """Benchmark 1 gpu w/xla."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_1_gpu_fp16(self):
-    """Benchmark 1 gpu w/xla and FP16."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
-    FLAGS.enable_xla = True
-    FLAGS.dtype = 'fp16'
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_1_gpu_static_batch(self):
-    """Benchmark 1 gpu with static batch."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_1_gpu_static_batch(self):
-    """Benchmark 1 gpu with static batch w/xla."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_static_batch')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_1_gpu_static_batch_fp16(self):
-    """Benchmark 1 gpu with static batch FP16."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_static_batch_fp16')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.dtype = 'fp16'
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_1_gpu_static_batch_fp16(self):
-    """Benchmark 1 gpu with static batch w/xla and FP16."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = self.batch_per_gpu
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_xla_1_gpu_static_batch_fp16')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    FLAGS.enable_xla = True
-    FLAGS.dtype = 'fp16'
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_8_gpu(self):
-    """Benchmark 8 gpu."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_8_gpu_fp16(self):
-    """Benchmark 8 gpu FP16."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_8_gpu(self):
-    """Benchmark 8 gpu w/xla."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.enable_xla = True
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_8_gpu_fp16(self):
-    """Benchmark 8 gpu w/xla and FP16."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.enable_xla = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_8_gpu_static_batch(self):
-    """Benchmark 8 gpu with static batch."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_8_gpu_static_batch_fp16(self):
-    """Benchmark 8 gpu with static batch FP16."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_8_gpu_static_batch_fp16')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_8_gpu_static_batch(self):
-    """Benchmark 8 gpu with static batch w/xla."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.enable_xla = True
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_static_batch')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-  def benchmark_xla_8_gpu_static_batch_fp16(self):
-    """Benchmark 8 gpu with static batch w/xla and FP16."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.enable_xla = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = self.batch_per_gpu * 8
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_xla_8_gpu_static_batch_fp16')
-    FLAGS.static_batch = True
-    FLAGS.max_length = 64
-    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
-                                   log_steps=FLAGS.log_steps)
-
-
-class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
-  """Transformer based version real data benchmark tests."""
-
-  def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR, **kwargs):
-    def_flags = {}
-    def_flags['param_set'] = 'base'
-    def_flags['train_steps'] = 50
-    def_flags['log_steps'] = 10
-
-    super(TransformerBaseKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir, default_flags=def_flags,
-        root_data_dir=root_data_dir, batch_per_gpu=4096)
-
-
-class TransformerBigKerasBenchmarkReal(TransformerKerasBenchmark):
-  """Transformer based version real data benchmark tests."""
-
-  def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR,
-               tpu=None, **kwargs):
-    def_flags = {}
-    def_flags['param_set'] = 'big'
-    def_flags['train_steps'] = 50
-    def_flags['log_steps'] = 10
-
-    super(TransformerBigKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir, default_flags=def_flags,
-        root_data_dir=root_data_dir, batch_per_gpu=3072,
-        tpu=tpu)
-
-  def benchmark_2x2_tpu(self):
-    """Port of former snaggletooth transformer_big model on 2x2."""
-    self._setup()
-    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu')
-    FLAGS.train_steps = 300
-    FLAGS.log_steps = 150
-    FLAGS.steps_between_evals = 150
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.static_batch = True
-    FLAGS.use_ctl = True
-    FLAGS.batch_size = 6144
-    FLAGS.max_length = 64
-    FLAGS.decode_batch_size = 32
-    FLAGS.decode_max_length = 97
-    FLAGS.padded_decode = True
-    FLAGS.enable_checkpointing = False
-
-    self._run_and_report_benchmark(
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-  def benchmark_4x4_tpu(self):
-    """Port of former GCP transformer_big model on 4x4."""
-    self._setup()
-    FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu')
-    FLAGS.train_steps = 300
-    FLAGS.log_steps = 150
-    FLAGS.steps_between_evals = 150
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.static_batch = True
-    FLAGS.use_ctl = True
-    FLAGS.batch_size = 24576
-    FLAGS.max_length = 64
-    FLAGS.decode_batch_size = 32
-    FLAGS.decode_max_length = 97
-    FLAGS.padded_decode = True
-    FLAGS.enable_checkpointing = False
-
-    self._run_and_report_benchmark(
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-  @owner_utils.Owner('tf-graph-compiler')
-  def benchmark_4x4_tpu_mlir(self):
-    """Run transformer_big model on 4x4 with the MLIR Bridge enabled."""
-    self._setup()
-    FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu')
-    FLAGS.train_steps = 300
-    FLAGS.log_steps = 150
-    FLAGS.steps_between_evals = 150
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.static_batch = True
-    FLAGS.use_ctl = True
-    FLAGS.batch_size = 24576
-    FLAGS.max_length = 64
-    FLAGS.decode_batch_size = 32
-    FLAGS.decode_max_length = 97
-    FLAGS.padded_decode = True
-    FLAGS.enable_checkpointing = False
-    tf.config.experimental.enable_mlir_bridge()
-
-    self._run_and_report_benchmark(
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/unet3d_benchmark.py
+++ b/official/benchmark/unet3d_benchmark.py
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes benchmark testing for 3D Unet model."""
-# pylint: disable=line-too-long
-from __future__ import print_function
-
-import functools
-import os
-import time
-from typing import Optional
-from absl import flags
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.benchmark import benchmark_wrappers
-from official.benchmark import keras_benchmark
-from official.benchmark import owner_utils
-from official.vision.segmentation import unet_main as unet_training_lib
-from official.vision.segmentation import unet_model as unet_model_lib
-
-UNET3D_MIN_ACCURACY = 0.90
-UNET3D_MAX_ACCURACY = 0.98
-UNET_TRAINING_FILES = 'gs://mlcompass-data/unet3d/train_data/*'
-UNET_EVAL_FILES = 'gs://mlcompass-data/unet3d/eval_data/*'
-UNET_MODEL_CONFIG_FILE = 'gs://mlcompass-data/unet3d/config/unet_config.yaml'
-
-FLAGS = flags.FLAGS
-
-
-class Unet3DAccuracyBenchmark(keras_benchmark.KerasBenchmark):
-  """Benchmark accuracy tests for UNet3D model in Keras."""
-
-  def __init__(self,
-               output_dir: Optional[str] = None,
-               root_data_dir: Optional[str] = None,
-               **kwargs):
-    """A benchmark class.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-        constructor forward compatible in case PerfZero provides more named
-        arguments before updating the constructor.
-    """
-
-    flag_methods = [unet_training_lib.define_unet3d_flags]
-
-    # UNet3D model in Keras."""
-    self.training_file_pattern = UNET_TRAINING_FILES
-    self.eval_file_pattern = UNET_EVAL_FILES
-
-    # TODO(hongjunchoi): Create and use shared config file instead.
-    self.config_file = UNET_MODEL_CONFIG_FILE
-    super(Unet3DAccuracyBenchmark, self).__init__(
-        output_dir=output_dir, flag_methods=flag_methods)
-
-  def _set_benchmark_parameters(self, experiment_name):
-    """Overrides training parameters for benchmark tests."""
-    FLAGS.model_dir = self._get_model_dir(experiment_name)
-    FLAGS.mode = 'train'
-    FLAGS.training_file_pattern = self.training_file_pattern
-    FLAGS.eval_file_pattern = self.eval_file_pattern
-    FLAGS.config_file = self.config_file
-    FLAGS.lr_init_value = 0.00005
-    FLAGS.lr_decay_rate = 0.5
-    FLAGS.epochs = 3
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                experiment_name: str,
-                                min_accuracy: float = UNET3D_MIN_ACCURACY,
-                                max_accuracy: float = UNET3D_MAX_ACCURACY,
-                                distribution_strategy: str = 'tpu',
-                                epochs: int = 10,
-                                steps: int = 0,
-                                epochs_between_evals: int = 1,
-                                dtype: str = 'float32',
-                                enable_xla: bool = False,
-                                run_eagerly: bool = False):
-    """Runs and reports the benchmark given the provided configuration."""
-    params = unet_training_lib.extract_params(FLAGS)
-    strategy = unet_training_lib.create_distribution_strategy(params)
-
-    input_dtype = params.dtype
-    if input_dtype == 'float16' or input_dtype == 'bfloat16':
-      policy = tf.keras.mixed_precision.experimental.Policy(
-          'mixed_bfloat16' if input_dtype == 'bfloat16' else 'mixed_float16')
-      tf.keras.mixed_precision.experimental.set_policy(policy)
-
-    stats = {}
-    start_time_sec = time.time()
-    with strategy.scope():
-      unet_model = unet_model_lib.build_unet_model(params)
-      history = unet_training_lib.train(
-          params, strategy, unet_model,
-          functools.partial(unet_training_lib.get_train_dataset, params),
-          functools.partial(unet_training_lib.get_eval_dataset, params))
-
-      stats['accuracy_top_1'] = history.history['val_metric_accuracy'][-1]
-      stats['training_accuracy_top_1'] = history.history['metric_accuracy'][-1]
-    wall_time_sec = time.time() - start_time_sec
-
-    super(Unet3DAccuracyBenchmark, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        top_1_min=min_accuracy,
-        top_1_max=max_accuracy,
-        total_batch_size=params.train_batch_size)
-
-  def _get_model_dir(self, folder_name):
-    return os.path.join(self.output_dir, folder_name)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_4x4_tpu_bf16(self):
-    """Test Keras model with 4x4 TPU, fp16."""
-    experiment_name = 'benchmark_4x4_tpu_fp16'
-    self._setup()
-    self._set_benchmark_parameters(experiment_name)
-    self._run_and_report_benchmark(
-        experiment_name=experiment_name,
-        dtype='bfloat16',
-        distribution_strategy='tpu')
-
-  @owner_utils.Owner('tf-graph-compiler')
-  def benchmark_4x4_tpu_bf16_mlir(self):
-    """Test Keras model with 4x4 TPU, fp16 and MLIR enabled."""
-    experiment_name = 'benchmark_4x4_tpu_fp16_mlir'
-    tf.config.experimental.enable_mlir_bridge()
-    self._setup()
-    self._set_benchmark_parameters(experiment_name)
-    self._run_and_report_benchmark(
-        experiment_name=experiment_name,
-        dtype='bfloat16',
-        distribution_strategy='tpu')
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/xlnet_benchmark.py
+++ b/official/benchmark/xlnet_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes XLNet benchmarks and accuracy tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-import os
-import time
-
-# pylint: disable=g-bad-import-order
-from absl import flags
-from absl.testing import flagsaver
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.benchmark import bert_benchmark_utils as benchmark_utils
-from official.benchmark import owner_utils
-from official.nlp.xlnet import run_classifier
-from official.nlp.xlnet import run_squad
-from official.benchmark import benchmark_wrappers
-
-
-# pylint: disable=line-too-long
-PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/xlnet/large/xlnet_model-1'
-CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.train.tf_record'
-CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.dev.eval.tf_record'
-SQUAD_DATA_PATH = 'gs://tf-perfzero-data/xlnet/squadv2_cased/'
-# pylint: enable=line-too-long
-
-FLAGS = flags.FLAGS
-
-
-class XLNetBenchmarkBase(benchmark_utils.BertBenchmarkBase):
-  """Base class to hold methods common to test classes in the module."""
-
-  def __init__(self, output_dir=None, tpu=None):
-    super(XLNetBenchmarkBase, self).__init__(output_dir=output_dir, tpu=tpu)
-    self.num_epochs = None
-    self.num_steps_per_epoch = None
-
-  @flagsaver.flagsaver
-  def _run_xlnet_classifier(self):
-    """Starts XLNet classification task."""
-    run_classifier.main(unused_argv=None)
-
-  @flagsaver.flagsaver
-  def _run_xlnet_squad(self):
-    """Starts XLNet classification task."""
-    run_squad.main(unused_argv=None)
-
-
-class XLNetClassifyAccuracy(XLNetBenchmarkBase):
-  """Short accuracy test for XLNet classifier model.
-
-  Tests XLNet classification task model accuracy. The naming
-  convention of below test cases follow
-  `benchmark_(number of gpus)_gpu_(dataset type)` format.
-  """
-
-  def __init__(self, output_dir=None, tpu=None, **kwargs):
-    self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
-    self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
-    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
-
-    super(XLNetClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                training_summary_path,
-                                min_accuracy=0.95,
-                                max_accuracy=0.97):
-    """Starts XLNet accuracy benchmark test."""
-
-    start_time_sec = time.time()
-    self._run_xlnet_classifier()
-    wall_time_sec = time.time() - start_time_sec
-
-    with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
-      summary = json.loads(reader.read().decode('utf-8'))
-
-    super(XLNetClassifyAccuracy, self)._report_benchmark(
-        stats=summary,
-        wall_time_sec=wall_time_sec,
-        min_accuracy=min_accuracy,
-        max_accuracy=max_accuracy)
-
-  def _setup(self):
-    super(XLNetClassifyAccuracy, self)._setup()
-    FLAGS.test_data_size = 25024
-    FLAGS.train_batch_size = 16
-    FLAGS.seq_len = 512
-    FLAGS.mem_len = 0
-    FLAGS.n_layer = 24
-    FLAGS.d_model = 1024
-    FLAGS.d_embed = 1024
-    FLAGS.n_head = 16
-    FLAGS.d_head = 64
-    FLAGS.d_inner = 4096
-    FLAGS.untie_r = True
-    FLAGS.n_class = 2
-    FLAGS.ff_activation = 'gelu'
-    FLAGS.strategy_type = 'mirror'
-    FLAGS.learning_rate = 2e-5
-    FLAGS.train_steps = 4000
-    FLAGS.warmup_steps = 500
-    FLAGS.iterations = 200
-    FLAGS.bi_data = False
-    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
-    FLAGS.train_tfrecord_path = self.train_data_path
-    FLAGS.test_tfrecord_path = self.eval_data_path
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_8_gpu_imdb(self):
-    """Run XLNet model accuracy test with 8 GPUs."""
-    self._setup()
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_imdb')
-    # Sets timer_callback to None as we do not use it now.
-    self.timer_callback = None
-
-    summary_path = os.path.join(FLAGS.model_dir,
-                                'summaries/training_summary.txt')
-    self._run_and_report_benchmark(summary_path)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_2x2_tpu_imdb(self):
-    """Run XLNet model accuracy test on 2x2 tpu."""
-    self._setup()
-    FLAGS.strategy_type = 'tpu'
-    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_imdb')
-    # Sets timer_callback to None as we do not use it now.
-    self.timer_callback = None
-
-    summary_path = os.path.join(FLAGS.model_dir,
-                                'summaries/training_summary.txt')
-    self._run_and_report_benchmark(summary_path)
-
-
-class XLNetSquadAccuracy(XLNetBenchmarkBase):
-  """Short accuracy test for XLNet squad model.
-
-  Tests XLNet squad task model accuracy. The naming
-  convention of below test cases follow
-  `benchmark_(number of gpus)_gpu_(dataset type)` format.
-  """
-
-  def __init__(self, output_dir=None, tpu=None, **kwargs):
-    self.train_data_path = SQUAD_DATA_PATH
-    self.predict_file = os.path.join(SQUAD_DATA_PATH, "dev-v2.0.json")
-    self.test_data_path = os.path.join(SQUAD_DATA_PATH, "12048.eval.tf_record")
-    self.spiece_model_file = os.path.join(SQUAD_DATA_PATH, "spiece.cased.model")
-    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
-
-    super(XLNetSquadAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                training_summary_path,
-                                min_accuracy=87.0,
-                                max_accuracy=89.0):
-    """Starts XLNet accuracy benchmark test."""
-
-    start_time_sec = time.time()
-    self._run_xlnet_squad()
-    wall_time_sec = time.time() - start_time_sec
-
-    with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
-      summary = json.loads(reader.read().decode('utf-8'))
-
-    super(XLNetSquadAccuracy, self)._report_benchmark(
-        stats=summary,
-        wall_time_sec=wall_time_sec,
-        min_accuracy=min_accuracy,
-        max_accuracy=max_accuracy)
-
-  def _setup(self):
-    super(XLNetSquadAccuracy, self)._setup()
-    FLAGS.train_batch_size = 16
-    FLAGS.seq_len = 512
-    FLAGS.mem_len = 0
-    FLAGS.n_layer = 24
-    FLAGS.d_model = 1024
-    FLAGS.d_embed = 1024
-    FLAGS.n_head = 16
-    FLAGS.d_head = 64
-    FLAGS.d_inner = 4096
-    FLAGS.untie_r = True
-    FLAGS.ff_activation = 'gelu'
-    FLAGS.strategy_type = 'mirror'
-    FLAGS.learning_rate = 3e-5
-    FLAGS.train_steps = 8000
-    FLAGS.warmup_steps = 1000
-    FLAGS.iterations = 1000
-    FLAGS.bi_data = False
-    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
-    FLAGS.train_tfrecord_path = self.train_data_path
-    FLAGS.test_tfrecord_path = self.test_data_path
-    FLAGS.spiece_model_file = self.spiece_model_file
-    FLAGS.predict_file = self.predict_file
-    FLAGS.adam_epsilon = 1e-6
-    FLAGS.lr_layer_decay_rate = 0.75
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_8_gpu_squadv2(self):
-    """Run XLNet model squad v2 accuracy test with 8 GPUs."""
-    self._setup()
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squadv2')
-    FLAGS.predict_dir = FLAGS.model_dir
-    # Sets timer_callback to None as we do not use it now.
-    self.timer_callback = None
-
-    summary_path = os.path.join(FLAGS.model_dir,
-                                'summaries/training_summary.txt')
-    self._run_and_report_benchmark(summary_path)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_2x2_tpu_squadv2(self):
-    """Run XLNet model squad v2 accuracy test on 2x2 tpu."""
-    self._setup()
-    FLAGS.strategy_type = 'tpu'
-    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_squadv2')
-    FLAGS.predict_dir = FLAGS.model_dir
-    # Sets timer_callback to None as we do not use it now.
-    self.timer_callback = None
-
-    summary_path = os.path.join(FLAGS.model_dir,
-                                'summaries/training_summary.txt')
-    self._run_and_report_benchmark(summary_path)
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/colab/decoding_api_in_tf_nlp.ipynb
+++ b/official/colab/decoding_api_in_tf_nlp.ipynb
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vXLA5InzXydn"
+      },
+      "source": [
+        "##### Copyright 2021 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "RuRlpLL-X0R_"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fsACVQpVSifi"
+      },
+      "source": [
+        "### Install the TensorFlow Model Garden pip package\n",
+        "\n",
+        "*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n",
+        "which is the nightly Model Garden package created daily automatically.\n",
+        "*  pip will install all models and dependencies automatically."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hYEwGTeCXnnX"
+      },
+      "source": [
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "2j-xhrsVQOQT"
+      },
+      "outputs": [],
+      "source": [
+        "pip install  tf-models-nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "BjP7zwxmskpY"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "\n",
+        "import numpy as np\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "from official import nlp\n",
+        "from official.nlp.modeling.ops import sampling_module\n",
+        "from official.nlp.modeling.ops import beam_search"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0AWgyo-IQ5sP"
+      },
+      "source": [
+        "# Decoding API\n",
+        "This API provides an interface to experiment with different decoding strategies used for auto-regressive models.\n",
+        "\n",
+        "1. The following sampling strategies are provided in sampling_module.py, which inherits from the base Decoding class:\n",
+        "  *   [top_p](https://arxiv.org/abs/1904.09751) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L65) \n",
+        "\n",
+        "      This implementation chooses the most probable logits with cumulative probabilities up to top_p.\n",
+        "\n",
+        "  *   [top_k](https://arxiv.org/pdf/1805.04833.pdf) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L48)\n",
+        "\n",
+        "      At each timestep, this implementation samples from the top-k logits based on their probability distribution.\n",
+        "\n",
+        "  *   Greedy : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L26)\n",
+        "\n",
+        "      This implementation returns the top logits based on probabilities.\n",
+        "\n",
+        "2. Beam search is provided in beam_search.py. [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search.py)\n",
+        "\n",
+        "      This implementation reduces the risk of missing hidden high-probability logits by keeping the most likely num_beams of logits at each time step and eventually choosing the logits that have the overall highest probability."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "MfOj7oaBRQnS"
+      },
+      "source": [
+        "## Initialize Sampling Module in TF-NLP.\n",
+        "\n",
+        "\n",
+        "\u003e **symbols_to_logits_fn** : This is a closure implemented by the users of the API. The input to this closure will be  \n",
+        "```\n",
+        "Args:\n",
+        "  1] ids [batch_size, .. (index + 1 or 1 if padded_decode is True)],\n",
+        "  2] index [scalar] : current decoded step,\n",
+        "  3] cache [nested dictionary of tensors].\n",
+        "Returns:\n",
+        "  1] tensor for next-step logits [batch_size, vocab]\n",
+        "  2] the updated_cache [nested dictionary of tensors].\n",
+        "```\n",
+        "This closure calls the model to predict the logits for the 'index+1' step. The cache is used for faster decoding.\n",
+        "Here is a [reference](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search_test.py#L88) implementation for the above closure.\n",
+        "\n",
+        "\n",
+        "\u003e **length_normalization_fn** : Closure for returning length normalization parameter.\n",
+        "```\n",
+        "Args: \n",
+        "  1] length : scalar for decoded step index.\n",
+        "  2] dtype : data-type of output tensor\n",
+        "Returns:\n",
+        "  1] value of length normalization factor.\n",
+        "Example :\n",
+        "  def _length_norm(length, dtype):\n",
+        "    return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)\n",
+        "```\n",
+        "\n",
+        "\u003e **vocab_size** : Output vocabulary size.\n",
+        "\n",
+        "\u003e **max_decode_length** : Scalar for total number of decoding steps.\n",
+        "\n",
+        "\u003e **eos_id** : Decoding will stop if all output decoded ids in the batch have this ID.\n",
+        "\n",
+        "\u003e **padded_decode** : Set this to True if running on TPU. Tensors are padded to max_decode_length if this is True.\n",
+        "\n",
+        "\u003e **top_k** : top_k is enabled if this value is \u003e 1.\n",
+        "\n",
+        "\u003e **top_p** : top_p is enabled if this value is \u003e 0 and \u003c 1.0\n",
+        "\n",
+        "\u003e **sample_temperature** : This is used to re-estimate the softmax output. Temperature skews the distribution towards high-probability tokens and lowers the mass in the tail of the distribution. The value has to be positive. A low temperature is equivalent to greedy decoding and makes the distribution sharper, while a high temperature makes it flatter.\n",
+        "\n",
+        "\u003e **enable_greedy** : By default, this is true and greedy decoding is enabled.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "lV1RRp6ihnGX"
+      },
+      "source": [
+        "# Initialize the Model Hyper-parameters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "eTsGp2gaKLdE"
+      },
+      "outputs": [],
+      "source": [
+        "params = {}\n",
+        "params['num_heads'] = 2\n",
+        "params['num_layers'] = 2\n",
+        "params['batch_size'] = 2\n",
+        "params['n_dims'] = 256\n",
+        "params['max_decode_length'] = 4"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "UGvmd0_dRFYI"
+      },
+      "source": [
+        "## What is a Cache?\n",
+        "In auto-regressive architectures like Transformer-based [Encoder-Decoder](https://arxiv.org/abs/1706.03762) models, \n",
+        "a cache is used for fast sequential decoding.\n",
+        "It is a nested dictionary storing pre-computed hidden states (keys and values in the self-attention blocks and in the cross-attention blocks) for every layer.\n",
+        "\n",
+        "```\n",
+        "{\n",
+        "    'layer_%d' % layer: {\n",
+        "        'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
+        "        'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
+        "        } for layer in range(params['num_layers']),\n",
+        "    'model_specific_item' : Model specific tensor shape,\n",
+        "}\n",
+        "\n",
+        "```"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "CYXkoplAij01"
+      },
+      "source": [
+        "# Initialize cache. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "D6kfZOOKgkm1"
+      },
+      "outputs": [],
+      "source": [
+        "cache = {\n",
+        "    'layer_%d' % layer: {\n",
+        "        'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
+        "        'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
+        "        } for layer in range(params['num_layers'])\n",
+        "    }\n",
+        "print(\"cache key shape for layer 1 :\", cache['layer_1']['k'].shape)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "nNY3Xn8SiblP"
+      },
+      "source": [
+        "# Define closure for length normalization. **optional.**\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "T92ccAzlnGqh"
+      },
+      "outputs": [],
+      "source": [
+        "def length_norm(length, dtype):\n",
+        "  \"\"\"Return length normalization factor.\"\"\"\n",
+        "  return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "syl7I5nURPgW"
+      },
+      "source": [
+        "# Create model_fn\n",
+        "  In practice, this will be replaced by an actual model implementation such as [here](https://github.com/tensorflow/models/blob/master/official/nlp/transformer/transformer.py#L236)\n",
+        "```\n",
+        "Args:\n",
+        "i : Step that is being decoded.\n",
+        "Returns:\n",
+        "  probabilities of size [batch_size, vocab_size]\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "AhzSkRisRdB6"
+      },
+      "outputs": [],
+      "source": [
+        "probabilities = tf.constant([[[0.3, 0.4, 0.3], [0.3, 0.3, 0.4],\n",
+        "                              [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],\n",
+        "                            [[0.2, 0.5, 0.3], [0.2, 0.7, 0.1],\n",
+        "                              [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]])\n",
+        "def model_fn(i):\n",
+        "  return probabilities[:, i, :]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "DBMUkaVmVZBg"
+      },
+      "source": [
+        "# Initialize symbols_to_logits_fn\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "FAJ4CpbfVdjr"
+      },
+      "outputs": [],
+      "source": [
+        "def _symbols_to_logits_fn():\n",
+        "  \"\"\"Calculates logits of the next tokens.\"\"\"\n",
+        "  def symbols_to_logits_fn(ids, i, temp_cache):\n",
+        "    del ids\n",
+        "    logits = tf.cast(tf.math.log(model_fn(i)), tf.float32)\n",
+        "    return logits, temp_cache\n",
+        "  return symbols_to_logits_fn"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "R_tV3jyWVL47"
+      },
+      "source": [
+        "# Greedy \n",
+        "Greedy decoding selects the token id with the highest probability as its next id: $id_t = argmax_{w}P(w | id_{1:t-1})$ at each timestep $t$. The following code cell demonstrates greedy decoding. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aGt9idSkVQEJ"
+      },
+      "outputs": [],
+      "source": [
+        "greedy_obj = sampling_module.SamplingModule(\n",
+        "    length_normalization_fn=None,\n",
+        "    dtype=tf.float32,\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    vocab_size=3,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    padded_decode=False)\n",
+        "ids, _ = greedy_obj.generate(\n",
+        "    initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
+        "print(\"Greedy Decoded Ids:\", ids)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "s4pTTsQXVz5O"
+      },
+      "source": [
+        "# top_k sampling\n",
+        "In *Top-K* sampling, the *K* most likely next token ids are filtered and the probability mass is redistributed among only those *K* ids. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "pCLWIn6GV5_G"
+      },
+      "outputs": [],
+      "source": [
+        "top_k_obj = sampling_module.SamplingModule(\n",
+        "    length_normalization_fn=length_norm,\n",
+        "    dtype=tf.float32,\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    vocab_size=3,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    sample_temperature=tf.constant(1.0),\n",
+        "    top_k=tf.constant(3),\n",
+        "    padded_decode=False,\n",
+        "    enable_greedy=False)\n",
+        "ids, _ = top_k_obj.generate(\n",
+        "    initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
+        "print(\"top-k sampled Ids:\", ids)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Jp3G-eE_WI4Y"
+      },
+      "source": [
+        "# top_p sampling\n",
+        "Instead of sampling only from the most likely *K* token ids, *Top-p* sampling chooses from the smallest possible set of ids whose cumulative probability exceeds the probability *p*."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rEGdIWcuWILO"
+      },
+      "outputs": [],
+      "source": [
+        "top_p_obj = sampling_module.SamplingModule(\n",
+        "    length_normalization_fn=length_norm,\n",
+        "    dtype=tf.float32,\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    vocab_size=3,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    sample_temperature=tf.constant(1.0),\n",
+        "    top_p=tf.constant(0.9),\n",
+        "    padded_decode=False,\n",
+        "    enable_greedy=False)\n",
+        "ids, _ = top_p_obj.generate(\n",
+        "    initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
+        "print(\"top-p sampled Ids:\", ids)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "2hcuyJ2VWjDz"
+      },
+      "source": [
+        "# Beam search decoding\n",
+        "Beam search reduces the risk of missing hidden high-probability token ids by keeping the num_beams most likely hypotheses at each time step and eventually choosing the hypothesis that has the overall highest probability. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "cJ3WzvSrWmSA"
+      },
+      "outputs": [],
+      "source": [
+        "beam_size = 2\n",
+        "params['batch_size'] = 1\n",
+        "beam_cache = {\n",
+        "    'layer_%d' % layer: {\n",
+        "        'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32),\n",
+        "        'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32)\n",
+        "        } for layer in range(params['num_layers'])\n",
+        "    }\n",
+        "print(\"cache key shape for layer 1 :\", beam_cache['layer_1']['k'].shape)\n",
+        "ids, _ = beam_search.sequence_beam_search(\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    initial_ids=tf.constant([9], tf.int32),\n",
+        "    initial_cache=beam_cache,\n",
+        "    vocab_size=3,\n",
+        "    beam_size=beam_size,\n",
+        "    alpha=0.6,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    padded_decode=False,\n",
+        "    dtype=tf.float32)\n",
+        "print(\"Beam search ids:\", ids)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "decoding_api_in_tf_nlp.ipynb",
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/official/colab/fine_tuning_bert.ipynb
+++ b/official/colab/fine_tuning_bert.ipynb
@@ -3,7 +3,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "vXLA5InzXydn"
      },
      "source": [
@@ -12,11 +11,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
        "cellView": "form",
-        "colab": {},
-        "colab_type": "code",
        "id": "RuRlpLL-X0R_"
      },
      "outputs": [],
@@ -37,7 +34,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "1mLJmVotXs64"
      },
      "source": [
@@ -47,7 +43,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "hYEwGTeCXnnX"
      },
      "source": [
@@ -64,13 +59,15 @@
        "  \u003ctd\u003e\n",
        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
        "\u003c/table\u003e"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "YN2ACivEPxgD"
      },
      "source": [
@@ -82,7 +79,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "s2d9S2CSSO1z"
      },
      "source": [
@@ -92,34 +88,30 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "fsACVQpVSifi"
      },
      "source": [
        "### Install the TensorFlow Model Garden pip package\n",
        "\n",
-        "*  `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
+        "*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n",
+        "which is the nightly Model Garden package, built automatically every day.\n",
        "*  pip will install all models and dependencies automatically."
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "NvNr2svBM-p3"
      },
      "outputs": [],
      "source": [
-        "!pip install -q tf-nightly\n",
-        "!pip install -q tf-models-nightly"
+        "!pip install -q tf-models-official==2.4.0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "U-7qPCjWUAyy"
      },
      "source": [
@@ -128,10 +120,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "lXsXev5MNr20"
      },
      "outputs": [],
@@ -160,13 +150,12 @@
        "import official.nlp.data.classifier_data_lib\n",
        "import official.nlp.modeling.losses\n",
        "import official.nlp.modeling.models\n",
-        "import official.nlp.modeling.networks"
+        "import official.nlp.modeling.networks\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "mbanlzTvJBsz"
      },
      "source": [
@@ -176,7 +165,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "PpW0x8TpR8DT"
      },
      "source": [
@@ -185,45 +173,39 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "vzRHOLciR8eq"
      },
      "outputs": [],
      "source": [
-        "gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12\"\n",
+        "gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12\"\n",
        "tf.io.gfile.listdir(gs_folder_bert)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "9uFskufsR2LT"
      },
      "source": [
-        "You can get a pre-trained BERT encoder from TensorFlow Hub here:"
+        "You can get a pre-trained BERT encoder from [TensorFlow Hub](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3):"
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "e0dAkUttJAzj"
      },
      "outputs": [],
      "source": [
-        "hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\""
+        "hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Qv6abtRvH4xO"
      },
      "source": [
@@ -236,7 +218,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "28DvUhC1YUiB"
      },
      "source": [
@@ -252,10 +233,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "Ijikx5OsH9AT"
      },
      "outputs": [],
@@ -267,10 +246,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "xf9zz4vLYXjr"
      },
      "outputs": [],
@@ -281,7 +258,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "ZgBg2r2nYT-K"
      },
      "source": [
@@ -290,10 +266,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "IQrHxv7W7jH5"
      },
      "outputs": [],
@@ -304,7 +278,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "vhsVWYNxazz5"
      },
      "source": [
@@ -313,10 +286,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "n0gfc_VTayfQ"
      },
      "outputs": [],
@@ -327,7 +298,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "38zJcap6xkbC"
      },
      "source": [
@@ -336,10 +306,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "xON_i6SkwApW"
      },
      "outputs": [],
@@ -353,7 +321,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "9fbTyfJpNr7x"
      },
      "source": [
@@ -363,7 +330,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "wqeN54S61ZKQ"
      },
      "source": [
@@ -376,10 +342,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "idxyhmrCQcw5"
      },
      "outputs": [],
@@ -395,7 +359,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "zYHDSquU2lDU"
      },
      "source": [
@@ -404,10 +367,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "L_OfOYPg853R"
      },
      "outputs": [],
@@ -421,7 +382,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "kkAXLtuyWWDI"
      },
      "source": [
@@ -435,7 +395,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "62UTWLQd9-LB"
      },
      "source": [
@@ -446,10 +405,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "bdL-dRNRBRJT"
      },
      "outputs": [],
@@ -460,7 +417,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "UrPktnqpwqie"
      },
      "source": [
@@ -469,10 +425,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "BR7BmtU498Bh"
      },
      "outputs": [],
@@ -490,10 +444,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "has42aUdfky-"
      },
      "outputs": [],
@@ -505,7 +457,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "MU9lTWy_xXbb"
      },
      "source": [
@@ -514,10 +465,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "USD8uihw-g4J"
      },
      "outputs": [],
@@ -530,7 +479,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "xmNv4l4k-dBZ"
      },
      "source": [
@@ -540,7 +488,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "DIWjNIKq-ldh"
      },
      "source": [
@@ -553,7 +500,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "ulNZ4U96-8JZ"
      },
      "source": [
@@ -562,10 +508,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "EezOO9qj91kP"
      },
      "outputs": [],
@@ -578,7 +522,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "rxLenwAvCkBf"
      },
      "source": [
@@ -587,10 +530,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "2CetH_5C9P2m"
      },
      "outputs": [],
@@ -606,7 +547,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "P5UBnCn8Ii6s"
      },
      "source": [
@@ -617,10 +557,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "sDGiWYPLEd5a"
      },
      "outputs": [],
@@ -661,10 +599,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "yuLKxf6zHxw-"
      },
      "outputs": [],
@@ -682,7 +618,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "7FC5aLVxKVKK"
      },
      "source": [
@@ -691,10 +626,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "jyjTdGpFhO_1"
      },
      "outputs": [],
@@ -708,7 +641,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "FSwymsbkbLDA"
      },
      "source": [
@@ -718,7 +650,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Efrj3Cn1kLAp"
      },
      "source": [
@@ -728,7 +659,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "xxpOY5r2Ayq6"
      },
      "source": [
@@ -737,10 +667,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "ujapVfZ_AKW7"
      },
      "outputs": [],
@@ -758,7 +686,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "96ldxDSwkVkj"
      },
      "source": [
@@ -769,10 +696,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "cH682__U0FBv"
      },
      "outputs": [],
@@ -784,7 +709,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "XqKp3-5GIZlw"
      },
      "source": [
@@ -793,10 +717,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "bAQblMIjwkvx"
      },
      "outputs": [],
@@ -807,7 +729,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "sFmVG4SKZAw8"
      },
      "source": [
@@ -816,10 +737,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "VTjgPbp4ZDKo"
      },
      "outputs": [],
@@ -834,7 +753,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Q0NTdwZsQK8n"
      },
      "source": [
@@ -845,10 +763,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "8L__-erBwLIQ"
      },
      "outputs": [],
@@ -859,7 +775,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "mKAvkQc3heSy"
      },
      "source": [
@@ -870,23 +785,20 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "97Ll2Gichd_Y"
      },
      "outputs": [],
      "source": [
-        "checkpoint = tf.train.Checkpoint(model=bert_encoder)\n",
-        "checkpoint.restore(\n",
+        "checkpoint = tf.train.Checkpoint(encoder=bert_encoder)\n",
+        "checkpoint.read(\n",
        "    os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "2oHOql35k3Dd"
      },
      "source": [
@@ -896,7 +808,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "115caFLMk-_l"
      },
      "source": [
@@ -908,10 +819,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "w8qXKRZuCwW4"
      },
      "outputs": [],
@@ -934,7 +843,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "pXRGxiRNEHS2"
      },
      "source": [
@@ -943,10 +851,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "eQNA16bhDpky"
      },
      "outputs": [],
@@ -957,7 +863,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "xqu_K71fJQB8"
      },
      "source": [
@@ -967,7 +872,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "78FEUOOEkoP0"
      },
      "source": [
@@ -977,7 +881,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "OTNcA0O0nSq9"
      },
      "source": [
@@ -986,10 +889,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "nzi8hjeTQTRs"
      },
      "outputs": [],
@@ -1012,7 +913,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "IFtKFWbNKb0u"
      },
      "source": [
@@ -1023,10 +923,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "9ZoUgDUNJPz3"
      },
      "outputs": [],
@@ -1046,7 +944,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "7ynJibkBRTJF"
      },
      "source": [
@@ -1055,26 +952,22 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "umo0ttrgRYIM"
      },
      "outputs": [],
      "source": [
        "result = bert_classifier(my_examples, training=False)\n",
        "\n",
-        "result = tf.argmax(result).numpy()\n",
+        "result = tf.argmax(result, axis=-1).numpy()\n",
        "result"
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "utGl0M3aZCE4"
      },
      "outputs": [],
@@ -1085,7 +978,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "fVo_AnT0l26j"
      },
      "source": [
@@ -1096,10 +988,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "Nl5x6nElZqkP"
      },
      "outputs": [],
@@ -1110,10 +1000,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
+        "collapsed": true,
        "id": "y_ACvKPsVUXC"
      },
      "outputs": [],
@@ -1134,7 +1023,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "eQceYqRFT_Eg"
      },
      "source": [
@@ -1144,7 +1032,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "SaC1RlFawUpc"
      },
      "source": [
@@ -1155,7 +1042,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "CwUdjFBkzUgh"
      },
      "source": [
@@ -1167,7 +1053,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "2UTQrkyOT5wD"
      },
      "source": [
@@ -1176,10 +1061,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "XQeDFOzYR9Z9"
      },
      "outputs": [],
@@ -1192,7 +1075,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "XrFQbfErUWxa"
      },
      "source": [
@@ -1201,10 +1083,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "ymw7GOHpSHKU"
      },
      "outputs": [],
@@ -1231,7 +1111,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "uX_Sp-wTUoRm"
      },
      "source": [
@@ -1240,10 +1119,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "rkHxIK57SQ_r"
      },
      "outputs": [],
@@ -1264,7 +1141,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "stbaVouogvzS"
      },
      "source": [
@@ -1273,10 +1149,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "gwhrlQl4gxVF"
      },
      "outputs": [],
@@ -1287,7 +1161,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "dbJ76vSJj77j"
      },
      "source": [
@@ -1297,7 +1170,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "9J95LFRohiYw"
      },
      "source": [
@@ -1306,10 +1178,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "gCvaLLAxPuMc"
      },
      "outputs": [],
@@ -1351,10 +1221,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "rutkBadrhzdR"
      },
      "outputs": [],
@@ -1379,10 +1247,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "59TVgt4Z7fuU"
      },
      "outputs": [],
@@ -1393,7 +1259,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "QbklKt-w_CiI"
      },
      "source": [
@@ -1406,17 +1271,38 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "lo6479At4sP1"
+        "id": "GDWrHm0BGpbX"
      },
      "outputs": [],
      "source": [
        "# Note: 350MB download.\n",
-        "import tensorflow_hub as hub\n",
-        "hub_encoder = hub.KerasLayer(hub_url_bert, trainable=True)\n",
+        "import tensorflow_hub as hub"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "Y29meH0qGq_5"
+      },
+      "outputs": [],
+      "source": [
+        "hub_model_name = \"bert_en_uncased_L-12_H-768_A-12\" #@param [\"bert_en_uncased_L-24_H-1024_A-16\", \"bert_en_wwm_cased_L-24_H-1024_A-16\", \"bert_en_uncased_L-12_H-768_A-12\", \"bert_en_wwm_uncased_L-24_H-1024_A-16\", \"bert_en_cased_L-24_H-1024_A-16\", \"bert_en_cased_L-12_H-768_A-12\", \"bert_zh_L-12_H-768_A-12\", \"bert_multi_cased_L-12_H-768_A-12\"]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "lo6479At4sP1"
+      },
+      "outputs": [],
+      "source": [
+        "hub_encoder = hub.KerasLayer(f\"https://tfhub.dev/tensorflow/{hub_model_name}/3\",\n",
+        "                             trainable=True)\n",
        "\n",
        "print(f\"The Hub encoder has {len(hub_encoder.trainable_variables)} trainable variables\")"
      ]
@@ -1424,7 +1310,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "iTzF574wivQv"
      },
      "source": [
@@ -1433,29 +1318,27 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "XEcYrCR45Uwo"
      },
      "outputs": [],
      "source": [
        "result = hub_encoder(\n",
-        "    inputs=[glue_train['input_word_ids'][:10],\n",
-        "            glue_train['input_mask'][:10],\n",
-        "            glue_train['input_type_ids'][:10],],\n",
+        "    inputs=dict(\n",
+        "        input_word_ids=glue_train['input_word_ids'][:10],\n",
+        "        input_mask=glue_train['input_mask'][:10],\n",
+        "        input_type_ids=glue_train['input_type_ids'][:10],),\n",
        "    training=False,\n",
        ")\n",
        "\n",
-        "print(\"Pooled output shape:\", result[0].shape)\n",
-        "print(\"Sequence output shape:\", result[1].shape)"
+        "print(\"Pooled output shape:\", result['pooled_output'].shape)\n",
+        "print(\"Sequence output shape:\", result['sequence_output'].shape)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "cjojn8SmLSRI"
      },
      "source": [
@@ -1466,35 +1349,33 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "9nTDaApyLR70"
      },
      "outputs": [],
      "source": [
-        "hub_classifier, hub_encoder = bert.bert_models.classifier_model(\n",
-        "    # Caution: Most of `bert_config` is ignored if you pass a hub url.\n",
-        "    bert_config=bert_config, hub_module_url=hub_url_bert, num_labels=2)"
+        "hub_classifier = nlp.modeling.models.BertClassifier(\n",
+        "    bert_encoder,\n",
+        "    num_classes=2,\n",
+        "    dropout_rate=0.1,\n",
+        "    initializer=tf.keras.initializers.TruncatedNormal(\n",
+        "        stddev=0.02))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "xMJX3wV0_v7I"
      },
      "source": [
-        "The one downside to loading this model from TFHub is that the structure of internal keras layers is not restored. So it's more difficult to inspect or modify the model. The `TransformerEncoder` model is now a single layer:"
+        "The one downside to loading this model from TF Hub is that the structure of the internal Keras layers is not restored, so it's more difficult to inspect or modify the model. The `BertEncoder` model is now a single layer:"
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "pD71dnvhM2QS"
      },
      "outputs": [],
@@ -1504,10 +1385,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "nLZD-isBzNKi"
      },
      "outputs": [],
@@ -1522,7 +1401,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "ZxSqH0dNAgXV"
      },
      "source": [
@@ -1530,13 +1408,12 @@
        "\n",
        "### Low level model building\n",
        "\n",
-        "If you need a more control over the construction of the model it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.TransformerEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture it may not be correct or possible to reload the pre-trained checkpoint so you'll need to retrain from scratch."
+        "If you need more control over the construction of the model, it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.BertEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture it may not be correct or possible to reload the pre-trained checkpoint so you'll need to retrain from scratch."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "0cgABEwDj06P"
      },
      "source": [
@@ -1545,45 +1422,40 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "5r_yqhBFSVEM"
      },
      "outputs": [],
      "source": [
-        "transformer_config = config_dict.copy()\n",
+        "bert_encoder_config = config_dict.copy()\n",
        "\n",
        "# You need to rename a few fields to make this work:\n",
-        "transformer_config['attention_dropout_rate'] = transformer_config.pop('attention_probs_dropout_prob')\n",
-        "transformer_config['activation'] = tf_utils.get_activation(transformer_config.pop('hidden_act'))\n",
-        "transformer_config['dropout_rate'] = transformer_config.pop('hidden_dropout_prob')\n",
-        "transformer_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n",
-        "          stddev=transformer_config.pop('initializer_range'))\n",
-        "transformer_config['max_sequence_length'] = transformer_config.pop('max_position_embeddings')\n",
-        "transformer_config['num_layers'] = transformer_config.pop('num_hidden_layers')\n",
+        "bert_encoder_config['attention_dropout_rate'] = bert_encoder_config.pop('attention_probs_dropout_prob')\n",
+        "bert_encoder_config['activation'] = tf_utils.get_activation(bert_encoder_config.pop('hidden_act'))\n",
+        "bert_encoder_config['dropout_rate'] = bert_encoder_config.pop('hidden_dropout_prob')\n",
+        "bert_encoder_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n",
+        "          stddev=bert_encoder_config.pop('initializer_range'))\n",
+        "bert_encoder_config['max_sequence_length'] = bert_encoder_config.pop('max_position_embeddings')\n",
+        "bert_encoder_config['num_layers'] = bert_encoder_config.pop('num_hidden_layers')\n",
        "\n",
-        "transformer_config"
+        "bert_encoder_config"
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "rIO8MI7LLijh"
      },
      "outputs": [],
      "source": [
-        "manual_encoder = nlp.modeling.networks.TransformerEncoder(**transformer_config)"
+        "manual_encoder = nlp.modeling.networks.BertEncoder(**bert_encoder_config)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "4a4tFSg9krRi"
      },
      "source": [
@@ -1592,23 +1464,20 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "X6N9NEqfXJCx"
      },
      "outputs": [],
      "source": [
-        "checkpoint = tf.train.Checkpoint(model=manual_encoder)\n",
-        "checkpoint.restore(\n",
+        "checkpoint = tf.train.Checkpoint(encoder=manual_encoder)\n",
+        "checkpoint.read(\n",
        "    os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "1BPiPO4ykuwM"
      },
      "source": [
@@ -1617,10 +1486,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "hlVdgJKmj389"
      },
      "outputs": [],
@@ -1634,7 +1501,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "nJMXvVgJkyBv"
      },
      "source": [
@@ -1643,10 +1509,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "tQX57GJ6wkAb"
      },
      "outputs": [],
@@ -1654,17 +1518,14 @@
        "manual_classifier = nlp.modeling.models.BertClassifier(\n",
        "        bert_encoder,\n",
        "        num_classes=2,\n",
-        "        dropout_rate=transformer_config['dropout_rate'],\n",
-        "        initializer=tf.keras.initializers.TruncatedNormal(\n",
-        "          stddev=bert_config.initializer_range))"
+        "        dropout_rate=bert_encoder_config['dropout_rate'],\n",
+        "        initializer=bert_encoder_config['initializer'])"
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "kB-nBWhQk0dS"
      },
      "outputs": [],
@@ -1675,7 +1536,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "E6AJlOSyIO1L"
      },
      "source": [
@@ -1688,10 +1548,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "28Dv3BPRlFTD"
      },
      "outputs": [],
@@ -1703,7 +1561,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "LRjcHr0UlT8c"
      },
      "source": [
@@ -1714,10 +1571,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "MHY8K6kDngQn"
      },
      "outputs": [],
@@ -1733,10 +1588,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
+        "collapsed": true,
        "id": "wKIcSprulu3P"
      },
      "outputs": [],
@@ -1752,7 +1606,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "IMTC_gfAl_PZ"
      },
      "source": [
@@ -1761,10 +1614,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "YRt3VTmBmCBY"
      },
      "outputs": [],
@@ -1786,7 +1637,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "l8D9Lv3Bn740"
      },
      "source": [
@@ -1795,10 +1645,8 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "2Hf2rpRXk89N"
      },
      "outputs": [],

--- a/official/colab/nlp/customize_encoder.ipynb
+++ b/official/colab/nlp/customize_encoder.ipynb
 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Customizing a Transformer Encoder",
+      "private_outputs": true,
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Bp8t2AI8i7uP"
      },
      "source": [
@@ -12,14 +26,10 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
        "cellView": "form",
-        "colab": {},
-        "colab_type": "code",
        "id": "rxPj2Lsni9O4"
      },
-      "outputs": [],
      "source": [
        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
@@ -32,12 +42,13 @@
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "6xS-9i5DrRvO"
      },
      "source": [
@@ -47,30 +58,28 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Mwb9uw1cDXsa"
      },
      "source": [
-        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
-        "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
-        "  \u003c/td\u003e\n",
-        "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
-        "  \u003c/td\u003e\n",
-        "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
-        "  \u003c/td\u003e\n",
-        "  \u003ctd\u003e\n",
-        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
-        "  \u003c/td\u003e\n",
-        "\u003c/table\u003e"
+        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+        "  </td>\n",
+        "</table>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "iLrcV4IyrcGX"
      },
      "source": [
@@ -84,7 +93,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "YYxdyoWgsl8t"
      },
      "source": [
@@ -94,34 +102,30 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "fEJSFutUsn_h"
      },
      "source": [
        "### Install the TensorFlow Model Garden pip package\n",
        "\n",
-        "*  `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
+        "*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n",
+        "which is the nightly Model Garden package created daily automatically.\n",
        "*  `pip` will install all models and dependencies automatically."
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "thsKZDjhswhR"
      },
-      "outputs": [],
      "source": [
-        "!pip install -q tf-nightly\n",
-        "!pip install -q tf-models-nightly"
-      ]
+        "!pip install -q tf-models-official==2.4.0"
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "hpf7JPCVsqtv"
      },
      "source": [
@@ -130,13 +134,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "my4dp-RMssQe"
      },
-      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import tensorflow as tf\n",
@@ -144,12 +144,13 @@
        "from official.modeling import activations\n",
        "from official.nlp import modeling\n",
        "from official.nlp.modeling import layers, losses, models, networks"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "vjDmVsFfs85n"
      },
      "source": [
@@ -160,13 +161,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "Oav8sbgstWc-"
      },
-      "outputs": [],
      "source": [
        "cfg = {\n",
        "    \"vocab_size\": 100,\n",
@@ -177,22 +174,23 @@
        "    \"activation\": activations.gelu,\n",
        "    \"dropout_rate\": 0.1,\n",
        "    \"attention_dropout_rate\": 0.1,\n",
-        "    \"sequence_length\": 16,\n",
+        "    \"max_sequence_length\": 16,\n",
        "    \"type_vocab_size\": 2,\n",
        "    \"initializer\": tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
        "}\n",
-        "bert_encoder = modeling.networks.TransformerEncoder(**cfg)\n",
+        "bert_encoder = modeling.networks.BertEncoder(**cfg)\n",
        "\n",
        "def build_classifier(bert_encoder):\n",
        "  return modeling.models.BertClassifier(bert_encoder, num_classes=2)\n",
        "\n",
        "canonical_classifier_model = build_classifier(bert_encoder)"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Qe2UWI6_tsHo"
      },
      "source": [
@@ -203,31 +201,28 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "csED2d-Yt5h6"
      },
-      "outputs": [],
      "source": [
        "def predict(model):\n",
        "  batch_size = 3\n",
        "  np.random.seed(0)\n",
        "  word_ids = np.random.randint(\n",
-        "      cfg[\"vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
-        "  mask = np.random.randint(2, size=(batch_size, cfg[\"sequence_length\"]))\n",
+        "      cfg[\"vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
+        "  mask = np.random.randint(2, size=(batch_size, cfg[\"max_sequence_length\"]))\n",
        "  type_ids = np.random.randint(\n",
-        "      cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
+        "      cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
        "  print(model([word_ids, mask, type_ids], training=False))\n",
        "\n",
        "predict(canonical_classifier_model)"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "PzKStEK9t_Pb"
      },
      "source": [
@@ -239,7 +234,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "rmwQfhj6fmKz"
      },
      "source": [
@@ -250,7 +244,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "xsMgEVHAui11"
      },
      "source": [
@@ -263,26 +256,21 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "-JBabpa2AOz8"
      },
      "source": [
        "#### Without Customization\n",
        "\n",
-        "Without any customization, `EncoderScaffold` behaves the same the canonical `TransformerEncoder`.\n",
+        "Without any customization, `EncoderScaffold` behaves the same as the canonical `BertEncoder`.\n",
        "\n",
-        "As shown in the following example, `EncoderScaffold` can load `TransformerEncoder`'s weights and output the same values:"
+        "As shown in the following example, `EncoderScaffold` can load `BertEncoder`'s weights and output the same values:"
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "ktNzKuVByZQf"
      },
-      "outputs": [],
      "source": [
        "default_hidden_cfg = dict(\n",
        "    num_attention_heads=cfg[\"num_attention_heads\"],\n",
@@ -296,10 +284,9 @@
        "    vocab_size=cfg[\"vocab_size\"],\n",
        "    type_vocab_size=cfg[\"type_vocab_size\"],\n",
        "    hidden_size=cfg[\"hidden_size\"],\n",
-        "    seq_length=cfg[\"sequence_length\"],\n",
        "    initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
        "    dropout_rate=cfg[\"dropout_rate\"],\n",
-        "    max_seq_length=cfg[\"sequence_length\"],\n",
+        "    max_seq_length=cfg[\"max_sequence_length\"]\n",
        ")\n",
        "default_kwargs = dict(\n",
        "    hidden_cfg=default_hidden_cfg,\n",
@@ -309,17 +296,19 @@
        "    return_all_layer_outputs=True,\n",
        "    pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
        ")\n",
+        "\n",
        "encoder_scaffold = modeling.networks.EncoderScaffold(**default_kwargs)\n",
        "classifier_model_from_encoder_scaffold = build_classifier(encoder_scaffold)\n",
        "classifier_model_from_encoder_scaffold.set_weights(\n",
        "    canonical_classifier_model.get_weights())\n",
        "predict(classifier_model_from_encoder_scaffold)"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "sMaUmLyIuwcs"
      },
      "source": [
@@ -332,18 +321,14 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "LTinnaG6vcsw"
      },
-      "outputs": [],
      "source": [
        "word_ids = tf.keras.layers.Input(\n",
-        "    shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
+        "    shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
        "mask = tf.keras.layers.Input(\n",
-        "    shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
+        "    shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
        "embedding_layer = modeling.layers.OnDeviceEmbedding(\n",
        "    vocab_size=cfg['vocab_size'],\n",
        "    embedding_width=cfg['hidden_size'],\n",
@@ -353,12 +338,13 @@
        "attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])\n",
        "new_embedding_network = tf.keras.Model([word_ids, mask],\n",
        "                                       [word_embeddings, attention_mask])"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "HN7_yu-6O3qI"
      },
      "source": [
@@ -368,21 +354,18 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "fO9zKFE4OpHp"
      },
-      "outputs": [],
      "source": [
        "tf.keras.utils.plot_model(new_embedding_network, show_shapes=True, dpi=48)"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "9cOaGQHLv12W"
      },
      "source": [
@@ -391,13 +374,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "mtFDMNf2vIl9"
      },
-      "outputs": [],
      "source": [
        "kwargs = dict(default_kwargs)\n",
        "\n",
@@ -412,12 +391,13 @@
        "\n",
        "# Assert that there are only two inputs.\n",
        "assert len(classifier_model.inputs) == 2"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Z73ZQDtmwg9K"
      },
      "source": [
@@ -432,13 +412,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "uAIarLZgw6pA"
      },
-      "outputs": [],
      "source": [
        "kwargs = dict(default_kwargs)\n",
        "\n",
@@ -452,12 +428,13 @@
        "\n",
        "# Assert that the variable `rezero_alpha` from ReZeroTransformer exists.\n",
        "assert 'rezero_alpha' in ''.join([x.name for x in classifier_model.trainable_weights])"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "6PMHFdvnxvR0"
      },
      "source": [
@@ -470,7 +447,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "D6FejlgwyAy_"
      },
      "source": [
@@ -485,13 +461,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "nFrSMrZuyNeQ"
      },
-      "outputs": [],
      "source": [
        "# Use TalkingHeadsAttention\n",
        "hidden_cfg = dict(default_hidden_cfg)\n",
@@ -508,12 +480,13 @@
        "\n",
        "# Assert that the variable `pre_softmax_weight` from TalkingHeadsAttention exists.\n",
        "assert 'pre_softmax_weight' in ''.join([x.name for x in classifier_model.trainable_weights])"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "kuEJcTyByVvI"
      },
      "source": [
@@ -528,13 +501,9 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "XAbKy_l4y_-i"
      },
-      "outputs": [],
      "source": [
        "# Use GatedFeedforward\n",
        "hidden_cfg = dict(default_hidden_cfg)\n",
@@ -551,12 +520,13 @@
        "\n",
        "# Assert that the variable `gate` from GatedFeedforward exists.\n",
        "assert 'gate' in ''.join([x.name for x in classifier_model.trainable_weights])"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "a_8NWUhkzeAq"
      },
      "source": [
@@ -564,29 +534,26 @@
        "\n",
        "Finally, you could also build a new encoder using building blocks in the modeling library.\n",
        "\n",
-        "See [AlbertTransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_transformer_encoder.py) as an example:\n"
+        "See [AlbertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_encoder.py) as an example:\n"
      ]
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "xsiA3RzUzmUM"
      },
-      "outputs": [],
      "source": [
-        "albert_encoder = modeling.networks.AlbertTransformerEncoder(**cfg)\n",
+        "albert_encoder = modeling.networks.AlbertEncoder(**cfg)\n",
        "classifier_model = build_classifier(albert_encoder)\n",
        "# ... Train the model ...\n",
        "predict(classifier_model)"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "MeidDfhlHKSO"
      },
      "source": [
@@ -595,31 +562,14 @@
    },
    {
      "cell_type": "code",
-      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "Uv_juT22HERW"
      },
-      "outputs": [],
      "source": [
        "tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "collapsed_sections": [],
-      "name": "Customizing a Transformer Encoder",
-      "private_outputs": true,
-      "provenance": [],
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
+      ],
+      "execution_count": null,
+      "outputs": []
    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
+  ]
+}
\ No newline at end of file
--- a/official/colab/nlp/nlp_modeling_library_intro.ipynb
+++ b/official/colab/nlp/nlp_modeling_library_intro.ipynb
@@ -3,7 +3,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "80xnUmoI7fBX"
      },
      "source": [
@@ -15,8 +14,6 @@
      "execution_count": null,
      "metadata": {
        "cellView": "form",
-        "colab": {},
-        "colab_type": "code",
        "id": "8nvTnfs6Q692"
      },
      "outputs": [],
@@ -37,7 +34,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "WmfcMK5P5C1G"
      },
      "source": [
@@ -47,7 +43,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "cH-oJ8R6AHMK"
      },
      "source": [
@@ -70,7 +65,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "0H_EFIhq4-MJ"
      },
      "source": [
@@ -82,7 +76,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "2N97-dps_nUk"
      },
      "source": [
@@ -92,13 +85,13 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "459ygAVl_rg0"
      },
      "source": [
        "### Install the TensorFlow Model Garden pip package\n",
        "\n",
-        "*  `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
+        "*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n",
+        "which is the nightly Model Garden package created daily automatically.\n",
        "*  `pip` will install all models and dependencies automatically."
      ]
    },
@@ -106,20 +99,16 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "Y-qGkdh6_sZc"
      },
      "outputs": [],
      "source": [
-        "!pip install -q tf-nightly\n",
-        "!pip install -q tf-models-nightly"
+        "!pip install -q tf-models-official==2.4.0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "e4huSSwyAG_5"
      },
      "source": [
@@ -130,8 +119,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "jqYXqtjBAJd9"
      },
      "outputs": [],
@@ -146,7 +133,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "djBQWjvy-60Y"
      },
      "source": [
@@ -160,13 +146,12 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "MKuHVlsCHmiq"
      },
      "source": [
-        "### Build a `BertPretrainer` model wrapping `TransformerEncoder`\n",
+        "### Build a `BertPretrainer` model wrapping `BertEncoder`\n",
        "\n",
-        "The [TransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/transformer_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
+        "The [BertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/bert_encoder.py) implements the Transformer-based encoder as described in the [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
        "\n",
        "The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives."
      ]
@@ -175,8 +160,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "EXkcXz-9BwB3"
      },
      "outputs": [],
@@ -184,14 +167,13 @@
        "# Build a small transformer network.\n",
        "vocab_size = 100\n",
        "sequence_length = 16\n",
-        "network = modeling.networks.TransformerEncoder(\n",
+        "network = modeling.networks.BertEncoder(\n",
        "    vocab_size=vocab_size, num_layers=2, sequence_length=16)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "0NH5irV5KTMS"
      },
      "source": [
@@ -204,8 +186,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "lZNoZkBrIoff"
      },
      "outputs": [],
@@ -217,8 +197,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "o7eFOZXiIl-b"
      },
      "outputs": [],
@@ -232,7 +210,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "d5h5HT7gNHx_"
      },
      "source": [
@@ -243,8 +220,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "2tcNfm03IBF7"
      },
      "outputs": [],
@@ -256,8 +231,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "F2oHrXGUIS0M"
      },
      "outputs": [],
@@ -280,7 +253,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "bnx3UCHniCS5"
      },
      "source": [
@@ -292,8 +264,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "k30H4Q86f52x"
      },
      "outputs": [],
@@ -316,7 +286,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "wrmSs8GjHxVw"
      },
      "source": [
@@ -328,7 +297,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "k8cQVFvBCV4s"
      },
      "source": [
@@ -342,28 +310,25 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "xrLLEWpfknUW"
      },
      "source": [
-        "### Build a BertSpanLabeler wrapping TransformerEncoder\n",
+        "### Build a BertSpanLabeler wrapping BertEncoder\n",
        "\n",
        "[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n",
        "\n",
-        "Note that `BertSpanLabeler` wraps a `TransformerEncoder`, the weights of which can be restored from the above pretraining model.\n"
+        "Note that `BertSpanLabeler` wraps a `BertEncoder`, the weights of which can be restored from the above pretraining model.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "B941M4iUCejO"
      },
      "outputs": [],
      "source": [
-        "network = modeling.networks.TransformerEncoder(\n",
+        "network = modeling.networks.BertEncoder(\n",
        "        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
        "\n",
        "# Create a BERT trainer with the created network.\n",
@@ -373,7 +338,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "QpB9pgj4PpMg"
      },
      "source": [
@@ -384,8 +348,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "RbqRNJCLJu4H"
      },
      "outputs": [],
@@ -397,8 +359,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "fUf1vRxZJwio"
      },
      "outputs": [],
@@ -417,7 +377,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "WqhgQaN1lt-G"
      },
      "source": [
@@ -429,8 +388,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "waqs6azNl3Nn"
      },
      "outputs": [],
@@ -450,7 +407,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "Zdf03YtZmd_d"
      },
      "source": [
@@ -460,7 +416,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "0A1XnGSTChg9"
      },
      "source": [
@@ -472,26 +427,23 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "MSK8OpZgnQa9"
      },
      "source": [
-        "### Build a BertClassifier model wrapping TransformerEncoder\n",
+        "### Build a BertClassifier model wrapping BertEncoder\n",
        "\n",
-        "[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a simple token classification model containing a single classification head using the `TokenClassification` network."
+        "[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "cXXCsffkCphk"
      },
      "outputs": [],
      "source": [
-        "network = modeling.networks.TransformerEncoder(\n",
+        "network = modeling.networks.BertEncoder(\n",
        "        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
        "\n",
        "# Create a BERT trainer with the created network.\n",
@@ -503,7 +455,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "8tZKueKYP4bB"
      },
      "source": [
@@ -514,8 +465,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "snlutm9ZJgEZ"
      },
      "outputs": [],
@@ -527,8 +476,6 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "yyHPHsqBJkCz"
      },
      "outputs": [],
@@ -546,7 +493,6 @@
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "w--a2mg4nzKm"
      },
      "source": [
@@ -559,23 +505,20 @@
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
-        "colab": {},
-        "colab_type": "code",
        "id": "9X0S1DoFn_5Q"
      },
      "outputs": [],
      "source": [
        "labels = np.random.randint(num_classes, size=(batch_size))\n",
        "\n",
-        "loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
-        "    labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n",
+        "loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
+        "    labels, logits, from_logits=True)\n",
        "print(loss)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
-        "colab_type": "text",
        "id": "mzBqOylZo3og"
      },
      "source": [

--- a/official/colab/uncertainty_quantification_with_sngp_bert.ipynb
+++ b/official/colab/uncertainty_quantification_with_sngp_bert.ipynb
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vs3a5tGVAWGI"
+      },
+      "source": [
+        "##### Copyright 2021 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "HYfsarcYBJQp"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "aOpqCFEyBQDd"
+      },
+      "source": [
+        "# Uncertainty-aware Deep Language Learning with BERT-SNGP"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6MlSYP6cBT61"
+      },
+      "source": [
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/uncertainty_quantification_with_sngp_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/uncertainty_quantification_with_sngp_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/uncertainty_quantification_with_sngp_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/docs/models/official/colab/uncertainty_quantification_with_sngp_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-IM5IzM26GBh"
+      },
+      "source": [
+        "In the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp), you learned how to build an SNGP model on top of a deep residual network to improve its ability to quantify its uncertainty. In this tutorial, you will apply SNGP to a natural language understanding (NLU) task by building it on top of a deep BERT encoder to improve a deep NLU model's ability to detect out-of-scope queries. \n",
+        "\n",
+        "Specifically, you will:\n",
+        "* Build BERT-SNGP, a SNGP-augmented [BERT](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2) model.\n",
+        "* Load the [CLINC Out-of-scope (OOS)](https://www.tensorflow.org/datasets/catalog/clinc_oos) intent detection dataset.\n",
+        "* Train the BERT-SNGP model.\n",
+        "* Evaluate the BERT-SNGP model's performance in uncertainty calibration and out-of-domain detection.\n",
+        "\n",
+        "Beyond CLINC OOS, the SNGP model has been applied to large-scale datasets such as [Jigsaw toxicity detection](https://www.tensorflow.org/datasets/catalog/wikipedia_toxicity_subtypes), and to the image datasets such as [CIFAR-100](https://www.tensorflow.org/datasets/catalog/cifar100) and [ImageNet](https://www.tensorflow.org/datasets/catalog/imagenet2012). \n",
+        "For benchmark results of SNGP and other uncertainty methods, as well as high-quality implementation with end-to-end training / evaluation scripts, you can check out the [Uncertainty Baselines](https://github.com/google/uncertainty-baselines) benchmark."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-bsids4eAYYI"
+      },
+      "source": [
+        "## Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "3sgnLBKk7iuR"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install tf-models-nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "M42dnVSk7dVy"
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "import sklearn.metrics\n",
+        "import sklearn.calibration\n",
+        "\n",
+        "import tensorflow_hub as hub\n",
+        "import tensorflow_datasets as tfds\n",
+        "\n",
+        "import numpy as np\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "import official.nlp.modeling.layers as layers\n",
+        "import official.nlp.optimization as optimization"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "cnRQfguq6GZj"
+      },
+      "source": [
+        "First implement a standard BERT classifier following the [classify text with BERT](https://www.tensorflow.org/tutorials/text/classify_text_with_bert) tutorial. We will use the [BERT-base](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3) encoder, and the built-in [`ClassificationHead`](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/cls_head.py) as the classifier."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "bNBEGs7s6NHB"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Standard BERT model\n",
+        "\n",
+        "PREPROCESS_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'\n",
+        "MODEL_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'\n",
+        "\n",
+        "class BertClassifier(tf.keras.Model):\n",
+        "  def __init__(self, \n",
+        "               num_classes=150, inner_dim=768, dropout_rate=0.1,\n",
+        "               **classifier_kwargs):\n",
+        "    \n",
+        "    super().__init__()\n",
+        "    self.classifier_kwargs = classifier_kwargs\n",
+        "\n",
+        "    # Initiate the BERT encoder components.\n",
+        "    self.bert_preprocessor = hub.KerasLayer(PREPROCESS_HANDLE, name='preprocessing')\n",
+        "    self.bert_hidden_layer = hub.KerasLayer(MODEL_HANDLE, trainable=True, name='bert_encoder')\n",
+        "\n",
+        "    # Defines the encoder and classification layers.\n",
+        "    self.bert_encoder = self.make_bert_encoder()\n",
+        "    self.classifier = self.make_classification_head(num_classes, inner_dim, dropout_rate)\n",
+        "\n",
+        "  def make_bert_encoder(self):\n",
+        "    text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')\n",
+        "    encoder_inputs = self.bert_preprocessor(text_inputs)\n",
+        "    encoder_outputs = self.bert_hidden_layer(encoder_inputs)\n",
+        "    return tf.keras.Model(text_inputs, encoder_outputs)\n",
+        "\n",
+        "  def make_classification_head(self, num_classes, inner_dim, dropout_rate):\n",
+        "    return layers.ClassificationHead(\n",
+        "        num_classes=num_classes, \n",
+        "        inner_dim=inner_dim,\n",
+        "        dropout_rate=dropout_rate,\n",
+        "        **self.classifier_kwargs)\n",
+        "\n",
+        "  def call(self, inputs, **kwargs):\n",
+        "    encoder_outputs = self.bert_encoder(inputs)\n",
+        "    classifier_inputs = encoder_outputs['sequence_output']\n",
+        "    return self.classifier(classifier_inputs, **kwargs)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "SbhbNbKk6WNR"
+      },
+      "source": [
+        "### Build SNGP model"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "p7YakN0V6Oif"
+      },
+      "source": [
+        "To implement a BERT-SNGP model, you only need to replace the `ClassificationHead` with the built-in [`GaussianProcessClassificationHead`](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/cls_head.py). Spectral normalization is already pre-packaged into this classification head. Like in the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp), add a covariance reset callback to the model, so the model automatically resets the covariance estimator at the beginning of a new epoch to avoid counting the same data twice."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "QCaJy85y8WeE"
+      },
+      "outputs": [],
+      "source": [
+        "class ResetCovarianceCallback(tf.keras.callbacks.Callback):\n",
+        "\n",
+        "  def on_epoch_begin(self, epoch, logs=None):\n",
+        "    \"\"\"Resets covariance matrix at the begining of the epoch.\"\"\"\n",
+        "    if epoch \u003e 0:\n",
+        "      self.model.classifier.reset_covariance_matrix()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "YoHgOuiZ6Q4y"
+      },
+      "outputs": [],
+      "source": [
+        "class SNGPBertClassifier(BertClassifier):\n",
+        "\n",
+        "  def make_classification_head(self, num_classes, inner_dim, dropout_rate):\n",
+        "    return layers.GaussianProcessClassificationHead(\n",
+        "        num_classes=num_classes, \n",
+        "        inner_dim=inner_dim,\n",
+        "        dropout_rate=dropout_rate,\n",
+        "        gp_cov_momentum=-1,\n",
+        "        temperature=30.,\n",
+        "        **self.classifier_kwargs)\n",
+        "\n",
+        "  def fit(self, *args, **kwargs):\n",
+        "    \"\"\"Adds ResetCovarianceCallback to model callbacks.\"\"\"\n",
+        "    kwargs['callbacks'] = list(kwargs.get('callbacks', []))\n",
+        "    kwargs['callbacks'].append(ResetCovarianceCallback())\n",
+        "\n",
+        "    return super().fit(*args, **kwargs)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "UOj5YWTt6dCe"
+      },
+      "source": [
+        "Note: The `GaussianProcessClassificationHead` takes a new argument `temperature`. It corresponds to the $\\lambda$ parameter in the __mean-field approximation__ introduced in the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp). In practice, this value is usually treated as a hyperparameter, and is fine-tuned to optimize the model's calibration performance."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "qdU90uDT6hFq"
+      },
+      "source": [
+        "### Load CLINC OOS dataset"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AnuNeyHw6kH7"
+      },
+      "source": [
+        "Now load the [CLINC OOS](https://www.tensorflow.org/datasets/catalog/clinc_oos) intent detection dataset. This dataset contains 15000 users' spoken queries collected over 150 intent classes; it also contains 1000 out-of-domain (OOD) sentences that are not covered by any of the known classes."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mkMZN2iA6hhg"
+      },
+      "outputs": [],
+      "source": [
+        "(clinc_train, clinc_test, clinc_test_oos), ds_info = tfds.load(\n",
+        "    'clinc_oos', split=['train', 'test', 'test_oos'], with_info=True, batch_size=-1)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "UJSL2nm8Bo02"
+      },
+      "source": [
+        "Make the train and test data."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "cgkOOZOq6fQL"
+      },
+      "outputs": [],
+      "source": [
+        "train_examples = clinc_train['text']\n",
+        "train_labels = clinc_train['intent']\n",
+        "\n",
+        "# Makes the in-domain (IND) evaluation data.\n",
+        "ind_eval_data = (clinc_test['text'], clinc_test['intent'])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Kw76f6caBq_E"
+      },
+      "source": [
+        "Create an OOD evaluation dataset. For this, combine the in-domain test data `clinc_test` and the out-of-domain data `clinc_test_oos`. We will also assign label 0 to the in-domain examples, and label 1 to the out-of-domain examples. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "uVFuzecR64FJ"
+      },
+      "outputs": [],
+      "source": [
+        "test_data_size = ds_info.splits['test'].num_examples\n",
+        "oos_data_size = ds_info.splits['test_oos'].num_examples\n",
+        "\n",
+        "# Combines the in-domain and out-of-domain test examples.\n",
+        "oos_texts = tf.concat([clinc_test['text'], clinc_test_oos['text']], axis=0)\n",
+        "oos_labels = tf.constant([0] * test_data_size + [1] * oos_data_size)\n",
+        "\n",
+        "# Converts into a TF dataset.\n",
+        "ood_eval_dataset = tf.data.Dataset.from_tensor_slices(\n",
+        "    {\"text\": oos_texts, \"label\": oos_labels})"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZcHwfwfU6qCE"
+      },
+      "source": [
+        "### Train and evaluate"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "_VTY6KYc6sBB"
+      },
+      "source": [
+        "First set up the basic training configurations."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "_-uUkUtk6qWC"
+      },
+      "outputs": [],
+      "source": [
+        "TRAIN_EPOCHS = 3\n",
+        "TRAIN_BATCH_SIZE = 32\n",
+        "EVAL_BATCH_SIZE = 256"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "tiEjMdFV6wXQ"
+      },
+      "outputs": [],
+      "source": [
+        "#@title\n",
+        "\n",
+        "def bert_optimizer(learning_rate, \n",
+        "                   batch_size=TRAIN_BATCH_SIZE, epochs=TRAIN_EPOCHS, \n",
+        "                   warmup_rate=0.1):\n",
+        "  \"\"\"Creates an AdamWeightDecay optimizer with learning rate schedule.\"\"\"\n",
+        "  train_data_size = ds_info.splits['train'].num_examples\n",
+        "  \n",
+        "  steps_per_epoch = int(train_data_size / batch_size)\n",
+        "  num_train_steps = steps_per_epoch * epochs\n",
+        "  num_warmup_steps = int(warmup_rate * num_train_steps)  \n",
+        "\n",
+        "  # Creates learning schedule.\n",
+        "  lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(\n",
+        "      initial_learning_rate=learning_rate,\n",
+        "      decay_steps=num_train_steps,\n",
+        "      end_learning_rate=0.0)  \n",
+        "  \n",
+        "  return optimization.AdamWeightDecay(\n",
+        "      learning_rate=lr_schedule,\n",
+        "      weight_decay_rate=0.01,\n",
+        "      epsilon=1e-6,\n",
+        "      exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "KX_Hzl3l6w-H"
+      },
+      "outputs": [],
+      "source": [
+        "optimizer = bert_optimizer(learning_rate=1e-4)\n",
+        "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
+        "metrics = tf.metrics.SparseCategoricalAccuracy()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ptn9Cupe6z7o"
+      },
+      "outputs": [],
+      "source": [
+        "fit_configs = dict(batch_size=TRAIN_BATCH_SIZE,\n",
+        "                   epochs=TRAIN_EPOCHS,\n",
+        "                   validation_batch_size=EVAL_BATCH_SIZE, \n",
+        "                   validation_data=ind_eval_data)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "0ZK5PBwW61jd"
+      },
+      "outputs": [],
+      "source": [
+        "sngp_model = SNGPBertClassifier()\n",
+        "sngp_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)\n",
+        "sngp_model.fit(train_examples, train_labels, **fit_configs)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "cpDsgTYx63tO"
+      },
+      "source": [
+        "### Evaluate OOD performance"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "d5NGVe7L67bB"
+      },
+      "source": [
+        "Evaluate how well the model can detect the unfamiliar out-of-domain queries. For rigorous evaluation, use the OOD evaluation dataset `ood_eval_dataset` built earlier."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "yyLgt_lL7APo"
+      },
+      "outputs": [],
+      "source": [
+        "#@title\n",
+        "\n",
+        "def oos_predict(model, ood_eval_dataset, **model_kwargs):\n",
+        "  oos_labels = []\n",
+        "  oos_probs = []\n",
+        "\n",
+        "  ood_eval_dataset = ood_eval_dataset.batch(EVAL_BATCH_SIZE)\n",
+        "  for oos_batch in ood_eval_dataset:\n",
+        "    oos_text_batch = oos_batch[\"text\"]\n",
+        "    oos_label_batch = oos_batch[\"label\"] \n",
+        "\n",
+        "    pred_logits = model(oos_text_batch, **model_kwargs)\n",
+        "    pred_probs_all = tf.nn.softmax(pred_logits, axis=-1)\n",
+        "    pred_probs = tf.reduce_max(pred_probs_all, axis=-1)\n",
+        "\n",
+        "    oos_labels.append(oos_label_batch)\n",
+        "    oos_probs.append(pred_probs)\n",
+        "\n",
+        "  oos_probs = tf.concat(oos_probs, axis=0)\n",
+        "  oos_labels = tf.concat(oos_labels, axis=0) \n",
+        "\n",
+        "  return oos_probs, oos_labels"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Dmc2tVXs6_uo"
+      },
+      "source": [
+        "Compute the OOD probabilities as $1 - p(x)$, where $p(x)=\\max_k softmax(logit(x))_k$ is the maximum predictive probability over the classes."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "_9aFVVDO7C7o"
+      },
+      "outputs": [],
+      "source": [
+        "sngp_probs, ood_labels = oos_predict(sngp_model, ood_eval_dataset)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "_PC0wwZp7GJD"
+      },
+      "outputs": [],
+      "source": [
+        "ood_probs = 1 - sngp_probs"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AsandMTX7HjX"
+      },
+      "source": [
+        "Now evaluate how well the model's uncertainty score `ood_probs` predicts the out-of-domain label. First compute the area under the precision-recall curve (AUPRC) for OOD probability vs. OOD detection accuracy."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "0u5Wx8AP7Mdx"
+      },
+      "outputs": [],
+      "source": [
+        "precision, recall, _ = sklearn.metrics.precision_recall_curve(ood_labels, ood_probs)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "axcctOsh7N5A"
+      },
+      "outputs": [],
+      "source": [
+        "auprc = sklearn.metrics.auc(recall, precision)\n",
+        "print(f'SNGP AUPRC: {auprc:.4f}')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "U_GEqxq-7Q1Y"
+      },
+      "source": [
+        "This matches the SNGP performance reported at the CLINC OOS benchmark under the [Uncertainty Baselines](https://github.com/google/uncertainty-baselines)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8H4vYcyd7Ux2"
+      },
+      "source": [
+        "Next, examine the model's quality in [uncertainty calibration](https://scikit-learn.org/stable/modules/calibration.html), i.e., whether the model's predictive probability corresponds to its predictive accuracy. A well-calibrated model is considered trustworthy, since, for example, its predictive probability $p(x)=0.8$ means that the model is correct 80% of the time."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "x5GxrSWJ7SYn"
+      },
+      "outputs": [],
+      "source": [
+        "prob_true, prob_pred = sklearn.calibration.calibration_curve(\n",
+        "    ood_labels, ood_probs, n_bins=10, strategy='quantile')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ozzJM-D-7XVq"
+      },
+      "outputs": [],
+      "source": [
+        "plt.plot(prob_pred, prob_true)\n",
+        "\n",
+        "plt.plot([0., 1.], [0., 1.], c='k', linestyle=\"--\")\n",
+        "plt.xlabel('Predictive Probability')\n",
+        "plt.ylabel('Predictive Accuracy')\n",
+        "plt.title('Calibration Plots, SNGP')\n",
+        "\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "36M6HeHx7ZI4"
+      },
+      "source": [
+        "## Resources and further reading"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "xdFTpyaP0A-N"
+      },
+      "source": [
+        "* See the [SNGP tutorial](https://www.tensorflow.org/tutorials/uncertainty/sngp) for a detailed walkthrough of implementing SNGP from scratch. \n",
+        "* See [Uncertainty Baselines](https://github.com/google/uncertainty-baselines)  for the implementation of SNGP model (and many other uncertainty methods) on a wide variety of benchmark datasets (e.g., [CIFAR](https://www.tensorflow.org/datasets/catalog/cifar100), [ImageNet](https://www.tensorflow.org/datasets/catalog/imagenet2012), [Jigsaw toxicity detection](https://www.tensorflow.org/datasets/catalog/wikipedia_toxicity_subtypes), etc).\n",
+        "* For a deeper understanding of the SNGP method, check out the paper [Simple and Principled Uncertainty Estimation with Deterministic Deep Learning via Distance Awareness](https://arxiv.org/abs/2006.10108).\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "uncertainty_quantification_with_sngp_bert.ipynb",
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/official/common/__init__.py
+++ b/official/common/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
--- a/official/common/dataset_fn.py
+++ b/official/common/dataset_fn.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility library for picking an appropriate dataset function."""
+
+from typing import Any, Callable, Union, Type
+
+import tensorflow as tf
+
+PossibleDatasetType = Union[Type[tf.data.Dataset], Callable[[tf.Tensor], Any]]
+
+
+def pick_dataset_fn(file_type: str) -> PossibleDatasetType:
+  if file_type == 'tfrecord':
+    return tf.data.TFRecordDataset
+
+  raise ValueError('Unrecognized file_type: {}'.format(file_type))
--- a/official/common/distribute_utils.py
+++ b/official/common/distribute_utils.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper functions for running models in a distributed setting."""
+
+import json
+import os
+import tensorflow as tf
+
+
+def _collective_communication(all_reduce_alg):
+  """Return a CollectiveCommunication based on all_reduce_alg.
+
+  Args:
+    all_reduce_alg: a string specifying which collective communication to pick,
+      or None.
+
+  Returns:
+    tf.distribute.experimental.CollectiveCommunication object
+
+  Raises:
+    ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"]
+  """
+  collective_communication_options = {
+      None: tf.distribute.experimental.CollectiveCommunication.AUTO,
+      "ring": tf.distribute.experimental.CollectiveCommunication.RING,
+      "nccl": tf.distribute.experimental.CollectiveCommunication.NCCL
+  }
+  if all_reduce_alg not in collective_communication_options:
+    raise ValueError(
+        "When used with `multi_worker_mirrored`, valid values for "
+        "all_reduce_alg are [`ring`, `nccl`].  Supplied value: {}".format(
+            all_reduce_alg))
+  return collective_communication_options[all_reduce_alg]
+
+
+def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
+  """Build the CrossDeviceOps used by `MirroredStrategy`.
+
+  Args:
+    all_reduce_alg: name of the cross device op to use ("nccl" or
+      "hierarchical_copy"), or None to build nothing.
+    num_packs: integer forwarded as `num_packs` to the cross device op.
+
+  Returns:
+    A tf.distribute.NcclAllReduce or tf.distribute.HierarchicalCopyAllReduce
+    instance, or None when `all_reduce_alg` is None.
+
+  Raises:
+    ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"].
+  """
+  if all_reduce_alg is None:
+    return None
+  ops_by_name = {
+      "nccl": tf.distribute.NcclAllReduce,
+      "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce,
+  }
+  ops_cls = ops_by_name.get(all_reduce_alg)
+  if ops_cls is None:
+    raise ValueError(
+        "When used with `mirrored`, valid values for all_reduce_alg are "
+        "[`nccl`, `hierarchical_copy`].  Supplied value: {}".format(
+            all_reduce_alg))
+  return ops_cls(num_packs=num_packs)
+
+
+def tpu_initialize(tpu_address):
+  """Initializes the TPU system for TF 2.x training.
+
+  Args:
+    tpu_address: string, bns address of master TPU worker. The values "" and
+      "local" skip the explicit cluster connection step.
+
+  Returns:
+    The TPUClusterResolver created for `tpu_address`.
+  """
+  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
+  is_local = tpu_address in ("", "local")
+  if not is_local:
+    # Only a non-local address needs an explicit connection to the cluster.
+    tf.config.experimental_connect_to_cluster(resolver)
+  tf.tpu.experimental.initialize_tpu_system(resolver)
+  return resolver
+
+
+def get_distribution_strategy(distribution_strategy="mirrored",
+                              num_gpus=0,
+                              all_reduce_alg=None,
+                              num_packs=1,
+                              tpu_address=None,
+                              **kwargs):
+  """Return a DistributionStrategy for running the model.
+
+  Args:
+    distribution_strategy: a string specifying which distribution strategy to
+      use. Accepted values are "off", "one_device", "mirrored",
+      "parameter_server", "multi_worker_mirrored", and "tpu" -- case
+      insensitive. "off" means not to use Distribution Strategy; "tpu" means to
+      use TPUStrategy using `tpu_address`.
+    num_gpus: Number of GPUs to run this model.
+    all_reduce_alg: Optional. Specifies which algorithm to use when performing
+      all-reduce. For `MirroredStrategy`, valid values are "nccl" and
+      "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
+      "ring" and "nccl".  If None, DistributionStrategy will choose based on
+      device topology.
+    num_packs: Optional.  Sets the `num_packs` in `tf.distribute.NcclAllReduce`
+      or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
+    tpu_address: Optional. String that represents TPU to connect to; it is
+      passed to `tpu_initialize` when `distribution_strategy` is "tpu". An
+      empty string selects local TPUs.
+      NOTE(review): earlier docs said this must not be None for "tpu", but no
+      check in this function enforces that -- confirm the intended contract.
+    **kwargs: Additional kwargs for internal usages. They are discarded here.
+
+  Returns:
+    The constructed strategy object (for example a
+    `tf.distribute.MirroredStrategy`), or None when `distribution_strategy`
+    is "off".
+
+  Raises:
+    ValueError: if `num_gpus` is negative; if `distribution_strategy` is not a
+      string or is not one of the accepted values; if `distribution_strategy`
+      is "off" or "one_device" and `num_gpus` is larger than 1; or if
+      `all_reduce_alg` is not valid for the selected strategy.
+  """
+  del kwargs
+  if num_gpus < 0:
+    raise ValueError("`num_gpus` can not be negative.")
+
+  if not isinstance(distribution_strategy, str):
+    msg = ("distribution_strategy must be a string but got: %s." %
+           (distribution_strategy,))
+    # An unquoted `off` in yaml is parsed as the boolean False, so give a
+    # targeted hint for that case.
+    if distribution_strategy == False:  # pylint: disable=singleton-comparison,g-explicit-bool-comparison
+      msg += (" If you meant to pass the string 'off', make sure you add "
+              "quotes around 'off' so that yaml interprets it as a string "
+              "instead of a bool.")
+    raise ValueError(msg)
+
+  # Strategy names are matched case-insensitively.
+  distribution_strategy = distribution_strategy.lower()
+  if distribution_strategy == "off":
+    if num_gpus > 1:
+      raise ValueError("When {} GPUs are specified, distribution_strategy "
+                       "flag cannot be set to `off`.".format(num_gpus))
+    return None
+
+  if distribution_strategy == "tpu":
+    # When tpu_address is an empty string, we communicate with local TPUs.
+    cluster_resolver = tpu_initialize(tpu_address)
+    return tf.distribute.TPUStrategy(cluster_resolver)
+
+  if distribution_strategy == "multi_worker_mirrored":
+    return tf.distribute.experimental.MultiWorkerMirroredStrategy(
+        communication=_collective_communication(all_reduce_alg))
+
+  if distribution_strategy == "one_device":
+    if num_gpus == 0:
+      return tf.distribute.OneDeviceStrategy("device:CPU:0")
+    if num_gpus > 1:
+      raise ValueError("`OneDeviceStrategy` can not be used for more than "
+                       "one device.")
+    return tf.distribute.OneDeviceStrategy("device:GPU:0")
+
+  if distribution_strategy == "mirrored":
+    # With no GPUs the strategy mirrors onto the single CPU device.
+    if num_gpus == 0:
+      devices = ["device:CPU:0"]
+    else:
+      devices = ["device:GPU:%d" % i for i in range(num_gpus)]
+    return tf.distribute.MirroredStrategy(
+        devices=devices,
+        cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs))
+
+  if distribution_strategy == "parameter_server":
+    cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
+    return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver)
+
+  raise ValueError("Unrecognized Distribution Strategy: %r" %
+                   distribution_strategy)
+
+
+def configure_cluster(worker_hosts=None, task_index=-1):
+  """Set multi-worker cluster spec in TF_CONFIG environment variable.
+
+  An existing non-empty TF_CONFIG takes precedence and is left untouched;
+  otherwise TF_CONFIG is written from `worker_hosts` when that is provided.
+
+  Args:
+    worker_hosts: comma-separated list of worker ip:port pairs.
+    task_index: index of the worker. Ignored (treated as 0) when there is a
+      single worker.
+
+  Returns:
+    Number of workers in the cluster: chief plus worker entries of an
+    existing TF_CONFIG, the number of `worker_hosts` entries, or 1 when
+    neither is available.
+
+  Raises:
+    ValueError: if several worker hosts are given and `task_index` is negative.
+  """
+  existing = json.loads(os.environ.get("TF_CONFIG", "{}"))
+  if existing:
+    cluster = existing["cluster"]
+    return len(cluster.get("chief", [])) + len(cluster.get("worker", []))
+
+  if not worker_hosts:
+    return 1
+
+  hosts = worker_hosts.split(",")
+  if len(hosts) == 1:
+    index = 0
+  elif task_index < 0:
+    raise ValueError("Must specify task_index when number of workers > 1")
+  else:
+    index = task_index
+  os.environ["TF_CONFIG"] = json.dumps({
+      "cluster": {
+          "worker": hosts
+      },
+      "task": {
+          "type": "worker",
+          "index": index
+      }
+  })
+  return len(hosts)
+
+
+def get_strategy_scope(strategy):
+  """Return the context manager to use when building under `strategy`.
+
+  Args:
+    strategy: an object exposing `scope()`, or a falsy value (such as None)
+      when no distribution strategy is in use.
+
+  Returns:
+    `strategy.scope()` when `strategy` is truthy, otherwise a new
+    `DummyContextManager`.
+  """
+  return strategy.scope() if strategy else DummyContextManager()
+
+
+class DummyContextManager:
+  """Context manager that does nothing on enter or exit."""
+
+  def __enter__(self):
+    """Enters the context; yields None to the `as` target."""
+    return None
+
+  def __exit__(self, *exc_info):
+    """Exits the context without suppressing any in-flight exception."""
+    return None
--- a/official/common/distribute_utils_test.py
+++ b/official/common/distribute_utils_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for distribution util functions."""
+
+import tensorflow as tf
+
+from official.common import distribute_utils
+
+
+class GetDistributionStrategyTest(tf.test.TestCase):
+  """Tests for get_distribution_strategy."""
+
+  # NOTE: `assertEquals` and `assertRaisesRegexp` are deprecated unittest
+  # aliases that were removed in Python 3.12; the canonical names are used.
+
+  def test_one_device_strategy_cpu(self):
+    """The default strategy with no GPUs runs one replica on the CPU."""
+    ds = distribute_utils.get_distribution_strategy(num_gpus=0)
+    self.assertEqual(ds.num_replicas_in_sync, 1)
+    self.assertEqual(len(ds.extended.worker_devices), 1)
+    self.assertIn('CPU', ds.extended.worker_devices[0])
+
+  def test_one_device_strategy_gpu(self):
+    """The default strategy with one GPU runs one replica on that GPU."""
+    ds = distribute_utils.get_distribution_strategy(num_gpus=1)
+    self.assertEqual(ds.num_replicas_in_sync, 1)
+    self.assertEqual(len(ds.extended.worker_devices), 1)
+    self.assertIn('GPU', ds.extended.worker_devices[0])
+
+  def test_mirrored_strategy(self):
+    """The default strategy with five GPUs runs one replica per GPU."""
+    ds = distribute_utils.get_distribution_strategy(num_gpus=5)
+    self.assertEqual(ds.num_replicas_in_sync, 5)
+    self.assertEqual(len(ds.extended.worker_devices), 5)
+    for device in ds.extended.worker_devices:
+      self.assertIn('GPU', device)
+
+  def test_no_strategy(self):
+    """Passing 'off' yields no strategy at all."""
+    ds = distribute_utils.get_distribution_strategy('off')
+    self.assertIsNone(ds)
+
+  def test_invalid_strategy(self):
+    """Non-string strategy values are rejected with a descriptive error."""
+    with self.assertRaisesRegex(
+        ValueError,
+        'distribution_strategy must be a string but got: False. If'):
+      distribute_utils.get_distribution_strategy(False)
+    with self.assertRaisesRegex(
+        ValueError, 'distribution_strategy must be a string but got: 1'):
+      distribute_utils.get_distribution_strategy(1)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/common/flags.py
+++ b/official/common/flags.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The central place to define flags."""
+
+from absl import flags
+
+
+def define_flags():
+  """Defines the shared command-line flags.
+
+  Registers `experiment`, `mode`, `model_dir`, `config_file`,
+  `params_override`, `gin_file`, `gin_params`, `tpu` and `tf_data_service`
+  with absl. The two gin flags tolerate having already been defined
+  elsewhere; every other flag is defined unconditionally.
+  """
+  flags.DEFINE_string(
+      'experiment', default=None, help='The experiment type registered.')
+
+  flags.DEFINE_enum(
+      'mode',
+      default=None,
+      enum_values=[
+          'train', 'eval', 'train_and_eval', 'continuous_eval',
+          'continuous_train_and_eval', 'train_and_validate'
+      ],
+      help='Mode to run: `train`, `eval`, `train_and_eval`, '
+      '`continuous_eval`, `continuous_train_and_eval` and '
+      '`train_and_validate` (which is not implemented in '
+      'the open source version).')
+
+  # The trailing space after "summaries" matters: adjacent string literals
+  # are concatenated verbatim.
+  flags.DEFINE_string(
+      'model_dir',
+      default=None,
+      help='The directory where the model and training/evaluation summaries '
+      'are stored.')
+
+  flags.DEFINE_multi_string(
+      'config_file',
+      default=None,
+      help='YAML/JSON files which specifies overrides. The override order '
+      'follows the order of args. Note that each file '
+      'can be used as an override template to override the default parameters '
+      'specified in Python. If the same parameter is specified in both '
+      '`--config_file` and `--params_override`, `config_file` will be used '
+      'first, followed by params_override.')
+
+  flags.DEFINE_string(
+      'params_override',
+      default=None,
+      help='a YAML/JSON string or a YAML file which specifies additional '
+      'overrides over the default parameters and those specified in '
+      '`--config_file`. Note that this is supposed to be used only to override '
+      'the model parameters, but not the parameters like TPU specific flags. '
+      'One canonical use case of `--config_file` and `--params_override` is '
+      'users first define a template config file using `--config_file`, then '
+      'use `--params_override` to adjust the minimal set of tuning parameters, '
+      'for example setting up different `train_batch_size`. The final override '
+      'order of parameters: default_model_params --> params from config_file '
+      '--> params in params_override. See also the help message of '
+      '`--config_file`.')
+
+  # Libraries that rely on gin often define these flags inside their own
+  # library files, which causes conflicts; an existing definition is kept.
+  try:
+    flags.DEFINE_multi_string(
+        'gin_file', default=None, help='List of paths to the config files.')
+  except flags.DuplicateFlagError:
+    pass
+
+  try:
+    flags.DEFINE_multi_string(
+        'gin_params',
+        default=None,
+        help='Newline separated list of Gin parameter bindings.')
+  except flags.DuplicateFlagError:
+    pass
+
+  flags.DEFINE_string(
+      'tpu',
+      default=None,
+      help='The Cloud TPU to use for training. This should be either the name '
+      'used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 '
+      'url.')
+
+  flags.DEFINE_string(
+      'tf_data_service', default=None, help='The tf.data service address')
--- a/official/common/registry_imports.py
+++ b/official/common/registry_imports.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""All necessary imports for registration."""
+# pylint: disable=unused-import
+from official.nlp import tasks
+from official.nlp.configs import experiment_configs
+from official.utils.testing import mock_task
+from official.vision import beta
--- a/official/core/__init__.py
+++ b/official/core/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/official/core/base_task.py
+++ b/official/core/base_task.py
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,52 +11,81 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+
 """Defines the base task abstraction."""
 import abc
-import functools
-from typing import Any, Callable, Optional
+from typing import Optional

-import six
+from absl import logging
 import tensorflow as tf

-from official.modeling.hyperparams import config_definitions as cfg
-from official.utils import registry
+from official.core import config_definitions
+from official.modeling import optimization
+from official.modeling import performance
+
+OptimizationConfig = optimization.OptimizationConfig
+RuntimeConfig = config_definitions.RuntimeConfig


-@six.add_metaclass(abc.ABCMeta)
-class Task(tf.Module):
+class Task(tf.Module, metaclass=abc.ABCMeta):
  """A single-replica view of training procedure.

-  Tasks provide artifacts for training/evalution procedures, including
-  loading/iterating over Datasets, initializing the model, calculating the loss
-  and customized metrics with reduction.
+  Tasks provide artifacts for training/validation procedures, including
+  loading/iterating over Datasets, training/validation steps, calculating the
+  loss and customized metrics with reduction.
  """

  # Special keys in train/validate step returned logs.
  loss = "loss"

-  def __init__(self, params: cfg.TaskConfig, logging_dir: str = None):
+  def __init__(self, params, logging_dir: str = None, name: str = None):
    """Task initialization.

    Args:
-      params: cfg.TaskConfig instance.
+      params: the task configuration instance, which can be any of dataclass,
+        ConfigDict, namedtuple, etc.
      logging_dir: a string pointing to where the model, summaries etc. will be
        saved. You can also write additional stuff in this directory.
+      name: the task name.
    """
+    super().__init__(name=name)
    self._task_config = params
    self._logging_dir = logging_dir

  @property
-  def task_config(self) -> cfg.TaskConfig:
+  def task_config(self):
    return self._task_config

  @property
  def logging_dir(self) -> str:
    return self._logging_dir

+  @classmethod
+  def create_optimizer(cls, optimizer_config: OptimizationConfig,
+                       runtime_config: Optional[RuntimeConfig] = None):
+    """Creates a TF optimizer from configurations.
+
+    Args:
+      optimizer_config: the parameters of the Optimization settings.
+      runtime_config: the runtime parameters, or None to skip loss scaling.
+
+    Returns:
+      A tf.optimizers.Optimizer object.
+    """
+    opt_factory = optimization.OptimizerFactory(optimizer_config)
+    optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
+    # Configuring optimizer when loss_scale is set in runtime config. This helps
+    # avoid overflow/underflow for float16 computations.
+    if runtime_config and runtime_config.loss_scale:
+      optimizer = performance.configure_optimizer(
+          optimizer,
+          use_float16=runtime_config.mixed_precision_dtype == "float16",
+          loss_scale=runtime_config.loss_scale)
+
+    return optimizer
+
  def initialize(self, model: tf.keras.Model):
-    """A callback function used as CheckpointManager's init_fn.
+    """[Optional] A callback function used as CheckpointManager's init_fn.

    This function will be called when no checkpoint is found for the model.
    If there is a checkpoint, the checkpoint will be loaded and this function
@@ -67,54 +95,34 @@ class Task(tf.Module):
    Args:
      model: The keras.Model built or used by this task.
    """
-    pass
+    ckpt_dir_or_file = self.task_config.init_checkpoint
+    logging.info("Trying to load pretrained checkpoint from %s",
+                 ckpt_dir_or_file)
+    if tf.io.gfile.isdir(ckpt_dir_or_file):
+      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
+    if not ckpt_dir_or_file:
+      return
+
+    if hasattr(model, "checkpoint_items"):
+      checkpoint_items = model.checkpoint_items
+    else:
+      checkpoint_items = dict(model=model)
+    ckpt = tf.train.Checkpoint(**checkpoint_items)
+    status = ckpt.read(ckpt_dir_or_file)
+    status.expect_partial().assert_existing_objects_matched()
+    logging.info("Finished loading pretrained checkpoint from %s",
+                 ckpt_dir_or_file)

-  @abc.abstractmethod
  def build_model(self) -> tf.keras.Model:
-    """Creates model architecture.
+    """[Optional] Creates model architecture.

    Returns:
      A model instance.
    """

-  def compile_model(self,
-                    model: tf.keras.Model,
-                    optimizer: tf.keras.optimizers.Optimizer,
-                    loss=None,
-                    train_step: Optional[Callable[..., Any]] = None,
-                    validation_step: Optional[Callable[..., Any]] = None,
-                    **kwargs) -> tf.keras.Model:
-    """Compiles the model with objects created by the task.
-
-    The method should not be used in any customized training implementation.
-
-    Args:
-      model: a keras.Model.
-      optimizer: the keras optimizer.
-      loss: a callable/list of losses.
-      train_step: optional train step function defined by the task.
-      validation_step: optional validation_step step function defined by the
-        task.
-      **kwargs: other kwargs consumed by keras.Model compile().
-
-    Returns:
-      a compiled keras.Model.
-    """
-    if bool(loss is None) == bool(train_step is None):
-      raise ValueError("`loss` and `train_step` should be exclusive to "
-                       "each other.")
-    model.compile(optimizer=optimizer, loss=loss, **kwargs)
-
-    if train_step:
-      model.train_step = functools.partial(
-          train_step, model=model, optimizer=model.optimizer)
-    if validation_step:
-      model.test_step = functools.partial(validation_step, model=model)
-    return model
-
  @abc.abstractmethod
  def build_inputs(self,
-                   params: cfg.DataConfig,
+                   params,
                   input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a dataset or a nested structure of dataset functions.

@@ -122,7 +130,8 @@ class Task(tf.Module):
    With distributed training, this method runs on remote hosts.

    Args:
-      params: hyperparams to create input pipelines.
+      params: hyperparams to create input pipelines, which can be any of
+        dataclass, ConfigDict, namedtuple, etc.
      input_context: optional distribution input pipeline context.

    Returns:
@@ -155,26 +164,30 @@ class Task(tf.Module):
    return []

  def process_metrics(self, metrics, labels, model_outputs):
-    """Process and update metrics. Called when using custom training loop API.
+    """Process and update metrics.
+
+    Called when using custom training loop API.

    Args:
-      metrics: a nested structure of metrics objects.
-        The return of function self.build_metrics.
+      metrics: a nested structure of metrics objects. The return of function
+        self.build_metrics.
      labels: a tensor or a nested structure of tensors.
-      model_outputs: a tensor or a nested structure of tensors.
-        For example, output of the keras model built by self.build_model.
+      model_outputs: a tensor or a nested structure of tensors. For example,
+        output of the keras model built by self.build_model.
    """
    for metric in metrics:
      metric.update_state(labels, model_outputs)

  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    """Process and update compiled_metrics. call when using compile/fit API.
+    """Process and update compiled_metrics.
+
+    call when using compile/fit API.

    Args:
      compiled_metrics: the compiled metrics (model.compiled_metrics).
      labels: a tensor or a nested structure of tensors.
-      model_outputs: a tensor or a nested structure of tensors.
-        For example, output of the keras model built by self.build_model.
+      model_outputs: a tensor or a nested structure of tensors. For example,
+        output of the keras model built by self.build_model.
    """
    compiled_metrics.update_state(labels, model_outputs)

@@ -203,8 +216,14 @@ class Task(tf.Module):
    with tf.GradientTape() as tape:
      outputs = model(features, training=True)
      # Computes per-replica loss.
-      loss = self.build_losses(
-          labels=labels, model_outputs=outputs, aux_losses=model.losses)
+      if model.compiled_loss:
+        loss = model.compiled_loss(
+            labels, outputs, regularization_losses=model.losses)
+        loss += self.build_losses(
+            labels=labels, model_outputs=outputs, aux_losses=None)
+      else:
+        loss = self.build_losses(
+            labels=labels, model_outputs=outputs, aux_losses=model.losses)
      # Scales loss as the default gradients allreduce performs sum inside the
      # optimizer.
      scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync
@@ -212,22 +231,22 @@ class Task(tf.Module):
      # For mixed precision, when a LossScaleOptimizer is used, the loss is
      # scaled to avoid numeric underflow.
      if isinstance(optimizer,
-                    tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+                    tf.keras.mixed_precision.LossScaleOptimizer):
        scaled_loss = optimizer.get_scaled_loss(scaled_loss)

    tvars = model.trainable_variables
    grads = tape.gradient(scaled_loss, tvars)

    if isinstance(optimizer,
-                  tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+                  tf.keras.mixed_precision.LossScaleOptimizer):
      grads = optimizer.get_unscaled_gradients(grads)
    optimizer.apply_gradients(list(zip(grads, tvars)))
    logs = {self.loss: loss}
    if metrics:
      self.process_metrics(metrics, labels, outputs)
-      logs.update({m.name: m.result() for m in metrics})
-    elif model.compiled_metrics:
+    if model.compiled_metrics:
      self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
+      logs.update({m.name: m.result() for m in metrics or []})
      logs.update({m.name: m.result() for m in model.metrics})
    return logs

@@ -254,9 +273,9 @@ class Task(tf.Module):
    logs = {self.loss: loss}
    if metrics:
      self.process_metrics(metrics, labels, outputs)
-      logs.update({m.name: m.result() for m in metrics})
-    elif model.compiled_metrics:
+    if model.compiled_metrics:
      self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
+      logs.update({m.name: m.result() for m in metrics or []})
      logs.update({m.name: m.result() for m in model.metrics})
    return logs

@@ -278,53 +297,8 @@ class Task(tf.Module):
    """Optional aggregation over logs returned from a validation step."""
    pass

-  def reduce_aggregated_logs(self, aggregated_logs):
+  def reduce_aggregated_logs(self,
+                             aggregated_logs,
+                             global_step: Optional[tf.Tensor] = None):
    """Optional reduce of aggregated logs over validation steps."""
    return {}
-
-
-_REGISTERED_TASK_CLS = {}
-
-
-# TODO(b/158268740): Move these outside the base class file.
-# TODO(b/158741360): Add type annotations once pytype checks across modules.
-def register_task_cls(task_config_cls):
-  """Decorates a factory of Tasks for lookup by a subclass of TaskConfig.
-
-  This decorator supports registration of tasks as follows:
-
-  ```
-  @dataclasses.dataclass
-  class MyTaskConfig(TaskConfig):
-    # Add fields here.
-    pass
-
-  @register_task_cls(MyTaskConfig)
-  class MyTask(Task):
-    # Inherits def __init__(self, task_config).
-    pass
-
-  my_task_config = MyTaskConfig()
-  my_task = get_task(my_task_config)  # Returns MyTask(my_task_config).
-  ```
-
-  Besisdes a class itself, other callables that create a Task from a TaskConfig
-  can be decorated by the result of this function, as long as there is at most
-  one registration for each config class.
-
-  Args:
-    task_config_cls: a subclass of TaskConfig (*not* an instance of TaskConfig).
-      Each task_config_cls can only be used for a single registration.
-
-  Returns:
-    A callable for use as class decorator that registers the decorated class
-    for creation from an instance of task_config_cls.
-  """
-  return registry.register(_REGISTERED_TASK_CLS, task_config_cls)
-
-
-# The user-visible get_task() is defined after classes have been registered.
-# TODO(b/158741360): Add type annotations once pytype checks across modules.
-def get_task_cls(task_config_cls):
-  task_cls = registry.lookup(_REGISTERED_TASK_CLS, task_config_cls)
-  return task_cls
--- a/official/core/base_trainer.py
+++ b/official/core/base_trainer.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Standard Trainer implementation.
+
+The base trainer implements the Orbit `StandardTrainable` and
+`StandardEvaluable` interfaces. Trainers inside this project should be
+interchangeable and independent of model architectures and tasks.
+"""
+import functools
+from absl import logging
+import gin
+import orbit
+import tensorflow as tf
+
+from official.core import base_task
+from official.core import config_definitions
+from official.modeling import optimization
+
+
+ExperimentConfig = config_definitions.ExperimentConfig
+TrainerConfig = config_definitions.TrainerConfig
+
+
+class Recovery:
+  """Built-in model blowup recovery module.
+
+  Checks the loss value against the given threshold. If applicable, recovers
+  the model by reading the checkpoint on disk.
+  """
+
+  def __init__(self,
+               loss_upper_bound: float,
+               checkpoint_manager: tf.train.CheckpointManager,
+               recovery_begin_steps: int = 0,
+               recovery_max_trials: int = 3):
+    """Initializes the recovery state.
+
+    Args:
+      loss_upper_bound: a loss strictly above this value triggers recovery
+        once `recovery_begin_steps` has been reached.
+      checkpoint_manager: the manager used to restore the model.
+      recovery_begin_steps: the global step from which `loss_upper_bound`
+        starts to be enforced. NaN losses trigger recovery at any step.
+      recovery_max_trials: the number of recoveries allowed before
+        `maybe_recover` raises.
+    """
+    self.recover_counter = 0
+    self.recovery_begin_steps = recovery_begin_steps
+    self.recovery_max_trials = recovery_max_trials
+    self.loss_upper_bound = loss_upper_bound
+    self.checkpoint_manager = checkpoint_manager
+
+  def should_recover(self, loss_value, global_step):
+    """Returns whether the loss is NaN or, past the begin step, out of range.
+
+    Args:
+      loss_value: the loss value as a float.
+      global_step: the number of global training steps.
+
+    Returns:
+      True when `loss_value` is NaN, or when `global_step` has reached
+      `recovery_begin_steps` and `loss_value` exceeds `loss_upper_bound`;
+      False otherwise.
+    """
+    if tf.math.is_nan(loss_value):
+      return True
+    if (global_step >= self.recovery_begin_steps and
+        loss_value > self.loss_upper_bound):
+      return True
+    return False
+
+  def maybe_recover(self, loss_value, global_step):
+    """Conditionally recovers the training by triggering checkpoint restoration.
+
+    Args:
+      loss_value: the loss value as a float.
+      global_step: the number of global training steps.
+
+    Raises:
+      RuntimeError: when recovery happens more than the max number of trials,
+      the job should crash.
+    """
+    if not self.should_recover(loss_value, global_step):
+      return
+    self.recover_counter += 1
+    if self.recover_counter > self.recovery_max_trials:
+      # Recovery is triggered by a NaN loss *or* by a loss above the upper
+      # bound, so the message must cover both causes.
+      raise RuntimeError(
+          "The loss value is NaN or out of range after training loop and it "
+          "happens %d times." % self.recover_counter)
+    # Loads the previous good checkpoint.
+    checkpoint_path = self.checkpoint_manager.restore_or_initialize()
+    logging.warning(
+        "Recovering the model from checkpoint: %s. The loss value becomes "
+        "%f at step %d.", checkpoint_path, loss_value, global_step)
+
+
+class _AsyncTrainer(orbit.StandardTrainer, orbit.StandardEvaluator):
+  """Trainer class for both sync and async Strategy.
+
+  Every method checks `_is_async` with `getattr(..., False)`, so the class
+  falls back to the synchronous code path when `init_async` was never called.
+  """
+
+  def init_async(self):
+    """Initializes the Async Trainer base class.
+
+    Sets `_is_async` according to whether `self._strategy` is a
+    ParameterServerStrategy, and creates a ClusterCoordinator in that case.
+    """
+    assert isinstance(self._strategy, tf.distribute.Strategy)
+    self._is_async = isinstance(
+        self._strategy, tf.distribute.experimental.ParameterServerStrategy)
+    self._coordinator = None
+    if self._is_async:
+      self._coordinator = (
+          tf.distribute.experimental.coordinator.ClusterCoordinator(
+              self._strategy))
+
+  def join(self):
+    """Join all async steps. Only useful in async training."""
+    if getattr(self, "_is_async", False):
+      self._coordinator.join()
+
+  def create_train_loop_fn(self):
+    """Creates a training loop from the given step function and options.
+
+    Returns:
+      The parent class's train loop function, or, in async mode, a wrapper
+      that schedules that function on the coordinator and returns nothing.
+    """
+    train_loop_fn = super().create_train_loop_fn()
+    if getattr(self, "_is_async", False):
+
+      def _async_loop_fn(iterator, num_steps):
+        self._coordinator.schedule(train_loop_fn, args=(iterator, num_steps))
+
+      return _async_loop_fn
+    else:
+      return train_loop_fn
+
+  def create_eval_loop_fn(self, has_state: bool):
+    """Creates an eval loop from the given step function and options.
+
+    Args:
+      has_state: whether the eval loop carries state; forwarded to the parent
+        class and rejected in async mode.
+
+    Returns:
+      The parent class's eval loop function, or, in async mode, a wrapper
+      that schedules that function on the coordinator and returns nothing.
+
+    Raises:
+      ValueError: if `has_state` is true in async mode.
+    """
+    eval_loop_fn = super().create_eval_loop_fn(has_state)
+
+    if getattr(self, "_is_async", False):
+      if has_state:
+        raise ValueError(
+            "Stateful eval loop is not supported in async training.")
+
+      def _async_loop_fn(iterator, num_steps, state=None, reduce_fn=None):
+        # The stateless async loop accepts these arguments only for signature
+        # compatibility; both must be left unset.
+        assert state is None
+        assert reduce_fn is None
+        self._coordinator.schedule(eval_loop_fn, args=(iterator, num_steps))
+
+      return _async_loop_fn
+    else:
+      return eval_loop_fn
+
+  def distribute_dataset(self, dataset_or_fn, *args, **kwargs):
+    """A utility function to help create a `tf.distribute.DistributedDataset`.
+
+    Args:
+      dataset_or_fn: A instance of `tf.data.Dataset`, or a "dataset function"
+        returning a `tf.data.Dataset`. If it is a function, it may optionally
+        have an argument named `input_context` which will be passed a
+        `tf.distribute.InputContext` instance.
+      *args: Any positional arguments to pass through to `dataset_or_fn`.
+      **kwargs: Any keyword arguments to pass through to `dataset_or_fn`.
+    Returns:
+      A distributed Dataset.
+    """
+    if getattr(self, "_is_async", False):
+      # In async mode the dataset is built through the coordinator from a
+      # tf.function-wrapped factory.
+      per_worker_dataset_fn = functools.partial(
+          orbit.utils.make_distributed_dataset, self._strategy, dataset_or_fn,
+          *args, **kwargs)
+      per_worker_dataset_fn = tf.function(per_worker_dataset_fn)
+
+      return self._coordinator.create_per_worker_dataset(per_worker_dataset_fn)
+    else:
+      return orbit.utils.make_distributed_dataset(self._strategy, dataset_or_fn,
+                                                  *args, **kwargs)
+
+
+def get_runtime_options(config: ExperimentConfig):
+  """Get tf.distribute.RunOptions from config.
+
+  Args:
+    config: An `ExperimentConfig`; only
+      `config.runtime.tpu_enable_xla_dynamic_padder` is read.
+
+  Returns:
+    A `tf.distribute.RunOptions` whose `experimental_xla_options` is a
+    `tf.tpu.XLAOptions`. `enable_xla_dynamic_padder` is passed only when the
+    config value is not None, so the XLA default applies otherwise.
+  """
+  xla_options = {}
+  if config.runtime.tpu_enable_xla_dynamic_padder is not None:
+    xla_options["enable_xla_dynamic_padder"] = (
+        config.runtime.tpu_enable_xla_dynamic_padder)
+  return tf.distribute.RunOptions(
+      experimental_xla_options=tf.tpu.XLAOptions(**xla_options))
+
+
+@gin.configurable
+class Trainer(_AsyncTrainer):
+  """Implements the common trainer shared for TensorFlow models."""
+
+  # pylint: disable=super-init-not-called
+  def __init__(self,
+               config: ExperimentConfig,
+               task: base_task.Task,
+               model: tf.keras.Model,
+               optimizer: tf.optimizers.Optimizer,
+               train: bool = True,
+               evaluate: bool = True,
+               checkpoint_exporter=None):
+    """Initialize common trainer for TensorFlow models.
+
+    The orbit base-class initializers are not called through `super()`;
+    `StandardTrainer.__init__` and `StandardEvaluator.__init__` are invoked
+    explicitly, and only when `train` / `evaluate` are set.
+
+    Args:
+      config: An `ExperimentConfig` instance specifying experiment config.
+      task: A base_task.Task instance.
+      model: The model instance, e.g. a tf.keras.Model instance.
+      optimizer: tf.optimizers.Optimizer instance.
+      train: bool, whether or not this trainer will be used for training.
+        default to True.
+      evaluate: bool, whether or not this trainer will be used for evaluation.
+        default to True.
+      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
+        interface.
+
+    Raises:
+      AttributeError: If `config` lacks one of the attributes checked by
+        `_validate_params`.
+    """
+    # Gets the current distribution strategy. If not inside any strategy scope,
+    # it gets a single-replica no-op strategy.
+    self._strategy = tf.distribute.get_strategy()
+    self._validate_params(config)
+    self._config = config
+    self._task = task
+    self._model = model
+    self._optimizer = optimizer
+    self._checkpoint_exporter = checkpoint_exporter
+    # Recovery stays disabled until `add_recovery` is called.
+    self._recovery = None
+    # Runtime options are only applied to train_step.
+    # We use default for eval_step.
+    self._runtime_options = get_runtime_options(config)
+
+    # Creates a shadow copy of the weights to store weights moving average.
+    if isinstance(self._optimizer, optimization.ExponentialMovingAverage):
+      self._optimizer.shadow_copy(self._model)
+
+    # global_step increases by 1 after each training iteration.
+    # We should have global_step.numpy() == self.optimizer.iterations.numpy()
+    # when there is only 1 optimizer.
+    self._global_step = orbit.utils.create_global_step()
+    # A model may expose extra objects to checkpoint via `checkpoint_items`.
+    if hasattr(self.model, "checkpoint_items"):
+      checkpoint_items = self.model.checkpoint_items
+    else:
+      checkpoint_items = {}
+    self._checkpoint = tf.train.Checkpoint(
+        global_step=self.global_step,
+        model=self.model,
+        optimizer=self.optimizer,
+        **checkpoint_items)
+
+    self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
+    self._validation_loss = tf.keras.metrics.Mean(
+        "validation_loss", dtype=tf.float32)
+    # Metric lists combine the task-built metrics with the model's own metrics.
+    self._train_metrics = self.task.build_metrics(
+        training=True) + self.model.metrics
+    self._validation_metrics = self.task.build_metrics(
+        training=False) + self.model.metrics
+
+    # Must run before `distribute_dataset` below, which uses the coordinator
+    # created here when the strategy is a ParameterServerStrategy.
+    self.init_async()
+
+    if train:
+      train_dataset = self.distribute_dataset(
+          self.task.build_inputs, self.config.task.train_data)
+      orbit.StandardTrainer.__init__(
+          self,
+          train_dataset,
+          options=orbit.StandardTrainerOptions(
+              use_tf_while_loop=config.trainer.train_tf_while_loop,
+              use_tf_function=config.trainer.train_tf_function,
+              use_tpu_summary_optimization=config.trainer.allow_tpu_summary))
+
+    if evaluate:
+      eval_dataset = self.distribute_dataset(
+          self.task.build_inputs, self.config.task.validation_data)
+      orbit.StandardEvaluator.__init__(
+          self,
+          eval_dataset,
+          options=orbit.StandardEvaluatorOptions(
+              use_tf_function=config.trainer.eval_tf_function,
+              use_tf_while_loop=config.trainer.eval_tf_while_loop))
+
+  def _validate_params(self, config):
+    r"""Validates the structure of the configuration passed to the Trainer.
+
+    The experiment configuration should be structured as:
+    \trainer
+    \task
+      \train_data
+      \validation_data
+
+    Only the presence of these attributes is checked, not their values.
+
+    Args:
+      config: a namedtuple, dataclass, ConfigDict, etc.
+
+    Raises:
+      AttributeError: If `trainer`, `task`, `task.train_data` or
+        `task.validation_data` is missing from `config`.
+    """
+    if not hasattr(config, "trainer"):
+      raise AttributeError("The trainer requires the configuration contains an"
+                           " attribute `trainer`.")
+
+    if not hasattr(config, "task"):
+      raise AttributeError("The trainer requires the configuration contains an"
+                           " attribute `task`.")
+
+    if not hasattr(config.task, "train_data"):
+      raise AttributeError("The trainer requires the configuration contains an"
+                           " attribute `task.train_data`.")
+
+    if not hasattr(config.task, "validation_data"):
+      raise AttributeError("The trainer requires the configuration contains an"
+                           " attribute `task.validation_data`.")
+
+  @property
+  def strategy(self):
+    """Accesses the distribution strategy stored in `_strategy`."""
+    return self._strategy
+
+  @property
+  def config(self):
+    """Accesses the experiment config stored in `_config`."""
+    return self._config
+
+  @property
+  def task(self):
+    """Accesses the task object stored in `_task`."""
+    return self._task
+
+  @property
+  def model(self):
+    """Accesses the model object stored in `_model`."""
+    return self._model
+
+  @property
+  def optimizer(self):
+    """Accesses the optimizer, or None if `_optimizer` was never assigned."""
+    if hasattr(self, "_optimizer"):
+      return self._optimizer
+    else:
+      return None
+
+  @property
+  def global_step(self):
+    """Accesses the global step variable stored in `_global_step`."""
+    return self._global_step
+
+  @property
+  def train_loss(self):
+    """Accesses the training loss metric object."""
+    return self._train_loss
+
+  @property
+  def validation_loss(self):
+    """Accesses the validation loss metric object."""
+    return self._validation_loss
+
+  @property
+  def train_metrics(self):
+    """Accesses all training metric objects."""
+    return self._train_metrics
+
+  @property
+  def validation_metrics(self):
+    """Accesses all validation metric objects."""
+    return self._validation_metrics
+
+  def initialize(self):
+    """A callback function.
+
+    This function will be called when no checkpoint found for the model.
+    If there is a checkpoint, the checkpoint will be loaded and this function
+    will not be called. Tasks may use this callback function to load a
+    pretrained checkpoint, saved under a directory other than the model_dir.
+    """
+    self.task.initialize(self.model)
+
+  @property
+  def checkpoint(self):
+    """Accesses the training checkpoint."""
+    return self._checkpoint
+
+  def add_recovery(self, params: TrainerConfig,
+                   checkpoint_manager: tf.train.CheckpointManager):
+    """Attaches a `Recovery` object used by `train_loop_end`.
+
+    Nothing is attached when `params.recovery_max_trials` is negative.
+
+    Args:
+      params: A `TrainerConfig`; `loss_upper_bound`, `recovery_begin_steps` and
+        `recovery_max_trials` are read from it.
+      checkpoint_manager: The `tf.train.CheckpointManager` handed to
+        `Recovery`.
+    """
+    if params.recovery_max_trials >= 0:
+      self._recovery = Recovery(
+          loss_upper_bound=params.loss_upper_bound,
+          recovery_begin_steps=params.recovery_begin_steps,
+          recovery_max_trials=params.recovery_max_trials,
+          checkpoint_manager=checkpoint_manager)
+
+  def train_loop_end(self):
+    """See base class.
+
+    Returns:
+      A dict mapping each training metric name (including the training loss)
+      to its result, plus a "learning_rate" entry. All of these metrics are
+      reset after being read.
+    """
+    # Waits for scheduled async steps (no-op in sync mode) before reading
+    # metric values.
+    self.join()
+    # Checks if the model numeric status is stable and conducts the checkpoint
+    # recovery accordingly.
+    if self._recovery:
+      self._recovery.maybe_recover(self.train_loss.result().numpy(),
+                                   self.global_step.numpy())
+    logs = {}
+    for metric in self.train_metrics + [self.train_loss]:
+      logs[metric.name] = metric.result()
+      metric.reset_states()
+    # A callable learning rate (e.g. a schedule) is evaluated at the current
+    # global step; otherwise the stored value is logged as is.
+    if callable(self.optimizer.learning_rate):
+      logs["learning_rate"] = self.optimizer.learning_rate(self.global_step)
+    else:
+      logs["learning_rate"] = self.optimizer.learning_rate
+    return logs
+
+  def train_step(self, iterator):
+    """See base class.
+
+    Runs one task train step on the next element of `iterator` under the
+    strategy, using the runtime options built in `__init__`.
+
+    Args:
+      iterator: An iterator over the distributed training dataset.
+    """
+
+    def step_fn(inputs):
+      # The task step is XLA-compiled only when XLA is enabled and at least
+      # one GPU is configured.
+      if self.config.runtime.enable_xla and (self.config.runtime.num_gpus > 0):
+        task_train_step = tf.function(self.task.train_step, jit_compile=True)
+      else:
+        task_train_step = self.task.train_step
+      logs = task_train_step(
+          inputs,
+          model=self.model,
+          optimizer=self.optimizer,
+          metrics=self.train_metrics)
+      # `self.task.loss` is the key under which the task reports its loss.
+      self._train_loss.update_state(logs[self.task.loss])
+      self.global_step.assign_add(1)
+
+    self.strategy.run(
+        step_fn, args=(next(iterator),), options=self._runtime_options)
+
+  def eval_begin(self):
+    """Sets up metrics.
+
+    Resets every validation metric and the validation loss, then swaps in the
+    moving-average weights when the optimizer is an
+    `ExponentialMovingAverage`.
+    """
+    for metric in self.validation_metrics + [self.validation_loss]:
+      metric.reset_states()
+    # Swaps weights to test on weights moving average.
+    if self.optimizer and isinstance(
+        self.optimizer, optimization.ExponentialMovingAverage):
+      self.optimizer.swap_weights()
+
+  def eval_step(self, iterator):
+    """See base class.
+
+    Args:
+      iterator: An iterator over the distributed evaluation dataset.
+
+    Returns:
+      The logs from the task's `validation_step`, with every leaf replaced by
+      `strategy.experimental_local_results` of that leaf.
+    """
+
+    def step_fn(inputs):
+      logs = self.task.validation_step(
+          inputs, model=self.model, metrics=self.validation_metrics)
+      # The task may omit the loss; the validation loss metric is then left
+      # untouched.
+      if self.task.loss in logs:
+        self._validation_loss.update_state(logs[self.task.loss])
+      return logs
+
+    distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),))
+    return tf.nest.map_structure(self.strategy.experimental_local_results,
+                                 distributed_outputs)
+
+  def eval_end(self, aggregated_logs=None):
+    """Processes evaluation results.
+
+    Args:
+      aggregated_logs: Optional aggregated step outputs. When truthy they are
+        passed to the task's `reduce_aggregated_logs` and the result is merged
+        into the returned logs.
+
+    Returns:
+      A dict of validation metric results. It includes the validation loss
+      only if that metric was updated, and a "best_<metric>" entry when a
+      checkpoint exporter is set.
+    """
+    # Waits for scheduled async steps (no-op in sync mode) before reading
+    # metric values.
+    self.join()
+    logs = {}
+    for metric in self.validation_metrics:
+      logs[metric.name] = metric.result()
+    if self.validation_loss.count.numpy() != 0:
+      logs[self.validation_loss.name] = self.validation_loss.result()
+    else:
+      # `self.validation_loss` metric was not updated, because the validation
+      # loss was not returned from the task's `validation_step` method.
+      logging.info("The task did not report validation loss.")
+    if aggregated_logs:
+      metrics = self.task.reduce_aggregated_logs(
+          aggregated_logs, global_step=self.global_step)
+      logs.update(metrics)
+
+    if self._checkpoint_exporter:
+      self._checkpoint_exporter.maybe_export_checkpoint(
+          self.checkpoint, logs, self.global_step.numpy())
+      # Reports the exporter's recorded best value of the tracked metric.
+      metric_name = self.config.trainer.best_checkpoint_eval_metric
+      logs["best_" +
+           metric_name] = self._checkpoint_exporter.best_ckpt_logs[metric_name]
+
+    # Swaps back weights after testing when EMA is used.
+    # This happens after best checkpoint export so that average weights used for
+    # eval are exported instead of regular weights.
+    if self.optimizer and isinstance(
+        self.optimizer, optimization.ExponentialMovingAverage):
+      self.optimizer.swap_weights()
+    return logs
+
+  def eval_reduce(self, state=None, step_outputs=None):
+    """Delegates aggregation of eval step outputs to the task.
+
+    Args:
+      state: The aggregation state so far, forwarded to `task.aggregate_logs`.
+      step_outputs: The outputs of one eval step, forwarded likewise.
+
+    Returns:
+      Whatever `task.aggregate_logs(state, step_outputs)` returns.
+    """
+    return self.task.aggregate_logs(state, step_outputs)
--- a/official/core/base_trainer_test.py
+++ b/official/core/base_trainer_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for official.core.base_trainer."""
+# pylint: disable=g-direct-tensorflow-import
+import multiprocessing
+import os
+import sys
+
+from absl.testing import parameterized
+import numpy as np
+import portpicker
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.core import base_trainer as trainer_lib
+from official.core import config_definitions as cfg
+from official.core import train_lib
+from official.utils.testing import mock_task
+
+TPU_TEST = 'test_tpu' in sys.argv[0]
+GPU_TEST = 'test_gpu' in sys.argv[0]
+
+
+def all_strategy_combinations():
+  """Returns test combinations over default, cloud TPU and one-GPU strategies."""
+  return combinations.combine(
+      distribution=[
+          strategy_combinations.default_strategy,
+          strategy_combinations.cloud_tpu_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+      ],)
+
+
+def create_in_process_cluster(num_workers, num_ps):
+  """Creates and starts local servers and returns the cluster_resolver.
+
+  Args:
+    num_workers: Number of 'worker' tasks to create on localhost.
+    num_ps: Number of 'ps' tasks to create on localhost; the 'ps' job is left
+      out of the cluster spec when this is 0.
+
+  Returns:
+    A `SimpleClusterResolver` over the created cluster spec, using the 'grpc'
+    rpc layer.
+  """
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+
+  cluster_dict = {}
+  cluster_dict['worker'] = ['localhost:%s' % port for port in worker_ports]
+  if num_ps > 0:
+    cluster_dict['ps'] = ['localhost:%s' % port for port in ps_ports]
+
+  cluster_spec = tf.train.ClusterSpec(cluster_dict)
+
+  # Workers need some inter_ops threads to work properly.
+  worker_config = tf.compat.v1.ConfigProto()
+  if multiprocessing.cpu_count() < num_workers + 1:
+    worker_config.inter_op_parallelism_threads = num_workers + 1
+
+  # The Server objects are not stored anywhere.
+  # NOTE(review): presumably each server starts on construction and stays
+  # alive for the rest of the process - confirm against tf.distribute.Server.
+  for i in range(num_workers):
+    tf.distribute.Server(
+        cluster_spec,
+        job_name='worker',
+        task_index=i,
+        config=worker_config,
+        protocol='grpc')
+
+  for i in range(num_ps):
+    tf.distribute.Server(
+        cluster_spec, job_name='ps', task_index=i, protocol='grpc')
+
+  cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
+      cluster_spec, rpc_layer='grpc')
+  return cluster_resolver
+
+
+def dataset_fn(input_context=None):
+  """Returns a repeating dataset whose elements are (1, 1) float32 zeros.
+
+  Args:
+    input_context: Accepted for interface compatibility and ignored.
+  """
+  del input_context
+
+  def dummy_data(_):
+    return tf.zeros((1, 1), dtype=tf.float32)
+
+  dataset = tf.data.Dataset.range(1)
+  dataset = dataset.repeat()
+  dataset = dataset.map(
+      dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  return dataset
+
+
+class MockAsyncTrainer(trainer_lib._AsyncTrainer):
+  """Mock AsyncTrainer to test the _AsyncTrainer class.
+
+  Each train/eval step only increments a counter variable, so the value
+  returned at the end of a loop is the number of steps that ran.
+  """
+
+  def __init__(self):
+    """Sets up the strategy, step counters and train/eval datasets."""
+    self._strategy = tf.distribute.get_strategy()
+    self.init_async()
+
+    self.global_step = tf.Variable(
+        0,
+        dtype=tf.int64,
+        name='global_step',
+        trainable=False,
+        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
+    self.eval_global_step = tf.Variable(
+        0,
+        dtype=tf.int64,
+        name='eval_global_step',
+        trainable=False,
+        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
+
+    train_dataset = self.distribute_dataset(dataset_fn)
+    trainer_lib.orbit.StandardTrainer.__init__(
+        self, train_dataset, options=trainer_lib.orbit.StandardTrainerOptions())
+
+    eval_dataset = self.distribute_dataset(dataset_fn)
+    trainer_lib.orbit.StandardEvaluator.__init__(
+        self,
+        eval_dataset,
+        options=trainer_lib.orbit.StandardEvaluatorOptions(
+            use_tf_while_loop=True))
+
+  def train_loop_begin(self):
+    """Resets the train step counter to 0."""
+    self.global_step.assign(0)
+
+  def train_step(self, iterator):
+    """Consumes one element and increments the train step counter."""
+
+    def replica_step(_):
+      self.global_step.assign_add(1)
+
+    self._strategy.run(replica_step, args=(next(iterator),))
+
+  def train_loop_end(self):
+    """Joins pending async steps and returns the train step count."""
+    self.join()
+    return self.global_step.numpy()
+
+  def eval_begin(self):
+    """Resets the eval step counter to 0."""
+    self.eval_global_step.assign(0)
+
+  def eval_step(self, iterator):
+    """Consumes one element and increments the eval step counter."""
+
+    def replica_step(_):
+      self.eval_global_step.assign_add(1)
+
+    self._strategy.run(replica_step, args=(next(iterator),))
+
+  def eval_end(self):
+    """Joins pending async steps and returns the eval step count."""
+    self.join()
+    return self.eval_global_step.numpy()
+
+
+class TrainerTest(tf.test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    """Builds the default config: SGD optimizer with a constant learning rate."""
+    super().setUp()
+    self._config = cfg.ExperimentConfig(
+        trainer=cfg.TrainerConfig(
+            optimizer_config=cfg.OptimizationConfig({
+                'optimizer': {
+                    'type': 'sgd'
+                },
+                'learning_rate': {
+                    'type': 'constant'
+                }
+            })))
+
+  def create_test_trainer(self, config, model_dir=None, task=None):
+    """Builds a `Trainer` for `config`.
+
+    Args:
+      config: The `ExperimentConfig` to build the trainer from.
+      model_dir: Optional directory, used as the mock task's logging dir and
+        passed to the best-checkpoint exporter factory.
+      task: Optional task; a `MockTask` is created when omitted.
+
+    Returns:
+      A `trainer_lib.Trainer` with the task's model and optimizer.
+    """
+    task = task or mock_task.MockTask(config.task, logging_dir=model_dir)
+    ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
+    trainer = trainer_lib.Trainer(
+        config,
+        task,
+        model=task.build_model(),
+        optimizer=task.create_optimizer(config.trainer.optimizer_config,
+                                        config.runtime),
+        checkpoint_exporter=ckpt_exporter)
+    return trainer
+
+  @combinations.generate(all_strategy_combinations())
+  def test_trainer_train(self, distribution):
+    """Trains 5 steps and checks the loss and learning rate are logged."""
+    with distribution.scope():
+      trainer = self.create_test_trainer(self._config)
+      logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
+      self.assertIn('training_loss', logs)
+      self.assertIn('learning_rate', logs)
+
+  def test_base_async_trainer(self):
+    """Runs `MockAsyncTrainer` train and eval under ParameterServerStrategy.
+
+    The mock trainer counts steps, so `train(10)` and `evaluate(11)` are
+    expected to return 10 and 11.
+    """
+    if TPU_TEST or GPU_TEST:
+      # The guard covers both TPU and GPU runs; the message names both.
+      self.skipTest('Async training is not available on TPU/GPU.')
+    num_workers = 3
+    num_ps = 2
+    cluster_resolver = create_in_process_cluster(num_workers, num_ps)
+    distribution = tf.distribute.experimental.ParameterServerStrategy(
+        cluster_resolver)
+    with distribution.scope():
+      trainer = MockAsyncTrainer()
+      trainer.init_async()
+      self.assertIsInstance(
+          trainer._coordinator,
+          tf.distribute.experimental.coordinator.ClusterCoordinator)
+      self.assertEqual(trainer.train(tf.constant(10)), 10)
+      self.assertEqual(trainer.evaluate(tf.constant(11)), 11)
+
+  def test_async_trainer_train(self):
+    """Trains the real `Trainer` for 5 steps under ParameterServerStrategy."""
+    if TPU_TEST or GPU_TEST:
+      # The guard covers both TPU and GPU runs; the message names both.
+      self.skipTest('Async training is not available on TPU/GPU.')
+    num_workers = 3
+    num_ps = 2
+    cluster_resolver = create_in_process_cluster(num_workers, num_ps)
+    distribution = tf.distribute.experimental.ParameterServerStrategy(
+        cluster_resolver)
+    with distribution.scope():
+      # Copies the shared config so the override below does not leak into
+      # other tests.
+      config = cfg.ExperimentConfig(**self._config.as_dict())
+      config.trainer.eval_tf_while_loop = True
+      trainer = self.create_test_trainer(config)
+      logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
+      self.assertIn('training_loss', logs)
+      self.assertIn('learning_rate', logs)
+
+  def test_async_trainer_validate(self):
+    """Evaluates the real `Trainer` for 5 steps under ParameterServerStrategy."""
+    if TPU_TEST or GPU_TEST:
+      # The guard covers both TPU and GPU runs; the message names both.
+      self.skipTest('Async training is not available on TPU/GPU.')
+    num_workers = 3
+    num_ps = 2
+    cluster_resolver = create_in_process_cluster(num_workers, num_ps)
+    distribution = tf.distribute.experimental.ParameterServerStrategy(
+        cluster_resolver)
+    with distribution.scope():
+      # Copies the shared config so the override below does not leak into
+      # other tests.
+      config = cfg.ExperimentConfig(**self._config.as_dict())
+      config.trainer.eval_tf_while_loop = True
+      trainer = self.create_test_trainer(config)
+      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
+      self.assertIn('acc', logs)
+      self.assertIn('validation_loss', logs)
+
+  @combinations.generate(all_strategy_combinations())
+  def test_trainer_validate(self, distribution):
+    """Evaluates 5 steps and checks the counter and validation loss logs."""
+    with distribution.scope():
+      trainer = self.create_test_trainer(self._config)
+      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
+      # The counter is expected to advance once per step on every replica.
+      self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync)
+      self.assertIn('validation_loss', logs)
+
+  @combinations.generate(all_strategy_combinations())
+  def test_trainer_validate_without_loss(self, distribution):
+    """Checks evaluation works when the task reports no validation loss."""
+
+    class MockTaskWithoutValidationLoss(mock_task.MockTask):
+      """MockTask whose validation logs have the loss entry removed."""
+
+      def validation_step(self, inputs, model, metrics=None):
+        # Disable validation loss.
+        logs = super().validation_step(inputs, model)
+        del logs[self.loss]
+        return logs
+
+    with distribution.scope():
+      task = MockTaskWithoutValidationLoss()
+      trainer = self.create_test_trainer(self._config, task=task)
+      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
+      self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync)
+      self.assertNotIn('validation_loss', logs)
+
+  @combinations.generate(
+      combinations.combine(
+          mixed_precision_dtype=['float32', 'bfloat16', 'float16'],
+          loss_scale=[None, 'dynamic', 128, 256],
+      ))
+  def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
+    """Checks the optimizer type for each precision / loss-scale combination.
+
+    A `LossScaleOptimizer` is expected only for 'float16' with a loss scale
+    set; every other combination is expected to yield plain SGD.
+    """
+    config = cfg.ExperimentConfig(
+        runtime=cfg.RuntimeConfig(
+            mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale),
+        trainer=cfg.TrainerConfig(
+            optimizer_config=cfg.OptimizationConfig({
+                'optimizer': {
+                    'type': 'sgd'
+                },
+                'learning_rate': {
+                    'type': 'constant'
+                },
+            })))
+    trainer = self.create_test_trainer(config)
+    if mixed_precision_dtype != 'float16':
+      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
+    elif mixed_precision_dtype == 'float16' and loss_scale is None:
+      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
+    else:
+      self.assertIsInstance(trainer.optimizer,
+                            tf.keras.mixed_precision.LossScaleOptimizer)
+
+    metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
+    self.assertIn('training_loss', metrics)
+
+  def test_export_best_ckpt(self):
+    """Checks that train + evaluate writes `best_ckpt/info.json`."""
+    config = cfg.ExperimentConfig(
+        trainer=cfg.TrainerConfig(
+            best_checkpoint_export_subdir='best_ckpt',
+            best_checkpoint_eval_metric='acc',
+            optimizer_config=cfg.OptimizationConfig({
+                'optimizer': {
+                    'type': 'sgd'
+                },
+                'learning_rate': {
+                    'type': 'constant'
+                }
+            })))
+    model_dir = self.get_temp_dir()
+    trainer = self.create_test_trainer(config, model_dir=model_dir)
+    trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
+    trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
+    self.assertTrue(
+        tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json')))
+
+  def test_recovery(self):
+    """Checks checkpoint recovery and the failure once trials are exhausted.
+
+    Part one expects the weights to be unchanged after a train step whose loss
+    exceeds `loss_upper_bound`. Part two expects a RuntimeError when the loss
+    is NaN and `recovery_max_trials` is 0.
+    """
+    config = cfg.ExperimentConfig(
+        trainer=cfg.TrainerConfig(
+            loss_upper_bound=0.5,
+            recovery_max_trials=2,
+            optimizer_config=cfg.OptimizationConfig({
+                'optimizer': {
+                    'type': 'sgd'
+                },
+                'learning_rate': {
+                    'type': 'constant'
+                }
+            })))
+    model_dir = self.get_temp_dir()
+    trainer = self.create_test_trainer(config, model_dir=model_dir)
+    checkpoint_manager = tf.train.CheckpointManager(
+        trainer.checkpoint, self.get_temp_dir(), max_to_keep=2)
+    # Saves the initial weights so there is a checkpoint to recover from.
+    checkpoint_manager.save()
+    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
+    before_weights = trainer.model.get_weights()
+    _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
+    # The training loss is 1.0 and upper_bound is 0.5, so the recover happens.
+    after_weights = trainer.model.get_weights()
+    for left, right in zip(before_weights, after_weights):
+      self.assertAllEqual(left, right)
+
+    # Makes the loss NaN and sets max_trials = 0 to expect a RuntimeError.
+    config = cfg.ExperimentConfig(
+        trainer=cfg.TrainerConfig(
+            recovery_max_trials=0,
+            optimizer_config=cfg.OptimizationConfig({
+                'optimizer': {
+                    'type': 'sgd'
+                },
+                'learning_rate': {
+                    'type': 'constant'
+                }
+            })))
+    task = mock_task.MockTask(config.task, logging_dir=model_dir)
+
+    # Replacement loss function that injects NaN into the loss.
+    def build_losses(labels, model_outputs, aux_losses=None):
+      del labels, model_outputs
+      return tf.constant([np.nan], tf.float32) + aux_losses
+
+    task.build_losses = build_losses
+    trainer = trainer_lib.Trainer(
+        config,
+        task,
+        model=task.build_model(),
+        optimizer=task.create_optimizer(config.trainer.optimizer_config,
+                                        config.runtime))
+    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
+    with self.assertRaises(RuntimeError):
+      _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))
+
+  def test_model_with_compiled_loss(self):
+    """Checks training works when the model was compiled with a Keras loss."""
+    task = mock_task.MockTask()
+    model = task.build_model()
+    model.compile(loss=tf.keras.losses.CategoricalCrossentropy())
+    trainer = trainer_lib.Trainer(
+        self._config,
+        task,
+        model=model,
+        optimizer=task.create_optimizer(self._config.trainer.optimizer_config))
+    logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
+    self.assertIn('training_loss', logs)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/core/config_definitions.py
+++ b/official/core/config_definitions.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Common configuration settings."""
+
+from typing import Optional, Sequence, Union
+
+import dataclasses
+
+from official.modeling.hyperparams import base_config
+from official.modeling.optimization.configs import optimization_config
+
+OptimizationConfig = optimization_config.OptimizationConfig
+
+
+@dataclasses.dataclass
+class DataConfig(base_config.Config):
+  """The base configuration for building datasets.
+
+  Attributes:
+    input_path: The path to the input. It can be either (1) a str indicating
+      a file path/pattern, or (2) a str indicating multiple file paths/patterns
+      separated by comma (e.g. "a, b, c" or no spaces "a,b,c"), or
+      (3) a list of str, each of which is a file path/pattern or multiple file
+      paths/patterns separated by comma.
+      It should not be specified when the following `tfds_name` is specified.
+    tfds_name: The name of the tensorflow dataset (TFDS). It should not be
+      specified when the above `input_path` is specified.
+    tfds_split: A str indicating which split of the data to load from TFDS. It
+      is required when above `tfds_name` is specified.
+    global_batch_size: The global batch size across all replicas.
+    is_training: Whether this data is used for training or not.
+    drop_remainder: Whether the last batch should be dropped in the case it has
+      fewer than `global_batch_size` elements.
+    shuffle_buffer_size: The buffer size used for shuffling training data.
+    cache: Whether to cache dataset examples. If `True`, we will cache the
+      dataset after applying the decode_fn and parse_fn. It can be used to avoid
+      re-reading from disk, re-decoding and re-parsing the example on the
+      second epoch, but it requires significant memory overhead.
+    cycle_length: The number of files that will be processed concurrently when
+      interleaving files.
+    block_length: The number of consecutive elements to produce from each input
+      element before cycling to another input element when interleaving files.
+    deterministic: A boolean controlling whether determinism should be enforced.
+    sharding: Whether sharding is used in the input pipeline.
+    enable_tf_data_service: A boolean indicating whether to enable tf.data
+      service for the input pipeline.
+    tf_data_service_address: The URI of a tf.data service to offload
+      preprocessing onto during training. The URI should be in the format
+      "protocol://address", e.g. "grpc://tf-data-service:5050". It can be
+      overridden by `FLAGS.tf_data_service` flag in the binary.
+    tf_data_service_job_name: The name of the tf.data service job. This
+      argument makes it possible for multiple datasets to share the same job.
+      The default behavior is that the dataset creates anonymous, exclusively
+      owned jobs.
+    tfds_data_dir: A str specifying the directory to read/write TFDS data.
+    tfds_as_supervised: A bool. When loading dataset from TFDS, if True, the
+      returned tf.data.Dataset will have a 2-tuple structure (input, label)
+      according to builder.info.supervised_keys; if False, the default, the
+      returned tf.data.Dataset will have a dictionary with all the features.
+    tfds_skip_decoding_feature: A str to indicate which features are skipped for
+      decoding when loading dataset from TFDS. Use comma to separate multiple
+      features. The main use case is to skip the image/video decoding for better
+      performance.
+    seed: An optional seed to use for deterministic shuffling/preprocessing.
+  """
+  input_path: Union[Sequence[str], str] = ""
+  tfds_name: str = ""
+  tfds_split: str = ""
+  global_batch_size: int = 0
+  # NOTE(review): annotated `bool` but defaults to None; presumably callers
+  # are expected to set it explicitly - confirm.
+  is_training: bool = None
+  drop_remainder: bool = True
+  shuffle_buffer_size: int = 100
+  cache: bool = False
+  cycle_length: Optional[int] = None
+  block_length: int = 1
+  deterministic: Optional[bool] = None
+  sharding: bool = True
+  enable_tf_data_service: bool = False
+  tf_data_service_address: Optional[str] = None
+  tf_data_service_job_name: Optional[str] = None
+  tfds_data_dir: str = ""
+  tfds_as_supervised: bool = False
+  tfds_skip_decoding_feature: str = ""
+  seed: Optional[int] = None
+
+
+@dataclasses.dataclass
+class RuntimeConfig(base_config.Config):
+  """High-level configurations for Runtime.
+
+  These include parameters that are not directly related to the experiment,
+  e.g. directories, accelerator type, etc.
+
+  Attributes:
+    distribution_strategy: e.g. 'mirrored', 'tpu', etc.
+    enable_xla: Whether or not to enable XLA.
+    per_gpu_thread_count: thread count per GPU.
+    gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
+    dataset_num_private_threads: Number of threads for a private threadpool
+      created for all datasets computation.
+    tpu: The address of the TPU to use, if any.
+    num_gpus: The number of GPUs to use, if any.
+    worker_hosts: comma-separated list of worker ip:port pairs for running
+      multi-worker models with DistributionStrategy.
+    task_index: If multi-worker training, the task index of this worker.
+    all_reduce_alg: Defines the algorithm for performing all-reduce.
+    num_packs: Sets `num_packs` in the cross device ops used in
+      MirroredStrategy.  For details, see tf.distribute.NcclAllReduce.
+    mixed_precision_dtype: dtype of mixed precision policy. It can be 'float32',
+      'float16', or 'bfloat16'.
+    loss_scale: The type of loss scale, or 'float' value. This is used when
+      setting the mixed precision policy.
+    run_eagerly: Whether or not to run the experiment eagerly.
+    batchnorm_spatial_persistent: Whether or not to enable the spatial
+      persistent mode for CuDNN batch norm kernel for improved GPU performance.
+    tpu_enable_xla_dynamic_padder: Whether to enable the XLA dynamic padder
+      infrastructure that handles dynamic-shape inputs inside XLA. If None
+      (the default), the XLA default is respected. See the comment next to
+      the field for the trade-offs of disabling it.
+    num_cores_per_replica: Number of cores per replica; part of the global
+      model parallelism configuration.
+    default_shard_dim: Default shard dimension; part of the global model
+      parallelism configuration.
+  """
+  distribution_strategy: str = "mirrored"
+  enable_xla: bool = False
+  gpu_thread_mode: Optional[str] = None
+  dataset_num_private_threads: Optional[int] = None
+  per_gpu_thread_count: int = 0
+  tpu: Optional[str] = None
+  num_gpus: int = 0
+  worker_hosts: Optional[str] = None
+  task_index: int = -1
+  all_reduce_alg: Optional[str] = None
+  num_packs: int = 1
+  mixed_precision_dtype: Optional[str] = None
+  loss_scale: Optional[Union[str, float]] = None
+  run_eagerly: bool = False
+  batchnorm_spatial_persistent: bool = False
+
+  # XLA runtime params.
+  # XLA params are only applied to the train_step.
+  # These augments can improve training speed. They can also improve eval, but
+  # may reduce usability and users would need to make changes to code.
+
+  # Whether to enable XLA dynamic padder
+  # infrastructure to handle dynamic shapes inputs inside XLA. True by
+  # default. Disabling this may cause correctness issues with dynamic shapes
+  # inputs, as XLA will just assume the inputs are with padded shapes. However
+  # users can optionally set it to False to improve device time if masking is
+  # already handled in the user side.
+  # If None, will respect XLA default.
+  tpu_enable_xla_dynamic_padder: Optional[bool] = None
+
+  # Global model parallelism configurations.
+  num_cores_per_replica: int = 1
+  default_shard_dim: int = -1
+
+  def model_parallelism(self):
+    """Returns the model parallelism settings as a dict.
+
+    Returns:
+      A dict with the keys `num_cores_per_replica` and `default_shard_dim`,
+      holding the current values of the fields of the same names.
+    """
+    return dict(
+        num_cores_per_replica=self.num_cores_per_replica,
+        default_shard_dim=self.default_shard_dim)
+
+
+@dataclasses.dataclass
+class TrainerConfig(base_config.Config):
+  """Configuration for trainer.
+
+  Attributes:
+    optimizer_config: optimizer config, it includes optimizer, learning rate,
+      and warmup schedule configs.
+    train_tf_while_loop: whether or not to use tf while loop.
+    train_tf_function: whether or not to use tf_function for training loop.
+    eval_tf_function: whether or not to use tf_function for eval.
+    eval_tf_while_loop: whether or not to use tf while loop for eval.
+    allow_tpu_summary: Whether to allow summary happen inside the XLA program
+      runs on TPU through automatic outside compilation.
+    steps_per_loop: number of steps per loop.
+    summary_interval: number of steps between each summary.
+    checkpoint_interval: number of steps between checkpoints.
+    max_to_keep: max checkpoints to keep.
+    continuous_eval_timeout: maximum number of seconds to wait between
+      checkpoints, if set to None, continuous eval will wait indefinitely. This
+      is only used in continuous_train_and_eval and continuous_eval modes.
+      Default value is 1 hour.
+    train_steps: number of train steps.
+    validation_steps: number of eval steps. If -1 (the default), the entire
+      eval dataset is used.
+    validation_interval: number of training steps to run between evaluations.
+    best_checkpoint_export_subdir: if set, the trainer will keep track of the
+      best evaluation metric, and export the corresponding best checkpoint under
+      `model_dir/best_checkpoint_export_subdir`. Note that this only works if
+      mode contains eval (such as `train_and_eval`, `continuous_eval`, and
+      `continuous_train_and_eval`).
+    best_checkpoint_eval_metric: for exporting the best checkpoint, which
+      evaluation metric the trainer should monitor. This can be any evaluation
+      metric appears on tensorboard.
+    best_checkpoint_metric_comp: for exporting the best checkpoint, how the
+      trainer should compare the evaluation metrics. This can be either `higher`
+      (higher the better) or `lower` (lower the better).
+    loss_upper_bound: upper bound on the loss used by blowup recovery.
+    recovery_begin_steps: the loss bound is enforced after this many steps.
+    recovery_max_trials: if < 0, no recovery module is used; if 0, the
+      condition is checked and the job fails when it happens; if > 0, the
+      model states are restored.
+    validation_summary_subdir: A 'str', sub directory for saving eval summary.
+  """
+  # A default_factory is used instead of a shared `OptimizationConfig()`
+  # instance: dataclass instances are unhashable by default, and Python 3.11+
+  # rejects unhashable class-level defaults with a ValueError.
+  optimizer_config: OptimizationConfig = dataclasses.field(
+      default_factory=OptimizationConfig)
+  # Orbit settings.
+  train_tf_while_loop: bool = True
+  train_tf_function: bool = True
+  eval_tf_function: bool = True
+  eval_tf_while_loop: bool = False
+  allow_tpu_summary: bool = False
+  # Trainer intervals.
+  steps_per_loop: int = 1000
+  summary_interval: int = 1000
+  checkpoint_interval: int = 1000
+  # Checkpoint manager.
+  max_to_keep: int = 5
+  continuous_eval_timeout: int = 60 * 60
+  # Train/Eval routines.
+  train_steps: int = 0
+  # Sets validation steps to be -1 to evaluate the entire dataset.
+  validation_steps: int = -1
+  validation_interval: int = 1000
+  # Best checkpoint export.
+  best_checkpoint_export_subdir: str = ""
+  best_checkpoint_eval_metric: str = ""
+  best_checkpoint_metric_comp: str = "higher"
+  # Blowup recovery.
+  loss_upper_bound: float = 1e6
+  recovery_begin_steps: int = 0  # Enforcing the loss bound after these steps.
+  # When max trials < 0, no recovery module; max trials = 0, we will check
+  # the condition and fail the job if the condition happens; max trials > 0,
+  # we will restore the model states.
+  recovery_max_trials: int = 0
+  validation_summary_subdir: str = "validation"
+
+
+@dataclasses.dataclass
+class TaskConfig(base_config.Config):
+  """Configuration for a task.
+
+  Attributes:
+    init_checkpoint: a string identifying the checkpoint to initialize from
+      (presumably a path; confirm against the task implementation). Empty by
+      default.
+    model: the model config, a `base_config.Config`. Defaults to None.
+    train_data: the `DataConfig` for the training input.
+    validation_data: the `DataConfig` for the validation input.
+  """
+  init_checkpoint: str = ""
+  model: base_config.Config = None
+  # default_factory is used instead of shared `DataConfig()` instances:
+  # dataclass instances are unhashable by default, and Python 3.11+ rejects
+  # unhashable class-level defaults with a ValueError.
+  train_data: DataConfig = dataclasses.field(default_factory=DataConfig)
+  validation_data: DataConfig = dataclasses.field(default_factory=DataConfig)
+
+
+@dataclasses.dataclass
+class ExperimentConfig(base_config.Config):
+  """Top-level configuration.
+
+  Attributes:
+    task: the `TaskConfig` of the experiment.
+    trainer: the `TrainerConfig` of the experiment.
+    runtime: the `RuntimeConfig` of the experiment.
+  """
+  # default_factory is used instead of shared config instances: dataclass
+  # instances are unhashable by default, and Python 3.11+ rejects unhashable
+  # class-level defaults with a ValueError.
+  task: TaskConfig = dataclasses.field(default_factory=TaskConfig)
+  trainer: TrainerConfig = dataclasses.field(default_factory=TrainerConfig)
+  runtime: RuntimeConfig = dataclasses.field(default_factory=RuntimeConfig)
--- a/official/core/exp_factory.py
+++ b/official/core/exp_factory.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Experiment factory methods."""
+
+from official.core import config_definitions as cfg
+from official.core import registry
+
+
+# Module-level registry dict handed to `registry.register` and
+# `registry.lookup` by the functions in this module.
+_REGISTERED_CONFIGS = {}
+
+
+def register_config_factory(name):
+  """Registers an ExperimentConfig factory method under `name`.
+
+  Args:
+    name: the key passed to `registry.register` for the module-level
+      experiment config registry.
+
+  Returns:
+    Whatever `registry.register` returns for this registry and name
+    (presumably a decorator to apply to the factory; confirm in `registry`).
+  """
+  registration = registry.register(_REGISTERED_CONFIGS, name)
+  return registration
+
+
+def get_exp_config_creater(exp_name: str):
+  """Finds the ExperimentConfig factory method registered as `exp_name`.
+
+  Args:
+    exp_name: the name to look up in the module-level experiment config
+      registry.
+
+  Returns:
+    The result of `registry.lookup` for `exp_name`.
+  """
+  return registry.lookup(_REGISTERED_CONFIGS, exp_name)
+
+
+def get_exp_config(exp_name: str) -> cfg.ExperimentConfig:
+  """Builds the ExperimentConfig registered under `exp_name`.
+
+  Args:
+    exp_name: the name of a registered experiment config factory.
+
+  Returns:
+    The object produced by calling the looked-up factory with no arguments.
+  """
+  config_factory = get_exp_config_creater(exp_name)
+  return config_factory()