Update code to v2.8.0

9485aa1d · qianyj · 89cfa348 · f5fc733a · 9485aa1d · 9485aa1d
Commit 9485aa1d authored Nov 28, 2023 by qianyj
20 changed files
--- a/official/benchmark/tflite_utils.py
+++ b/official/benchmark/tflite_utils.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TFLite utils."""
+import orbit
+from official.core import base_task
+from official.core import base_trainer
+from official.core import config_definitions
+
+
+def train_and_evaluate(
+    params: config_definitions.ExperimentConfig,
+    task: base_task.Task,
+    trainer: base_trainer.Trainer,
+    controller: orbit.Controller):
+  """Placeholder for training and evaluating on TFLite.
+
+  Args:
+    params: experiment configuration (unused by this placeholder).
+    task: task instance (unused by this placeholder).
+    trainer: trainer instance (unused by this placeholder).
+    controller: orbit controller (unused by this placeholder).
+
+  Raises:
+    NotImplementedError: always, since no TFLite implementation exists here.
+  """
+  message = 'train_and_evaluate on tflite_utils is not implemented yet.'
+  raise NotImplementedError(message)
--- a/official/benchmark/transformer_benchmark.py
+++ b/official/benchmark/transformer_benchmark.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Executes Transformer w/Keras benchmark and accuracy tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+
+from absl import flags
+import tensorflow as tf
+from official.benchmark import benchmark_wrappers
+from official.benchmark import owner_utils
+from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
+from official.legacy.transformer import misc
+from official.legacy.transformer import transformer_main
+from official.utils.flags import core as flags_core
+
+# Dataset root used when a benchmark runs on TPU.
+TPU_DATA_DIR = 'gs://mlcompass-data/transformer'
+# Dataset root for non-TPU runs, taken from the TMPDIR environment variable
+# (None when TMPDIR is unset).
+GPU_DATA_DIR = os.getenv('TMPDIR')
+# Directory names, relative to the dataset root, of the training data and of
+# the BLEU evaluation files.
+TRANSFORMER_EN2DE_DATA_DIR_NAME = 'wmt32k-en2de-official'
+EN2DE_2014_BLEU_DATA_DIR_NAME = 'newstest2014'
+FLAGS = flags.FLAGS
+# Default output/data directory of the "Real" benchmark classes; reads the same
+# TMPDIR environment variable as GPU_DATA_DIR.
+TMP_DIR = os.getenv('TMPDIR')
+
+
+class TransformerBenchmark(PerfZeroBenchmark):
+  """Methods common to executing transformer w/keras tests.
+
+  Code under test for the Transformer Keras models reports the same data and
+  requires the same FLAG setup.
+  """
+
+  def __init__(self, output_dir=None, default_flags=None, root_data_dir=None,
+               flag_methods=None, tpu=None):
+    """Resolves the dataset paths and initializes the PerfZero base class.
+
+    Args:
+      output_dir: forwarded unchanged to the PerfZeroBenchmark constructor.
+      default_flags: optional dict of flag defaults. The dict is copied, so the
+        caller's object is never modified. `data_dir` and `vocab_file` are
+        always overridden with the paths resolved by `_set_data_files`.
+      root_data_dir: fallback dataset root; see `_set_data_files`.
+      flag_methods: forwarded unchanged to the PerfZeroBenchmark constructor.
+      tpu: forwarded unchanged to the PerfZeroBenchmark constructor.
+    """
+    self._set_data_files(root_data_dir=root_data_dir)
+
+    # Work on a copy so a dict shared by the caller is not mutated below.
+    default_flags = dict(default_flags) if default_flags else {}
+    default_flags['data_dir'] = self.train_data_dir
+    default_flags['vocab_file'] = self.vocab_file
+
+    super(TransformerBenchmark, self).__init__(
+        output_dir=output_dir,
+        default_flags=default_flags,
+        flag_methods=flag_methods,
+        tpu=tpu)
+
+  def _set_data_files(self, root_data_dir=None, tpu_run=False):
+    """Sets train_data_dir, vocab_file, bleu_source and bleu_ref.
+
+    Args:
+      root_data_dir: dataset root used only when neither of the module-level
+        roots applies (see precedence below).
+      tpu_run: when True, TPU_DATA_DIR is used as the dataset root.
+    """
+    # Precedence: TPU_DATA_DIR for TPU runs, else GPU_DATA_DIR (the TMPDIR
+    # environment variable) when it is set, else the caller-provided
+    # root_data_dir, else the empty string (paths become relative).
+    if tpu_run:
+      root_data_dir = TPU_DATA_DIR
+    elif GPU_DATA_DIR is not None:
+      root_data_dir = GPU_DATA_DIR
+
+    root_data_dir = root_data_dir if root_data_dir else ''
+
+    self.train_data_dir = os.path.join(root_data_dir,
+                                       TRANSFORMER_EN2DE_DATA_DIR_NAME)
+    self.vocab_file = os.path.join(root_data_dir,
+                                   TRANSFORMER_EN2DE_DATA_DIR_NAME,
+                                   'vocab.ende.32768')
+    self.bleu_source = os.path.join(root_data_dir,
+                                    EN2DE_2014_BLEU_DATA_DIR_NAME,
+                                    'newstest2014.en')
+    self.bleu_ref = os.path.join(root_data_dir,
+                                 EN2DE_2014_BLEU_DATA_DIR_NAME,
+                                 'newstest2014.de')
+
+  def _set_data_file_flags(self):
+    """Sets the FLAGS for the data files."""
+    FLAGS.data_dir = self.train_data_dir
+    FLAGS.vocab_file = self.vocab_file
+    # Sets values directly to avoid validation check.
+    FLAGS['bleu_source'].value = self.bleu_source
+    FLAGS['bleu_ref'].value = self.bleu_ref
+
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self,
+                                bleu_max=None,
+                                bleu_min=None,
+                                log_steps=None,
+                                total_batch_size=None,
+                                warmup=1):
+    """Trains the model and reports the results via `report_benchmark`.
+
+    A metric that cannot be computed from the returned stats (empty BLEU
+    history, empty timestamp log, unknown batch size or log interval, zero
+    elapsed time) is skipped instead of raising, so that a completed training
+    run is still reported.
+
+    Args:
+      bleu_max: highest passing level for bleu score.
+      bleu_min: lowest passing level for bleu score.
+      log_steps: How often the log was created for stats['step_timestamp_log'].
+      total_batch_size: Global batch-size.
+      warmup: number of entries in stats['step_timestamp_log'] to ignore.
+    """
+    start_time_sec = time.time()
+    task = transformer_main.TransformerTask(FLAGS)
+    stats = task.train()
+    wall_time_sec = time.time() - start_time_sec
+
+    metrics = []
+    if 'bleu_uncased' in stats:
+      bleu_history = stats.get('bleu_uncased_history')
+      if bleu_history:
+        # Entries are indexed as [0] = iteration and [1] = score; pick the
+        # entry with the best score.
+        bleu_uncased_best = max(bleu_history, key=lambda x: x[1])
+        metrics.append({'name': 'bleu_uncased',
+                        'value': bleu_uncased_best[1],
+                        'min_value': bleu_min,
+                        'max_value': bleu_max})
+        metrics.append({'name': 'bleu_best_score_iteration',
+                        'value': bleu_uncased_best[0]})
+        metrics.append({'name': 'bleu_uncased_last',
+                        'value': stats['bleu_uncased']})
+      else:
+        # No (or empty) history: fall back to the final score only.
+        metrics.append({'name': 'bleu_uncased',
+                        'value': stats['bleu_uncased'],
+                        'min_value': bleu_min,
+                        'max_value': bleu_max})
+
+    time_log = stats.get('step_timestamp_log')
+
+    if (warmup and time_log and len(time_log) > warmup + 1 and
+        total_batch_size is not None and log_steps is not None):
+      # first entry in the time_log is start of step 1. The rest of the
+      # entries are the end of each step recorded
+      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
+      # Guard against a zero (or negative) interval to avoid dividing by it.
+      if elapsed > 0:
+        num_examples = (
+            total_batch_size * log_steps * (len(time_log) - warmup - 1))
+        metrics.append({'name': 'exp_per_second',
+                        'value': num_examples / elapsed})
+
+    if 'avg_exp_per_second' in stats:
+      metrics.append({'name': 'avg_exp_per_second',
+                      'value': stats['avg_exp_per_second']})
+
+    if time_log:
+      metrics.append({'name': 'startup_time',
+                      'value': time_log[0].timestamp - start_time_sec})
+
+    flags_str = flags_core.get_nondefault_flags_as_str()
+    self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics,
+                          extras={'flags': flags_str})
+
+
+class TransformerBaseKerasAccuracy(TransformerBenchmark):
+  """Benchmark accuracy tests for Transformer Base model w/ Keras."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    """Benchmark accuracy tests for Transformer Base model w/ Keras.
+
+    Args:
+      output_dir: directory where to output e.g. log files
+      root_data_dir: directory under which to look for dataset
+      **kwargs: arbitrary named arguments, accepted and ignored. This is needed
+                to make the constructor forward compatible in case PerfZero
+                provides more named arguments before updating the constructor.
+    """
+    flag_methods = [misc.define_transformer_flags]
+
+    super(TransformerBaseKerasAccuracy, self).__init__(
+        output_dir=output_dir, root_data_dir=root_data_dir,
+        flag_methods=flag_methods)
+
+  def benchmark_1_gpu(self):
+    """Benchmark 1 gpu.
+
+    The paper uses 8 GPUs and a much larger effective batch size, so this will
+    not converge to the 27.3 BLEU (uncased) SOTA.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 1
+    FLAGS.param_set = 'base'
+    FLAGS.batch_size = 2048
+    FLAGS.train_steps = 1000
+    FLAGS.steps_between_evals = 500
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
+    # These bleu bounds are based on test runs at this limited number of steps
+    # and batch size, after verifying SOTA at 8xV100s.
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=25.3,
+                                   bleu_max=26)
+
+  def benchmark_1_gpu_static_batch(self):
+    """Benchmark 1 gpu with static_batch.
+
+    The paper uses 8 GPUs and a much larger effective batch size, so this will
+    not converge to the 27.3 BLEU (uncased) SOTA.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 1
+    FLAGS.param_set = 'base'
+    FLAGS.batch_size = 4096
+    FLAGS.train_steps = 100000
+    FLAGS.steps_between_evals = 5000
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
+    # These bleu bounds are based on test runs at this limited number of steps
+    # and batch size, after verifying SOTA at 8xV100s.
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=25.3,
+                                   bleu_max=26)
+
+  def benchmark_8_gpu(self):
+    """Benchmark 8 gpu.
+
+    Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 8
+    FLAGS.param_set = 'base'
+    FLAGS.batch_size = 4096*8
+    FLAGS.train_steps = 100000
+    FLAGS.steps_between_evals = 20000
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=27,
+                                   bleu_max=28)
+
+  def benchmark_8_gpu_static_batch(self):
+    """Benchmark 8 gpu with static batch.
+
+    Should converge to 27.3 BLEU (uncased). This has not been confirmed yet.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 8
+    FLAGS.param_set = 'base'
+    FLAGS.batch_size = 4096*8
+    FLAGS.train_steps = 100000
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.steps_between_evals = 5000
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=27,
+                                   bleu_max=28)
+
+
+class TransformerBigKerasAccuracy(TransformerBenchmark):
+  """Benchmark accuracy tests for Transformer Big model w/ Keras."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    """Benchmark accuracy tests for Transformer Big model w/ Keras.
+
+    Args:
+      output_dir: directory where to output e.g. log files
+      root_data_dir: directory under which to look for dataset
+      **kwargs: arbitrary named arguments, accepted and ignored. This is needed
+                to make the constructor forward compatible in case PerfZero
+                provides more named arguments before updating the constructor.
+    """
+    flag_methods = [misc.define_transformer_flags]
+
+    super(TransformerBigKerasAccuracy, self).__init__(
+        output_dir=output_dir, root_data_dir=root_data_dir,
+        flag_methods=flag_methods)
+
+  def benchmark_8_gpu(self):
+    """Benchmark 8 gpu.
+
+    Over 6 runs with eval every 20K steps the average highest value was 28.195
+    (bleu uncased). 28.424 was the highest and 27.96 the lowest. The values are
+    the highest value seen during a run and occurred at a median of iteration 9.
+    Iterations are not epochs, an iteration is a number of steps between evals.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 8
+    FLAGS.param_set = 'big'
+    FLAGS.batch_size = 3072*8
+    FLAGS.train_steps = 20000 * 12
+    FLAGS.steps_between_evals = 20000
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=27.9,
+                                   bleu_max=29.2)
+
+  def benchmark_8_gpu_static_batch(self):
+    """Benchmark 8 gpu with static batch.
+
+    Should converge to 28.4 BLEU (uncased). This has not been verified yet.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 8
+    FLAGS.param_set = 'big'
+    FLAGS.batch_size = 3072*8
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.train_steps = 20000 * 12
+    FLAGS.steps_between_evals = 20000
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=28,
+                                   bleu_max=29.2)
+
+  def benchmark_8_gpu_fp16(self):
+    """Benchmark 8 gpu with dynamic batch and fp16.
+
+    Over 6 runs with eval every 20K steps the average highest value was 28.247
+    (bleu uncased). 28.424 was the highest and 28.09 the lowest. The values are
+    the highest value seen during a run and occurred at a median of iteration
+    11. While this could be interpreted as worse than FP32, if looking at the
+    first iteration at which 28 is passed FP16 performs equal and possibly
+    better. Although not part of the initial test runs, the highest value
+    recorded with the arguments below was 28.9 at iteration 12. Iterations are
+    not epochs, an iteration is a number of steps between evals.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 8
+    FLAGS.dtype = 'fp16'
+    FLAGS.param_set = 'big'
+    FLAGS.batch_size = 3072*8
+    FLAGS.train_steps = 20000 * 12
+    FLAGS.steps_between_evals = 20000
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=28,
+                                   bleu_max=29.2)
+
+  def benchmark_8_gpu_static_batch_fp16(self):
+    """Benchmark 8 gpu with static batch and fp16.
+
+    Should converge to 28.4 BLEU (uncased). This has not been verified yet.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 8
+    FLAGS.dtype = 'fp16'
+    FLAGS.param_set = 'big'
+    FLAGS.batch_size = 3072*8
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.train_steps = 400000
+    FLAGS.steps_between_evals = 20000
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch_fp16')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=28,
+                                   bleu_max=29.2)
+
+  def benchmark_xla_8_gpu_static_batch_fp16(self):
+    """Benchmark 8 gpu with static batch, XLA, and FP16.
+
+    Should converge to 28.4 BLEU (uncased). This has not been verified yet.
+    """
+    self._setup()
+    self._set_data_file_flags()
+    FLAGS.num_gpus = 8
+    FLAGS.dtype = 'fp16'
+    FLAGS.enable_xla = True
+    FLAGS.param_set = 'big'
+    FLAGS.batch_size = 3072*8
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.train_steps = 400000
+    FLAGS.steps_between_evals = 20000
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_xla_8_gpu_static_batch_fp16')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps,
+                                   bleu_min=28,
+                                   bleu_max=29.2)
+
+
+class TransformerKerasBenchmark(TransformerBenchmark):
+  """Benchmarks for Transformer (Base and Big) using Keras.
+
+  Each benchmark sets the per-run FLAGS (GPU count, global batch size, model
+  directory and optional XLA / FP16 / static-batch switches) and then trains
+  and reports through `_run_and_report_benchmark`.
+  """
+
+  def __init__(self, output_dir=None, default_flags=None,
+               root_data_dir=None, batch_per_gpu=4096, tpu=None):
+    """Initialize.
+
+    Args:
+      output_dir: Base directory for saving artifacts, e.g. checkpoints.
+      default_flags: default flags to use for all tests.
+      root_data_dir: root directory for data, e.g. training.
+      batch_per_gpu: batch size to use per gpu.
+      tpu: Target TPU to use.
+    """
+    flag_methods = [misc.define_transformer_flags]
+    self.batch_per_gpu = batch_per_gpu
+
+    super(TransformerKerasBenchmark, self).__init__(
+        output_dir=output_dir,
+        default_flags=default_flags,
+        root_data_dir=root_data_dir,
+        flag_methods=flag_methods,
+        tpu=tpu)
+
+  def benchmark_1_gpu_no_dist_strat(self):
+    """Benchmark 1 gpu without distribution strategy."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_1_gpu_no_dist_strat_static_batch(self):
+    """Benchmark 1 gpu without distribution strategy with static batch."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_ds_sb')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_1_gpu(self):
+    """Benchmark 1 gpu."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_1_gpu_fp16(self):
+    """Benchmark 1 gpu FP16."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
+    FLAGS.dtype = 'fp16'
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_1_gpu(self):
+    """Benchmark 1 gpu w/xla."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
+    FLAGS.enable_xla = True
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_1_gpu_fp16(self):
+    """Benchmark 1 gpu w/xla and FP16."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
+    FLAGS.enable_xla = True
+    FLAGS.dtype = 'fp16'
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_1_gpu_static_batch(self):
+    """Benchmark 1 gpu with static batch."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_static_batch')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_1_gpu_static_batch(self):
+    """Benchmark 1 gpu with static batch w/xla."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_static_batch')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.enable_xla = True
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_1_gpu_static_batch_fp16(self):
+    """Benchmark 1 gpu with static batch FP16."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_static_batch_fp16')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.dtype = 'fp16'
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_1_gpu_static_batch_fp16(self):
+    """Benchmark 1 gpu with static batch w/xla and FP16."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.batch_size = self.batch_per_gpu
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_xla_1_gpu_static_batch_fp16')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    FLAGS.enable_xla = True
+    FLAGS.dtype = 'fp16'
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_8_gpu(self):
+    """Benchmark 8 gpu. This defaults to using TF32."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_8_gpu_fp16(self):
+    """Benchmark 8 gpu FP16."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_8_gpu(self):
+    """Benchmark 8 gpu w/xla. This defaults to using TF32."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_xla = True
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_8_gpu_fp16(self):
+    """Benchmark 8 gpu w/xla and FP16."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_xla = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_8_gpu_static_batch(self):
+    """Benchmark 8 gpu with static batch."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_static_batch')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_8_gpu_static_batch_fp16(self):
+    """Benchmark 8 gpu with static batch FP16."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_8_gpu_static_batch_fp16')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_8_gpu_static_batch(self):
+    """Benchmark 8 gpu with static batch w/xla."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_xla = True
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_static_batch')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_8_gpu_static_batch_fp16(self):
+    """Benchmark 8 gpu with static batch w/xla and FP16."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_xla = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_xla_8_gpu_static_batch_fp16')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                   log_steps=FLAGS.log_steps)
+
+  def benchmark_xla_8_gpu_static_batch_fp32_no_tf32(self):
+    """Benchmark 8 gpu with static batch w/xla in FP32 with TF32 disabled."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_xla = True
+    FLAGS.batch_size = self.batch_per_gpu * 8
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_xla_8_gpu_static_batch_fp32_no_tf32')
+    FLAGS.static_batch = True
+    FLAGS.max_length = 64
+    # Without this switch the run would be identical to
+    # benchmark_xla_8_gpu_static_batch, which uses TF32 by default. The TF32
+    # setting is global, so the previous value is restored afterwards to keep
+    # it from leaking into benchmarks that run later in the same process.
+    tf32_was_enabled = (
+        tf.config.experimental.tensor_float_32_execution_enabled())
+    tf.config.experimental.enable_tensor_float_32_execution(False)
+    try:
+      self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
+                                     log_steps=FLAGS.log_steps)
+    finally:
+      tf.config.experimental.enable_tensor_float_32_execution(
+          tf32_was_enabled)
+
+
+class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
+  """Real-data benchmark tests for the Transformer Base model."""
+
+  def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR, **kwargs):
+    """Configures the short 'base' benchmark runs.
+
+    Args:
+      output_dir: directory for benchmark artifacts; defaults to TMP_DIR.
+      root_data_dir: dataset root; defaults to TMP_DIR.
+      **kwargs: additional named arguments, accepted and ignored.
+    """
+    base_defaults = {
+        'param_set': 'base',
+        'train_steps': 50,
+        'log_steps': 10,
+    }
+    super().__init__(
+        output_dir=output_dir,
+        default_flags=base_defaults,
+        root_data_dir=root_data_dir,
+        batch_per_gpu=4096)
+
+
+class TransformerBigKerasBenchmarkReal(TransformerKerasBenchmark):
+  """Real-data benchmark tests for the Transformer Big model, incl. TPU runs."""
+
+  def __init__(self, output_dir=TMP_DIR, root_data_dir=TMP_DIR,
+               tpu=None, **kwargs):
+    """Configures the short 'big' benchmark runs.
+
+    Args:
+      output_dir: directory for benchmark artifacts; defaults to TMP_DIR.
+      root_data_dir: dataset root; defaults to TMP_DIR.
+      tpu: forwarded to the parent constructor.
+      **kwargs: additional named arguments, accepted and ignored.
+    """
+    def_flags = {}
+    def_flags['param_set'] = 'big'
+    def_flags['train_steps'] = 50
+    def_flags['log_steps'] = 10
+
+    super(TransformerBigKerasBenchmarkReal, self).__init__(
+        output_dir=output_dir, default_flags=def_flags,
+        root_data_dir=root_data_dir, batch_per_gpu=3072,
+        tpu=tpu)
+
+  def _set_df_common(self):
+    """Sets the data paths and FLAGS shared by the TPU benchmarks below."""
+    # Re-resolve the data paths against TPU_DATA_DIR, then point the flags at
+    # them.
+    self._set_data_files(tpu_run=True)
+    FLAGS.data_dir = self.train_data_dir
+    FLAGS.vocab_file = self.vocab_file
+    FLAGS.distribution_strategy = 'tpu'
+    FLAGS.padded_decode = True
+    FLAGS.train_steps = 300
+    FLAGS.log_steps = 150
+    FLAGS.steps_between_evals = 150
+    FLAGS.static_batch = True
+    FLAGS.use_ctl = True
+    FLAGS.enable_checkpointing = False
+    FLAGS.max_length = 64
+    FLAGS.decode_batch_size = 32
+    FLAGS.decode_max_length = 97
+
+  def benchmark_2x2_tpu(self):
+    """Port of former snaggletooth transformer_big model on 2x2."""
+    self._setup()
+    self._set_df_common()
+    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu')
+    FLAGS.batch_size = 6144
+
+    self._run_and_report_benchmark(
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps)
+
+  @owner_utils.Owner('tf-graph-compiler')
+  def benchmark_2x2_tpu_mlir(self):
+    """Run transformer_big model on 2x2 with the MLIR Bridge enabled."""
+    self._setup()
+    self._set_df_common()
+    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mlir')
+    FLAGS.batch_size = 6144
+    # NOTE(review): the bridge is enabled through tf.config and is not
+    # switched off again in this method.
+    tf.config.experimental.enable_mlir_bridge()
+
+    self._run_and_report_benchmark(
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps)
+
+  def benchmark_4x4_tpu(self):
+    """Port of former GCP transformer_big model on 4x4."""
+    self._setup()
+    self._set_df_common()
+    FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu')
+    FLAGS.batch_size = 24576
+
+    self._run_and_report_benchmark(
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps)
+
+  @owner_utils.Owner('tf-graph-compiler')
+  def benchmark_4x4_tpu_mlir(self):
+    """Run transformer_big model on 4x4 with the MLIR Bridge enabled."""
+    self._setup()
+    self._set_df_common()
+    FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_mlir')
+    FLAGS.batch_size = 24576
+    # NOTE(review): the bridge is enabled through tf.config and is not
+    # switched off again in this method.
+    tf.config.experimental.enable_mlir_bridge()
+
+    self._run_and_report_benchmark(
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps)
+
+
+# Hand control to the TensorFlow test runner when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/benchmark/xlnet_benchmark.py
+++ b/official/benchmark/xlnet_benchmark.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Executes XLNet benchmarks and accuracy tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import time
+
+# pylint: disable=g-bad-import-order
+
+from absl import flags
+from absl.testing import flagsaver
+import tensorflow as tf
+# pylint: enable=g-bad-import-order
+
+from official.benchmark import bert_benchmark_utils as benchmark_utils
+from official.benchmark import owner_utils
+from official.nlp.xlnet import run_classifier
+from official.nlp.xlnet import run_squad
+from official.benchmark import benchmark_wrappers
+
+
+# pylint: disable=line-too-long
+# Remote (gs://) locations of the pretrained XLNet checkpoint, the classifier
+# train/eval records and the SQuAD data directory.
+PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/xlnet/large/xlnet_model-1'
+CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.train.tf_record'
+CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.dev.eval.tf_record'
+SQUAD_DATA_PATH = 'gs://tf-perfzero-data/xlnet/squadv2_cased/'
+# pylint: enable=line-too-long
+
+FLAGS = flags.FLAGS
+
+
+class XLNetBenchmarkBase(benchmark_utils.BertBenchmarkBase):
+  """Base class to hold methods common to test classes in the module."""
+
+  def __init__(self, output_dir=None, tpu=None):
+    super(XLNetBenchmarkBase, self).__init__(output_dir=output_dir, tpu=tpu)
+    self.num_epochs = None
+    self.num_steps_per_epoch = None
+
+  @flagsaver.flagsaver
+  def _run_xlnet_classifier(self):
+    """Starts XLNet classification task."""
+    run_classifier.main(unused_argv=None)
+
+  @flagsaver.flagsaver
+  def _run_xlnet_squad(self):
+    """Starts XLNet classification task."""
+    run_squad.main(unused_argv=None)
+
+
+class XLNetClassifyAccuracy(XLNetBenchmarkBase):
+  """Short accuracy test for XLNet classifier model.
+
+  Tests XLNet classification task model accuracy. The naming
+  convention of below test cases follow
+  `benchmark_(number of gpus)_gpu_(dataset type)` format.
+  """
+
+  def __init__(self, output_dir=None, tpu=None, **kwargs):
+    self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
+    self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
+    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
+
+    super(XLNetClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
+
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self,
+                                training_summary_path,
+                                min_accuracy=0.95,
+                                max_accuracy=0.97):
+    """Starts XLNet accuracy benchmark test."""
+
+    start_time_sec = time.time()
+    self._run_xlnet_classifier()
+    wall_time_sec = time.time() - start_time_sec
+
+    with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
+      summary = json.loads(reader.read().decode('utf-8'))
+
+    super(XLNetClassifyAccuracy, self)._report_benchmark(
+        stats=summary,
+        wall_time_sec=wall_time_sec,
+        min_accuracy=min_accuracy,
+        max_accuracy=max_accuracy)
+
+  def _setup(self):
+    super(XLNetClassifyAccuracy, self)._setup()
+    FLAGS.test_data_size = 25024
+    FLAGS.train_batch_size = 16
+    FLAGS.seq_len = 512
+    FLAGS.mem_len = 0
+    FLAGS.n_layer = 24
+    FLAGS.d_model = 1024
+    FLAGS.d_embed = 1024
+    FLAGS.n_head = 16
+    FLAGS.d_head = 64
+    FLAGS.d_inner = 4096
+    FLAGS.untie_r = True
+    FLAGS.n_class = 2
+    FLAGS.ff_activation = 'gelu'
+    FLAGS.strategy_type = 'mirror'
+    FLAGS.learning_rate = 2e-5
+    FLAGS.train_steps = 4000
+    FLAGS.warmup_steps = 500
+    FLAGS.iterations = 200
+    FLAGS.bi_data = False
+    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
+    FLAGS.train_tfrecord_path = self.train_data_path
+    FLAGS.test_tfrecord_path = self.eval_data_path
+
+  @owner_utils.Owner('tf-model-garden')
+  def benchmark_8_gpu_imdb(self):
+    """Run XLNet model accuracy test with 8 GPUs."""
+    self._setup()
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_imdb')
+    # Sets timer_callback to None as we do not use it now.
+    self.timer_callback = None
+
+    summary_path = os.path.join(FLAGS.model_dir,
+                                'summaries/training_summary.txt')
+    self._run_and_report_benchmark(summary_path)
+
+  @owner_utils.Owner('tf-model-garden')
+  def benchmark_2x2_tpu_imdb(self):
+    """Run XLNet model accuracy test on 2x2 tpu."""
+    self._setup()
+    FLAGS.strategy_type = 'tpu'
+    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_imdb')
+    # Sets timer_callback to None as we do not use it now.
+    self.timer_callback = None
+
+    summary_path = os.path.join(FLAGS.model_dir,
+                                'summaries/training_summary.txt')
+    self._run_and_report_benchmark(summary_path)
+
+
+class XLNetSquadAccuracy(XLNetBenchmarkBase):
+  """Short accuracy test for XLNet squad model.
+
+  Tests XLNet squad task model accuracy. The naming
+  convention of below test cases follow
+  `benchmark_(number of gpus)_gpu_(dataset type)` format.
+  """
+
+  def __init__(self, output_dir=None, tpu=None, **kwargs):
+    self.train_data_path = SQUAD_DATA_PATH
+    self.predict_file = os.path.join(SQUAD_DATA_PATH, 'dev-v2.0.json')
+    self.test_data_path = os.path.join(SQUAD_DATA_PATH, '12048.eval.tf_record')
+    self.spiece_model_file = os.path.join(SQUAD_DATA_PATH, 'spiece.cased.model')
+    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
+
+    super(XLNetSquadAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
+
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self,
+                                training_summary_path,
+                                min_accuracy=87.0,
+                                max_accuracy=89.0):
+    """Starts XLNet accuracy benchmark test."""
+
+    start_time_sec = time.time()
+    self._run_xlnet_squad()
+    wall_time_sec = time.time() - start_time_sec
+
+    with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
+      summary = json.loads(reader.read().decode('utf-8'))
+
+    super(XLNetSquadAccuracy, self)._report_benchmark(
+        stats=summary,
+        wall_time_sec=wall_time_sec,
+        min_accuracy=min_accuracy,
+        max_accuracy=max_accuracy)
+
+  def _setup(self):
+    super(XLNetSquadAccuracy, self)._setup()
+    FLAGS.train_batch_size = 16
+    FLAGS.seq_len = 512
+    FLAGS.mem_len = 0
+    FLAGS.n_layer = 24
+    FLAGS.d_model = 1024
+    FLAGS.d_embed = 1024
+    FLAGS.n_head = 16
+    FLAGS.d_head = 64
+    FLAGS.d_inner = 4096
+    FLAGS.untie_r = True
+    FLAGS.ff_activation = 'gelu'
+    FLAGS.strategy_type = 'mirror'
+    FLAGS.learning_rate = 3e-5
+    FLAGS.train_steps = 8000
+    FLAGS.warmup_steps = 1000
+    FLAGS.iterations = 1000
+    FLAGS.bi_data = False
+    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
+    FLAGS.train_tfrecord_path = self.train_data_path
+    FLAGS.test_tfrecord_path = self.test_data_path
+    FLAGS.spiece_model_file = self.spiece_model_file
+    FLAGS.predict_file = self.predict_file
+    FLAGS.adam_epsilon = 1e-6
+    FLAGS.lr_layer_decay_rate = 0.75
+
+  @owner_utils.Owner('tf-model-garden')
+  def benchmark_8_gpu_squadv2(self):
+    """Run XLNet model squad v2 accuracy test with 8 GPUs."""
+    self._setup()
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squadv2')
+    FLAGS.predict_dir = FLAGS.model_dir
+    # Sets timer_callback to None as we do not use it now.
+    self.timer_callback = None
+
+    summary_path = os.path.join(FLAGS.model_dir,
+                                'summaries/training_summary.txt')
+    self._run_and_report_benchmark(summary_path)
+
+  @owner_utils.Owner('tf-model-garden')
+  def benchmark_2x2_tpu_squadv2(self):
+    """Run XLNet model squad v2 accuracy test on 2x2 tpu."""
+    self._setup()
+    FLAGS.strategy_type = 'tpu'
+    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_squadv2')
+    FLAGS.predict_dir = FLAGS.model_dir
+    # Sets timer_callback to None as we do not use it now.
+    self.timer_callback = None
+
+    summary_path = os.path.join(FLAGS.model_dir,
+                                'summaries/training_summary.txt')
+    self._run_and_report_benchmark(summary_path)
+
+
+# When this file is executed as a script, hand control to tf.test.main().
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/colab/decoding_api_in_tf_nlp.ipynb
+++ b/official/colab/decoding_api_in_tf_nlp.ipynb
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vXLA5InzXydn"
+      },
+      "source": [
+        "##### Copyright 2021 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "RuRlpLL-X0R_"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fsACVQpVSifi"
+      },
+      "source": [
+        "### Install the TensorFlow Model Garden pip package\n",
+        "\n",
+        "*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
+        "which is the nightly Model Garden package created daily automatically.\n",
+        "*  pip will install all models and dependencies automatically."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hYEwGTeCXnnX"
+      },
+      "source": [
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "2j-xhrsVQOQT"
+      },
+      "outputs": [],
+      "source": [
+        "pip install  tf-models-nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "BjP7zwxmskpY"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "\n",
+        "import numpy as np\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "from official import nlp\n",
+        "from official.nlp.modeling.ops import sampling_module\n",
+        "from official.nlp.modeling.ops import beam_search"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0AWgyo-IQ5sP"
+      },
+      "source": [
+        "# Decoding API\n",
+        "This API provides an interface to experiment with different decoding strategies used for auto-regressive models.\n",
+        "\n",
+        "1. The following sampling strategies are provided in sampling_module.py, which inherits from the base Decoding class:\n",
+        "  *   [top_p](https://arxiv.org/abs/1904.09751) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L65) \n",
+        "\n",
+        "      This implementation chooses the most probable logits with cumulative probabilities up to top_p.\n",
+        "\n",
+        "  *   [top_k](https://arxiv.org/pdf/1805.04833.pdf) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L48)\n",
+        "\n",
+        "      At each timestep, this implementation samples from top-k logits based on their probability distribution\n",
+        "\n",
+        "  *   Greedy : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L26)\n",
+        "\n",
+        "      This implementation returns the top logits based on probabilities.\n",
+        "\n",
+        "2. Beam search is provided in beam_search.py. [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search.py)\n",
+        "\n",
+        "      This implementation reduces the risk of missing hidden high probability logits by keeping the most likely num_beams hypotheses at each time step and eventually choosing the hypothesis that has the overall highest probability."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "MfOj7oaBRQnS"
+      },
+      "source": [
+        "## Initialize Sampling Module in TF-NLP.\n",
+        "\n",
+        "\n",
+        "\u003e **symbols_to_logits_fn** : This is a closure implemented by the users of the API. The input to this closure will be  \n",
+        "```\n",
+        "Args:\n",
+        "  1] ids [batch_size, .. (index + 1 or 1 if padded_decode is True)],\n",
+        "  2] index [scalar] : current decoded step,\n",
+        "  3] cache [nested dictionary of tensors].\n",
+        "Returns:\n",
+        "  1] tensor for next-step logits [batch_size, vocab]\n",
+        "  2] the updated_cache [nested dictionary of tensors].\n",
+        "```\n",
+        "This closure calls the model to predict the logits for the 'index+1' step. The cache is used for faster decoding.\n",
+        "Here is a [reference](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search_test.py#L88) implementation for the above closure.\n",
+        "\n",
+        "\n",
+        "\u003e **length_normalization_fn** : Closure for returning length normalization parameter.\n",
+        "```\n",
+        "Args: \n",
+        "  1] length : scalar for decoded step index.\n",
+        "  2] dtype : data-type of output tensor\n",
+        "Returns:\n",
+        "  1] value of length normalization factor.\n",
+        "Example :\n",
+        "  def _length_norm(length, dtype):\n",
+        "    return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)\n",
+        "```\n",
+        "\n",
+        "\u003e **vocab_size** : Output vocabulary size.\n",
+        "\n",
+        "\u003e **max_decode_length** : Scalar for total number of decoding steps.\n",
+        "\n",
+        "\u003e **eos_id** : Decoding will stop if all output decoded ids in the batch have this ID.\n",
+        "\n",
+        "\u003e **padded_decode** : Set this to True if running on TPU. Tensors are padded to max_decode_length if this is True.\n",
+        "\n",
+        "\u003e **top_k** : top_k is enabled if this value is \u003e 1.\n",
+        "\n",
+        "\u003e **top_p** : top_p is enabled if this value is \u003e 0 and \u003c 1.0\n",
+        "\n",
+        "\u003e **sample_temperature** : This is used to re-estimate the softmax output. Temperature skews the distribution towards high probability tokens and lowers the mass in the tail of the distribution. Value has to be positive. Low temperature is equivalent to greedy and makes the distribution sharper, while high temperature makes it flatter.\n",
+        "\n",
+        "\u003e **enable_greedy** : By default, this is true and greedy decoding is enabled.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "lV1RRp6ihnGX"
+      },
+      "source": [
+        "# Initialize the Model Hyper-parameters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "eTsGp2gaKLdE"
+      },
+      "outputs": [],
+      "source": [
+        "params = {}\n",
+        "params['num_heads'] = 2\n",
+        "params['num_layers'] = 2\n",
+        "params['batch_size'] = 2\n",
+        "params['n_dims'] = 256\n",
+        "params['max_decode_length'] = 4"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "UGvmd0_dRFYI"
+      },
+      "source": [
+        "## What is a Cache?\n",
+        "In auto-regressive architectures like Transformer based [Encoder-Decoder](https://arxiv.org/abs/1706.03762) models, \n",
+        "Cache is used for fast sequential decoding.\n",
+        "It is a nested dictionary storing pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) for every layer.\n",
+        "\n",
+        "```\n",
+        "{\n",
+        "    'layer_%d' % layer: {\n",
+        "        'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
+        "        'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
+        "        } for layer in range(params['num_layers']),\n",
+        "    'model_specific_item' : Model specific tensor shape,\n",
+        "}\n",
+        "\n",
+        "```"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "CYXkoplAij01"
+      },
+      "source": [
+        "# Initialize cache. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "D6kfZOOKgkm1"
+      },
+      "outputs": [],
+      "source": [
+        "cache = {\n",
+        "    'layer_%d' % layer: {\n",
+        "        'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
+        "        'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
+        "        } for layer in range(params['num_layers'])\n",
+        "    }\n",
+        "print(\"cache key shape for layer 1 :\", cache['layer_1']['k'].shape)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "nNY3Xn8SiblP"
+      },
+      "source": [
+        "# Define closure for length normalization. **optional.**\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "T92ccAzlnGqh"
+      },
+      "outputs": [],
+      "source": [
+        "def length_norm(length, dtype):\n",
+        "  \"\"\"Return length normalization factor.\"\"\"\n",
+        "  return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "syl7I5nURPgW"
+      },
+      "source": [
+        "# Create model_fn\n",
+        "  In practice, this will be replaced by an actual model implementation such as [here](https://github.com/tensorflow/models/blob/master/official/nlp/transformer/transformer.py#L236)\n",
+        "```\n",
+        "Args:\n",
+        "i : Step that is being decoded.\n",
+        "Returns:\n",
+        "  probabilities of shape [batch_size, vocab_size] for step i\n",
+        "```\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "AhzSkRisRdB6"
+      },
+      "outputs": [],
+      "source": [
+        "probabilities = tf.constant([[[0.3, 0.4, 0.3], [0.3, 0.3, 0.4],\n",
+        "                              [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],\n",
+        "                            [[0.2, 0.5, 0.3], [0.2, 0.7, 0.1],\n",
+        "                              [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]])\n",
+        "def model_fn(i):\n",
+        "  return probabilities[:, i, :]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "DBMUkaVmVZBg"
+      },
+      "source": [
+        "# Initialize symbols_to_logits_fn\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "FAJ4CpbfVdjr"
+      },
+      "outputs": [],
+      "source": [
+        "def _symbols_to_logits_fn():\n",
+        "  \"\"\"Calculates logits of the next tokens.\"\"\"\n",
+        "  def symbols_to_logits_fn(ids, i, temp_cache):\n",
+        "    del ids\n",
+        "    logits = tf.cast(tf.math.log(model_fn(i)), tf.float32)\n",
+        "    return logits, temp_cache\n",
+        "  return symbols_to_logits_fn"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "R_tV3jyWVL47"
+      },
+      "source": [
+        "# Greedy \n",
+        "Greedy decoding selects the token id with the highest probability as its next id: $id_t = argmax_{w}P(w | id_{1:t-1})$ at each timestep $t$. The following sketch shows greedy decoding. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aGt9idSkVQEJ"
+      },
+      "outputs": [],
+      "source": [
+        "greedy_obj = sampling_module.SamplingModule(\n",
+        "    length_normalization_fn=None,\n",
+        "    dtype=tf.float32,\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    vocab_size=3,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    padded_decode=False)\n",
+        "ids, _ = greedy_obj.generate(\n",
+        "    initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
+        "print(\"Greedy Decoded Ids:\", ids)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "s4pTTsQXVz5O"
+      },
+      "source": [
+        "# top_k sampling\n",
+        "In *Top-K* sampling, the *K* most likely next token ids are filtered and the probability mass is redistributed among only those *K* ids. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "pCLWIn6GV5_G"
+      },
+      "outputs": [],
+      "source": [
+        "top_k_obj = sampling_module.SamplingModule(\n",
+        "    length_normalization_fn=length_norm,\n",
+        "    dtype=tf.float32,\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    vocab_size=3,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    sample_temperature=tf.constant(1.0),\n",
+        "    top_k=tf.constant(3),\n",
+        "    padded_decode=False,\n",
+        "    enable_greedy=False)\n",
+        "ids, _ = top_k_obj.generate(\n",
+        "    initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
+        "print(\"top-k sampled Ids:\", ids)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Jp3G-eE_WI4Y"
+      },
+      "source": [
+        "# top_p sampling\n",
+        "Instead of sampling only from the most likely *K* token ids, *Top-p* sampling chooses from the smallest possible set of ids whose cumulative probability exceeds the probability *p*."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rEGdIWcuWILO"
+      },
+      "outputs": [],
+      "source": [
+        "top_p_obj = sampling_module.SamplingModule(\n",
+        "    length_normalization_fn=length_norm,\n",
+        "    dtype=tf.float32,\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    vocab_size=3,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    sample_temperature=tf.constant(1.0),\n",
+        "    top_p=tf.constant(0.9),\n",
+        "    padded_decode=False,\n",
+        "    enable_greedy=False)\n",
+        "ids, _ = top_p_obj.generate(\n",
+        "    initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
+        "print(\"top-p sampled Ids:\", ids)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "2hcuyJ2VWjDz"
+      },
+      "source": [
+        "# Beam search decoding\n",
+        "Beam search reduces the risk of missing hidden high probability token ids by keeping the most likely num_beams of hypotheses at each time step and eventually choosing the hypothesis that has the overall highest probability. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "cJ3WzvSrWmSA"
+      },
+      "outputs": [],
+      "source": [
+        "beam_size = 2\n",
+        "params['batch_size'] = 1\n",
+        "beam_cache = {\n",
+        "    'layer_%d' % layer: {\n",
+        "        'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32),\n",
+        "        'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32)\n",
+        "        } for layer in range(params['num_layers'])\n",
+        "    }\n",
+        "print(\"cache key shape for layer 1 :\", beam_cache['layer_1']['k'].shape)\n",
+        "ids, _ = beam_search.sequence_beam_search(\n",
+        "    symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
+        "    initial_ids=tf.constant([9], tf.int32),\n",
+        "    initial_cache=beam_cache,\n",
+        "    vocab_size=3,\n",
+        "    beam_size=beam_size,\n",
+        "    alpha=0.6,\n",
+        "    max_decode_length=params['max_decode_length'],\n",
+        "    eos_id=10,\n",
+        "    padded_decode=False,\n",
+        "    dtype=tf.float32)\n",
+        "print(\"Beam search ids:\", ids)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "decoding_api_in_tf_nlp.ipynb",
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/official/colab/nlp/customize_encoder.ipynb
+++ b/official/colab/nlp/customize_encoder.ipynb
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Customizing a Transformer Encoder",
+      "private_outputs": true,
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Bp8t2AI8i7uP"
+      },
+      "source": [
+        "##### Copyright 2020 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "cellView": "form",
+        "id": "rxPj2Lsni9O4"
+      },
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6xS-9i5DrRvO"
+      },
+      "source": [
+        "# Customizing a Transformer Encoder"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Mwb9uw1cDXsa"
+      },
+      "source": [
+        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+        "  </td>\n",
+        "</table>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "iLrcV4IyrcGX"
+      },
+      "source": [
+        "## Learning objectives\n",
+        "\n",
+        "The [TensorFlow Models NLP library](https://github.com/tensorflow/models/tree/master/official/nlp/modeling) is a collection of tools for building and training modern high performance natural language models.\n",
+        "\n",
+        "The [TransformEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py) is the core of this library, and lots of new network architectures are proposed to improve the encoder. In this Colab notebook, we will learn how to customize the encoder to employ new network architectures."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "YYxdyoWgsl8t"
+      },
+      "source": [
+        "## Install and import"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fEJSFutUsn_h"
+      },
+      "source": [
+        "### Install the TensorFlow Model Garden pip package\n",
+        "\n",
+        "*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
+        "which is the nightly Model Garden package created daily automatically.\n",
+        "*  `pip` will install all models and dependencies automatically."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "thsKZDjhswhR"
+      },
+      "source": [
+        "!pip install -q tf-models-official==2.4.0"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hpf7JPCVsqtv"
+      },
+      "source": [
+        "### Import Tensorflow and other libraries"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "my4dp-RMssQe"
+      },
+      "source": [
+        "import numpy as np\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "from official.modeling import activations\n",
+        "from official.nlp import modeling\n",
+        "from official.nlp.modeling import layers, losses, models, networks"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vjDmVsFfs85n"
+      },
+      "source": [
+        "## Canonical BERT encoder\n",
+        "\n",
+        "Before learning how to customize the encoder, let's first create a canonical BERT encoder and use it to instantiate a `BertClassifier` for a classification task."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Oav8sbgstWc-"
+      },
+      "source": [
+        "cfg = {\n",
+        "    \"vocab_size\": 100,\n",
+        "    \"hidden_size\": 32,\n",
+        "    \"num_layers\": 3,\n",
+        "    \"num_attention_heads\": 4,\n",
+        "    \"intermediate_size\": 64,\n",
+        "    \"activation\": activations.gelu,\n",
+        "    \"dropout_rate\": 0.1,\n",
+        "    \"attention_dropout_rate\": 0.1,\n",
+        "    \"max_sequence_length\": 16,\n",
+        "    \"type_vocab_size\": 2,\n",
+        "    \"initializer\": tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
+        "}\n",
+        "bert_encoder = modeling.networks.BertEncoder(**cfg)\n",
+        "\n",
+        "def build_classifier(bert_encoder):\n",
+        "  return modeling.models.BertClassifier(bert_encoder, num_classes=2)\n",
+        "\n",
+        "canonical_classifier_model = build_classifier(bert_encoder)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Qe2UWI6_tsHo"
+      },
+      "source": [
+        "`canonical_classifier_model` can be trained using the training data. For details about how to train the model, please see the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb). We skip the code that trains the model here.\n",
+        "\n",
+        "After training, we can apply the model to do prediction.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "csED2d-Yt5h6"
+      },
+      "source": [
+        "def predict(model):\n",
+        "  batch_size = 3\n",
+        "  np.random.seed(0)\n",
+        "  word_ids = np.random.randint(\n",
+        "      cfg[\"vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
+        "  mask = np.random.randint(2, size=(batch_size, cfg[\"max_sequence_length\"]))\n",
+        "  type_ids = np.random.randint(\n",
+        "      cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
+        "  print(model([word_ids, mask, type_ids], training=False))\n",
+        "\n",
+        "predict(canonical_classifier_model)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "PzKStEK9t_Pb"
+      },
+      "source": [
+        "## Customize BERT encoder\n",
+        "\n",
+        "One BERT encoder consists of an embedding network and multiple transformer blocks, and each transformer block contains an attention layer and a feedforward layer."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "rmwQfhj6fmKz"
+      },
+      "source": [
+        "We provide easy ways to customize each of those components via (1)\n",
+        "[EncoderScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py) and (2) [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "xsMgEVHAui11"
+      },
+      "source": [
+        "### Use EncoderScaffold\n",
+        "\n",
+        "`EncoderScaffold` allows users to provide a custom embedding subnetwork\n",
+        "  (which will replace the standard embedding logic) and/or a custom hidden layer class (which will replace the `Transformer` instantiation in the encoder)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-JBabpa2AOz8"
+      },
+      "source": [
+        "#### Without Customization\n",
+        "\n",
+        "Without any customization, `EncoderScaffold` behaves the same as the canonical `BertEncoder`.\n",
+        "\n",
+        "As shown in the following example, `EncoderScaffold` can load `BertEncoder`'s weights and output the same values:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ktNzKuVByZQf"
+      },
+      "source": [
+        "default_hidden_cfg = dict(\n",
+        "    num_attention_heads=cfg[\"num_attention_heads\"],\n",
+        "    intermediate_size=cfg[\"intermediate_size\"],\n",
+        "    intermediate_activation=activations.gelu,\n",
+        "    dropout_rate=cfg[\"dropout_rate\"],\n",
+        "    attention_dropout_rate=cfg[\"attention_dropout_rate\"],\n",
+        "    kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
+        ")\n",
+        "default_embedding_cfg = dict(\n",
+        "    vocab_size=cfg[\"vocab_size\"],\n",
+        "    type_vocab_size=cfg[\"type_vocab_size\"],\n",
+        "    hidden_size=cfg[\"hidden_size\"],\n",
+        "    initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
+        "    dropout_rate=cfg[\"dropout_rate\"],\n",
+        "    max_seq_length=cfg[\"max_sequence_length\"]\n",
+        ")\n",
+        "default_kwargs = dict(\n",
+        "    hidden_cfg=default_hidden_cfg,\n",
+        "    embedding_cfg=default_embedding_cfg,\n",
+        "    num_hidden_instances=cfg[\"num_layers\"],\n",
+        "    pooled_output_dim=cfg[\"hidden_size\"],\n",
+        "    return_all_layer_outputs=True,\n",
+        "    pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
+        ")\n",
+        "\n",
+        "encoder_scaffold = modeling.networks.EncoderScaffold(**default_kwargs)\n",
+        "classifier_model_from_encoder_scaffold = build_classifier(encoder_scaffold)\n",
+        "classifier_model_from_encoder_scaffold.set_weights(\n",
+        "    canonical_classifier_model.get_weights())\n",
+        "predict(classifier_model_from_encoder_scaffold)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "sMaUmLyIuwcs"
+      },
+      "source": [
+        "#### Customize Embedding\n",
+        "\n",
+        "Next, we show how to use a customized embedding network.\n",
+        "\n",
+        "We first build an embedding network that will replace the default network. This one will have 2 inputs (`mask` and `word_ids`) instead of 3, and won't use positional embeddings."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "LTinnaG6vcsw"
+      },
+      "source": [
+        "word_ids = tf.keras.layers.Input(\n",
+        "    shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
+        "mask = tf.keras.layers.Input(\n",
+        "    shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
+        "embedding_layer = modeling.layers.OnDeviceEmbedding(\n",
+        "    vocab_size=cfg['vocab_size'],\n",
+        "    embedding_width=cfg['hidden_size'],\n",
+        "    initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
+        "    name=\"word_embeddings\")\n",
+        "word_embeddings = embedding_layer(word_ids)\n",
+        "attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])\n",
+        "new_embedding_network = tf.keras.Model([word_ids, mask],\n",
+        "                                       [word_embeddings, attention_mask])"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "HN7_yu-6O3qI"
+      },
+      "source": [
+        "Inspecting `new_embedding_network`, we can see it takes two inputs:\n",
+        "`input_word_ids` and `input_mask`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "fO9zKFE4OpHp"
+      },
+      "source": [
+        "tf.keras.utils.plot_model(new_embedding_network, show_shapes=True, dpi=48)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9cOaGQHLv12W"
+      },
+      "source": [
+        "We then can build a new encoder using the above `new_embedding_network`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "mtFDMNf2vIl9"
+      },
+      "source": [
+        "kwargs = dict(default_kwargs)\n",
+        "\n",
+        "# Use new embedding network.\n",
+        "kwargs['embedding_cls'] = new_embedding_network\n",
+        "kwargs['embedding_data'] = embedding_layer.embeddings\n",
+        "\n",
+        "encoder_with_customized_embedding = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder_with_customized_embedding)\n",
+        "# ... Train the model ...\n",
+        "print(classifier_model.inputs)\n",
+        "\n",
+        "# Assert that there are only two inputs.\n",
+        "assert len(classifier_model.inputs) == 2"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Z73ZQDtmwg9K"
+      },
+      "source": [
+        "#### Customized Transformer\n",
+        "\n",
+        "Users can also override the [hidden_cls](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py#L103) argument in `EncoderScaffold`'s constructor to employ a customized Transformer layer.\n",
+        "\n",
+        "See [ReZeroTransformer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/rezero_transformer.py) for how to implement a customized Transformer layer.\n",
+        "\n",
+        "Following is an example of using `ReZeroTransformer`:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "uAIarLZgw6pA"
+      },
+      "source": [
+        "kwargs = dict(default_kwargs)\n",
+        "\n",
+        "# Use ReZeroTransformer.\n",
+        "kwargs['hidden_cls'] = modeling.layers.ReZeroTransformer\n",
+        "\n",
+        "encoder_with_rezero_transformer = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder_with_rezero_transformer)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)\n",
+        "\n",
+        "# Assert that the variable `rezero_alpha` from ReZeroTransformer exists.\n",
+        "assert 'rezero_alpha' in ''.join([x.name for x in classifier_model.trainable_weights])"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6PMHFdvnxvR0"
+      },
+      "source": [
+        "### Use [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py)\n",
+        "\n",
+        "The above method of customizing `Transformer` requires rewriting the whole `Transformer` layer, while sometimes you may only want to customize either attention layer or feedforward block. In this case, [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py) can be used.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "D6FejlgwyAy_"
+      },
+      "source": [
+        "#### Customize Attention Layer\n",
+        "\n",
+        "Users can also override the [attention_cls](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py#L45) argument in `TransformerScaffold`'s constructor to employ a customized Attention layer.\n",
+        "\n",
+        "See [TalkingHeadsAttention](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/talking_heads_attention.py) for how to implement a customized `Attention` layer.\n",
+        "\n",
+        "Following is an example of using [TalkingHeadsAttention](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/talking_heads_attention.py):"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "nFrSMrZuyNeQ"
+      },
+      "source": [
+        "# Use TalkingHeadsAttention\n",
+        "hidden_cfg = dict(default_hidden_cfg)\n",
+        "hidden_cfg['attention_cls'] = modeling.layers.TalkingHeadsAttention\n",
+        "\n",
+        "kwargs = dict(default_kwargs)\n",
+        "kwargs['hidden_cls'] = modeling.layers.TransformerScaffold\n",
+        "kwargs['hidden_cfg'] = hidden_cfg\n",
+        "\n",
+        "encoder = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)\n",
+        "\n",
+        "# Assert that the variable `pre_softmax_weight` from TalkingHeadsAttention exists.\n",
+        "assert 'pre_softmax_weight' in ''.join([x.name for x in classifier_model.trainable_weights])"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "kuEJcTyByVvI"
+      },
+      "source": [
+        "#### Customize Feedforward Layer\n",
+        "\n",
+        "Similarly, one could also customize the feedforward layer.\n",
+        "\n",
+        "See [GatedFeedforward](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/gated_feedforward.py) for how to implement a customized feedforward layer.\n",
+        "\n",
+        "Following is an example of using [GatedFeedforward](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/gated_feedforward.py)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XAbKy_l4y_-i"
+      },
+      "source": [
+        "# Use GatedFeedforward\n",
+        "hidden_cfg = dict(default_hidden_cfg)\n",
+        "hidden_cfg['feedforward_cls'] = modeling.layers.GatedFeedforward\n",
+        "\n",
+        "kwargs = dict(default_kwargs)\n",
+        "kwargs['hidden_cls'] = modeling.layers.TransformerScaffold\n",
+        "kwargs['hidden_cfg'] = hidden_cfg\n",
+        "\n",
+        "encoder_with_gated_feedforward = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder_with_gated_feedforward)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)\n",
+        "\n",
+        "# Assert that the variable `gate` from GatedFeedforward exists.\n",
+        "assert 'gate' in ''.join([x.name for x in classifier_model.trainable_weights])"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "a_8NWUhkzeAq"
+      },
+      "source": [
+        "### Build a new Encoder using building blocks from KerasBERT.\n",
+        "\n",
+        "Finally, you could also build a new encoder using building blocks in the modeling library.\n",
+        "\n",
+        "See [AlbertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_encoder.py) as an example:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xsiA3RzUzmUM"
+      },
+      "source": [
+        "albert_encoder = modeling.networks.AlbertEncoder(**cfg)\n",
+        "classifier_model = build_classifier(albert_encoder)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "MeidDfhlHKSO"
+      },
+      "source": [
+        "Inspecting the `albert_encoder`, we see it stacks the same `Transformer` layer multiple times."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Uv_juT22HERW"
+      },
+      "source": [
+        "tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
--- a/official/colab/nlp/nlp_modeling_library_intro.ipynb
+++ b/official/colab/nlp/nlp_modeling_library_intro.ipynb
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "80xnUmoI7fBX"
+      },
+      "source": [
+        "##### Copyright 2020 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "8nvTnfs6Q692"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "WmfcMK5P5C1G"
+      },
+      "source": [
+        "# Introduction to the TensorFlow Models NLP library"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "cH-oJ8R6AHMK"
+      },
+      "source": [
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0H_EFIhq4-MJ"
+      },
+      "source": [
+        "## Learning objectives\n",
+        "\n",
+        "In this Colab notebook, you will learn how to build transformer-based models for common NLP tasks including pretraining, span labeling and classification using the building blocks from [NLP modeling library](https://github.com/tensorflow/models/tree/master/official/nlp/modeling)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "2N97-dps_nUk"
+      },
+      "source": [
+        "## Install and import"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "459ygAVl_rg0"
+      },
+      "source": [
+        "### Install the TensorFlow Model Garden pip package\n",
+        "\n",
+        "*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
+        "which is the nightly Model Garden package created daily automatically.\n",
+        "*  `pip` will install all models and dependencies automatically."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Y-qGkdh6_sZc"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -q tf-models-official==2.4.0"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "e4huSSwyAG_5"
+      },
+      "source": [
+        "### Import Tensorflow and other libraries"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "jqYXqtjBAJd9"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "from official.nlp import modeling\n",
+        "from official.nlp.modeling import layers, losses, models, networks"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "djBQWjvy-60Y"
+      },
+      "source": [
+        "## BERT pretraining model\n",
+        "\n",
+        "BERT ([Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)) introduced the method of pre-training language representations on a large text corpus and then using that model for downstream NLP tasks.\n",
+        "\n",
+        "In this section, we will learn how to build a model to pretrain BERT on the masked language modeling task and next sentence prediction task. For simplicity, we only show the minimum example and use dummy data."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "MKuHVlsCHmiq"
+      },
+      "source": [
+        "### Build a `BertPretrainer` model wrapping `BertEncoder`\n",
+        "\n",
+        "The [BertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/bert_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
+        "\n",
+        "The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "EXkcXz-9BwB3"
+      },
+      "outputs": [],
+      "source": [
+        "# Build a small transformer network.\n",
+        "vocab_size = 100\n",
+        "sequence_length = 16\n",
+        "network = modeling.networks.BertEncoder(\n",
+        "    vocab_size=vocab_size, num_layers=2, sequence_length=16)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0NH5irV5KTMS"
+      },
+      "source": [
+        "Inspecting the encoder, we see it contains a few embedding layers and stacked `Transformer` layers, and is connected to three input layers:\n",
+        "\n",
+        "`input_word_ids`, `input_type_ids` and `input_mask`.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "lZNoZkBrIoff"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(network, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "o7eFOZXiIl-b"
+      },
+      "outputs": [],
+      "source": [
+        "# Create a BERT pretrainer with the created network.\n",
+        "num_token_predictions = 8\n",
+        "bert_pretrainer = modeling.models.BertPretrainer(\n",
+        "    network, num_classes=2, num_token_predictions=num_token_predictions, output='predictions')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "d5h5HT7gNHx_"
+      },
+      "source": [
+        "Inspecting the `bert_pretrainer`, we see it wraps the `encoder` with additional `MaskedLM` and `Classification` heads."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "2tcNfm03IBF7"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(bert_pretrainer, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "F2oHrXGUIS0M"
+      },
+      "outputs": [],
+      "source": [
+        "# We can feed some dummy data to get masked language model and sentence output.\n",
+        "batch_size = 2\n",
+        "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
+        "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "masked_lm_positions_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n",
+        "\n",
+        "outputs = bert_pretrainer(\n",
+        "    [word_id_data, mask_data, type_id_data, masked_lm_positions_data])\n",
+        "lm_output = outputs[\"masked_lm\"]\n",
+        "sentence_output = outputs[\"classification\"]\n",
+        "print(lm_output)\n",
+        "print(sentence_output)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "bnx3UCHniCS5"
+      },
+      "source": [
+        "### Compute loss\n",
+        "Next, we can use `lm_output` and `sentence_output` to compute `loss`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "k30H4Q86f52x"
+      },
+      "outputs": [],
+      "source": [
+        "masked_lm_ids_data = np.random.randint(vocab_size, size=(batch_size, num_token_predictions))\n",
+        "masked_lm_weights_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n",
+        "next_sentence_labels_data = np.random.randint(2, size=(batch_size))\n",
+        "\n",
+        "mlm_loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
+        "    labels=masked_lm_ids_data,\n",
+        "    predictions=lm_output,\n",
+        "    weights=masked_lm_weights_data)\n",
+        "sentence_loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
+        "    labels=next_sentence_labels_data,\n",
+        "    predictions=sentence_output)\n",
+        "loss = mlm_loss + sentence_loss\n",
+        "print(loss)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "wrmSs8GjHxVw"
+      },
+      "source": [
+        "With the loss, you can optimize the model.\n",
+        "After training, we can save the weights of TransformerEncoder for the downstream fine-tuning tasks. Please see [run_pretraining.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_pretraining.py) for the full example.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "k8cQVFvBCV4s"
+      },
+      "source": [
+        "## Span labeling model\n",
+        "\n",
+        "Span labeling is the task of assigning labels to a span of text, for example, labeling a span of text as the answer to a given question.\n",
+        "\n",
+        "In this section, we will learn how to build a span labeling model. Again, we use dummy data for simplicity."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "xrLLEWpfknUW"
+      },
+      "source": [
+        "### Build a BertSpanLabeler wrapping BertEncoder\n",
+        "\n",
+        "[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n",
+        "\n",
+        "Note that `BertSpanLabeler` wraps a `BertEncoder`, the weights of which can be restored from the above pretraining model.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "B941M4iUCejO"
+      },
+      "outputs": [],
+      "source": [
+        "network = modeling.networks.BertEncoder(\n",
+        "        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
+        "\n",
+        "# Create a BERT span labeler with the created network.\n",
+        "bert_span_labeler = modeling.models.BertSpanLabeler(network)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "QpB9pgj4PpMg"
+      },
+      "source": [
+        "Inspecting the `bert_span_labeler`, we see it wraps the encoder with additional `SpanLabeling` that outputs `start_position` and `end_position`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "RbqRNJCLJu4H"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(bert_span_labeler, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "fUf1vRxZJwio"
+      },
+      "outputs": [],
+      "source": [
+        "# Create a set of 2-dimensional data tensors to feed into the model.\n",
+        "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
+        "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "\n",
+        "# Feed the data to the model.\n",
+        "start_logits, end_logits = bert_span_labeler([word_id_data, mask_data, type_id_data])\n",
+        "print(start_logits)\n",
+        "print(end_logits)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "WqhgQaN1lt-G"
+      },
+      "source": [
+        "### Compute loss\n",
+        "With `start_logits` and `end_logits`, we can compute loss:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "waqs6azNl3Nn"
+      },
+      "outputs": [],
+      "source": [
+        "start_positions = np.random.randint(sequence_length, size=(batch_size))\n",
+        "end_positions = np.random.randint(sequence_length, size=(batch_size))\n",
+        "\n",
+        "start_loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
+        "    start_positions, start_logits, from_logits=True)\n",
+        "end_loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
+        "    end_positions, end_logits, from_logits=True)\n",
+        "\n",
+        "total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2\n",
+        "print(total_loss)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Zdf03YtZmd_d"
+      },
+      "source": [
+        "With the `loss`, you can optimize the model. Please see [run_squad.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_squad.py) for the full example."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0A1XnGSTChg9"
+      },
+      "source": [
+        "## Classification model\n",
+        "\n",
+        "In the last section, we show how to build a text classification model.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "MSK8OpZgnQa9"
+      },
+      "source": [
+        "### Build a BertClassifier model wrapping BertEncoder\n",
+        "\n",
+        "[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "cXXCsffkCphk"
+      },
+      "outputs": [],
+      "source": [
+        "network = modeling.networks.BertEncoder(\n",
+        "        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
+        "\n",
+        "# Create a BERT classifier with the created network.\n",
+        "num_classes = 2\n",
+        "bert_classifier = modeling.models.BertClassifier(\n",
+        "    network, num_classes=num_classes)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8tZKueKYP4bB"
+      },
+      "source": [
+        "Inspecting the `bert_classifier`, we see it wraps the `encoder` with an additional `Classification` head."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "snlutm9ZJgEZ"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "yyHPHsqBJkCz"
+      },
+      "outputs": [],
+      "source": [
+        "# Create a set of 2-dimensional data tensors to feed into the model.\n",
+        "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
+        "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "\n",
+        "# Feed the data to the model.\n",
+        "logits = bert_classifier([word_id_data, mask_data, type_id_data])\n",
+        "print(logits)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "w--a2mg4nzKm"
+      },
+      "source": [
+        "### Compute loss\n",
+        "\n",
+        "With `logits`, we can compute `loss`:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "9X0S1DoFn_5Q"
+      },
+      "outputs": [],
+      "source": [
+        "labels = np.random.randint(num_classes, size=(batch_size))\n",
+        "\n",
+        "loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
+        "    labels, logits, from_logits=True)\n",
+        "print(loss)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mzBqOylZo3og"
+      },
+      "source": [
+        "With the `loss`, you can optimize the model. Please see [run_classifier.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py) or the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb) for the full example."
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "Introduction to the TensorFlow Models NLP library",
+      "private_outputs": true,
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/official/common/distribute_utils_test.py
+++ b/official/common/distribute_utils_test.py
@@ -14,10 +14,13 @@

 """Tests for distribution util functions."""

+import sys
 import tensorflow as tf

 from official.common import distribute_utils

+TPU_TEST = 'test_tpu' in sys.argv[0]
+

 class DistributeUtilsTest(tf.test.TestCase):
  """Tests for distribute util functions."""
@@ -51,6 +54,9 @@ class DistributeUtilsTest(tf.test.TestCase):
    self.assertIn('GPU', ds.extended.worker_devices[0])

  def test_mirrored_strategy(self):
+    # CPU only.
+    _ = distribute_utils.get_distribution_strategy(num_gpus=0)
+    # 5 GPUs.
    ds = distribute_utils.get_distribution_strategy(num_gpus=5)
    self.assertEquals(ds.num_replicas_in_sync, 5)
    self.assertEquals(len(ds.extended.worker_devices), 5)
@@ -78,10 +84,26 @@ class DistributeUtilsTest(tf.test.TestCase):
    self.assertIsInstance(
        ds, tf.distribute.experimental.MultiWorkerMirroredStrategy)

+    with self.assertRaisesRegex(
+        ValueError,
+        'When used with `multi_worker_mirrored`, valid values.*'):
+      _ = distribute_utils.get_distribution_strategy(
+          'multi_worker_mirrored', all_reduce_alg='dummy')
+
  def test_no_strategy(self):
    ds = distribute_utils.get_distribution_strategy('off')
    self.assertIs(ds, tf.distribute.get_strategy())

+  def test_tpu_strategy(self):
+    if not TPU_TEST:
+      self.skipTest('Only Cloud TPU VM instances can have local TPUs.')
+    with self.assertRaises(ValueError):
+      _ = distribute_utils.get_distribution_strategy('tpu')
+
+    ds = distribute_utils.get_distribution_strategy('tpu', tpu_address='local')
+    self.assertIsInstance(
+        ds, tf.distribute.TPUStrategy)
+
  def test_invalid_strategy(self):
    with self.assertRaisesRegexp(
        ValueError,

--- a/official/core/actions.py
+++ b/official/core/actions.py
@@ -28,7 +28,7 @@ from official.core import config_definitions
 from official.modeling import optimization


-class PruningActions:
+class PruningAction:
  """Train action to updates pruning related information.

  This action updates pruning steps at the end of trainig loop, and log
@@ -66,7 +66,7 @@ class PruningActions:
    """Update pruning step and log pruning summaries.

    Args:
-      output: The train output to test.
+      output: The train output.
    """
    self.update_pruning_step.on_epoch_end(batch=None)
    self.pruning_summaries.on_epoch_begin(epoch=None)
@@ -81,8 +81,11 @@ class EMACheckpointing:
  than training.
  """

-  def __init__(self, export_dir: str, optimizer: tf.keras.optimizers.Optimizer,
-               checkpoint: tf.train.Checkpoint, max_to_keep: int = 1):
+  def __init__(self,
+               export_dir: str,
+               optimizer: tf.keras.optimizers.Optimizer,
+               checkpoint: tf.train.Checkpoint,
+               max_to_keep: int = 1):
    """Initializes the instance.

    Args:
@@ -99,8 +102,7 @@ class EMACheckpointing:
                       'EMACheckpointing action')

    export_dir = os.path.join(export_dir, 'ema_checkpoints')
-    tf.io.gfile.makedirs(
-        os.path.dirname(export_dir))
+    tf.io.gfile.makedirs(os.path.dirname(export_dir))
    self._optimizer = optimizer
    self._checkpoint = checkpoint
    self._checkpoint_manager = tf.train.CheckpointManager(
@@ -113,7 +115,7 @@ class EMACheckpointing:
    """Swaps model weights, and saves the checkpoint.

    Args:
-      output: The train or eval output to test.
+      output: The train or eval output.
    """
    self._optimizer.swap_weights()
    self._checkpoint_manager.save(checkpoint_number=self._optimizer.iterations)
@@ -173,10 +175,9 @@ class RecoveryCondition:


 @gin.configurable
-def get_eval_actions(
-    params: config_definitions.ExperimentConfig,
-    trainer: base_trainer.Trainer,
-    model_dir: str) -> List[orbit.Action]:
+def get_eval_actions(params: config_definitions.ExperimentConfig,
+                     trainer: base_trainer.Trainer,
+                     model_dir: str) -> List[orbit.Action]:
  """Gets eval actions for TFM trainer."""
  eval_actions = []
  # Adds ema checkpointing action to save the average weights under
@@ -202,7 +203,7 @@ def get_train_actions(
  # Adds pruning callback actions.
  if hasattr(params.task, 'pruning'):
    train_actions.append(
-        PruningActions(
+        PruningAction(
            export_dir=model_dir,
            model=trainer.model,
            optimizer=trainer.optimizer))

--- a/official/core/actions_test.py
+++ b/official/core/actions_test.py
@@ -27,14 +27,16 @@ from official.core import actions
 from official.modeling import optimization


-class TestModel(tf.Module):
+class TestModel(tf.keras.Model):

  def __init__(self):
-    self.value = tf.Variable(0)
+    super().__init__()
+    self.value = tf.Variable(0.0)
+    self.dense = tf.keras.layers.Dense(2)
+    _ = self.dense(tf.zeros((2, 2), tf.float32))

-  @tf.function(input_signature=[])
-  def __call__(self):
-    return self.value
+  def call(self, x, training=None):
+    return self.value + x


 class ActionsTest(tf.test.TestCase, parameterized.TestCase):
@@ -43,7 +45,7 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
      combinations.combine(
          distribution=[
              strategy_combinations.cloud_tpu_strategy,
-              strategy_combinations.one_device_strategy_gpu,
+              strategy_combinations.one_device_strategy,
          ],))
  def test_ema_checkpointing(self, distribution):
    with distribution.scope():
@@ -62,18 +64,25 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
      model.value.assign(3)

      # Checks model.value is 3
-      self.assertEqual(model(), 3)
+      self.assertEqual(model(0.), 3)
      ema_action = actions.EMACheckpointing(directory, optimizer, checkpoint)

      ema_action({})
      self.assertNotEmpty(
          tf.io.gfile.glob(os.path.join(directory, 'ema_checkpoints')))

-      checkpoint.read(tf.train.latest_checkpoint(
-          os.path.join(directory, 'ema_checkpoints')))
+      checkpoint.read(
+          tf.train.latest_checkpoint(
+              os.path.join(directory, 'ema_checkpoints')))

      # Checks model.value is 0 after swapping.
-      self.assertEqual(model(), 0)
+      self.assertEqual(model(0.), 0)
+
+      # Raises an error for a normal optimizer.
+      with self.assertRaisesRegex(ValueError,
+                                  'Optimizer has to be instance of.*'):
+        _ = actions.EMACheckpointing(directory, tf.keras.optimizers.SGD(),
+                                     checkpoint)

  @combinations.generate(
      combinations.combine(
@@ -102,6 +111,21 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
      with self.assertRaises(RuntimeError):
        recover_condition(outputs)

+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              strategy_combinations.one_device_strategy_gpu,
+              strategy_combinations.one_device_strategy,
+          ],))
+  def test_pruning(self, distribution):
+    with distribution.scope():
+      directory = self.get_temp_dir()
+      model = TestModel()
+      optimizer = tf.keras.optimizers.SGD()
+      pruning = actions.PruningAction(directory, model, optimizer)
+
+      pruning({})
+

 if __name__ == '__main__':
  tf.test.main()
--- a/official/core/base_trainer.py
+++ b/official/core/base_trainer.py
@@ -247,14 +247,12 @@ class Trainer(_AsyncTrainer):
    self._validation_loss = tf.keras.metrics.Mean(
        "validation_loss", dtype=tf.float32)
    model_metrics = model.metrics if hasattr(model, "metrics") else []
-    self._train_metrics = self.task.build_metrics(
-        training=True) + model_metrics
-    self._validation_metrics = self.task.build_metrics(
-        training=False) + model_metrics

    self.init_async()

    if train:
+      self._train_metrics = self.task.build_metrics(
+          training=True) + model_metrics
      train_dataset = train_dataset or self.distribute_dataset(
          self.task.build_inputs, self.config.task.train_data)
      orbit.StandardTrainer.__init__(
@@ -266,6 +264,8 @@ class Trainer(_AsyncTrainer):
              use_tpu_summary_optimization=config.trainer.allow_tpu_summary))

    if evaluate:
+      self._validation_metrics = self.task.build_metrics(
+          training=False) + model_metrics
      validation_dataset = validation_dataset or self.distribute_dataset(
          self.task.build_inputs, self.config.task.validation_data)
      orbit.StandardEvaluator.__init__(
@@ -370,16 +370,6 @@ class Trainer(_AsyncTrainer):
    """Accesses the training checkpoint."""
    return self._checkpoint

-  # TODO(yejiayu): Remove this once all deps are fixed.
-  def add_recovery(self, params: TrainerConfig,
-                   checkpoint_manager: tf.train.CheckpointManager):
-    if params.recovery_max_trials >= 0:
-      self._recovery = Recovery(
-          loss_upper_bound=params.loss_upper_bound,
-          recovery_begin_steps=params.recovery_begin_steps,
-          recovery_max_trials=params.recovery_max_trials,
-          checkpoint_manager=checkpoint_manager)
-
  def train_loop_end(self):
    """See base class."""
    self.join()

--- a/official/core/export_base.py
+++ b/official/core/export_base.py
@@ -16,10 +16,13 @@

 import abc
 import functools
+import time
 from typing import Any, Callable, Dict, Mapping, List, Optional, Text, Union

+from absl import logging
 import tensorflow as tf
-from tensorflow.python.saved_model.model_utils import export_utils
+
+MAX_DIRECTORY_CREATION_ATTEMPTS = 10


 class ExportModule(tf.Module, metaclass=abc.ABCMeta):
@@ -89,7 +92,8 @@ def export(export_module: ExportModule,
           export_savedmodel_dir: Text,
           checkpoint_path: Optional[Text] = None,
           timestamped: bool = True,
-           save_options: Optional[tf.saved_model.SaveOptions] = None) -> Text:
+           save_options: Optional[tf.saved_model.SaveOptions] = None,
+           checkpoint: Optional[tf.train.Checkpoint] = None) -> Text:
  """Exports to SavedModel format.

  Args:
@@ -101,6 +105,8 @@ def export(export_module: ExportModule,
    checkpoint_path: Object-based checkpoint path or directory.
    timestamped: Whether to export the savedmodel to a timestamped directory.
    save_options: `SaveOptions` for `tf.saved_model.save`.
+    checkpoint: An optional tf.train.Checkpoint. If provided, the export module
+      will use it to read the weights.

  Returns:
    The savedmodel directory path.
@@ -109,7 +115,8 @@ def export(export_module: ExportModule,
  if ckpt_dir_or_file is not None and tf.io.gfile.isdir(ckpt_dir_or_file):
    ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
  if ckpt_dir_or_file:
-    checkpoint = tf.train.Checkpoint(model=export_module.model)
+    if checkpoint is None:
+      checkpoint = tf.train.Checkpoint(model=export_module.model)
    checkpoint.read(
        ckpt_dir_or_file).assert_existing_objects_matched().expect_partial()
  if isinstance(function_keys, list):
@@ -119,15 +126,48 @@ def export(export_module: ExportModule,
      }
    else:
      raise ValueError(
-          "If the function_keys is a list, it must contain a single element. %s"
+          'If the function_keys is a list, it must contain a single element. %s'
          % function_keys)

  signatures = export_module.get_inference_signatures(function_keys)
  if timestamped:
-    export_dir = export_utils.get_timestamped_export_dir(
-        export_savedmodel_dir).decode("utf-8")
+    export_dir = get_timestamped_export_dir(export_savedmodel_dir).decode(
+        'utf-8')
  else:
    export_dir = export_savedmodel_dir
  tf.saved_model.save(
      export_module, export_dir, signatures=signatures, options=save_options)
  return export_dir
+
+
+def get_timestamped_export_dir(export_dir_base):
+  """Builds a path to a new subdirectory within the base directory.
+
+  Args:
+    export_dir_base: A string containing a directory to write the exported graph
+      and checkpoints.
+
+  Returns:
+    The full path of the new subdirectory (which is not actually created yet).
+
+  Raises:
+    RuntimeError: if repeated attempts fail to obtain a unique timestamped
+      directory name.
+  """
+  attempts = 0
+  while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
+    timestamp = int(time.time())
+
+    result_dir = tf.io.gfile.join(
+        tf.compat.as_bytes(export_dir_base), tf.compat.as_bytes(str(timestamp)))
+    if not tf.io.gfile.exists(result_dir):
+      # Collisions are still possible (though extremely unlikely): this
+      # directory is not actually created yet, but it will be almost
+      # instantly on return from this function.
+      return result_dir
+    time.sleep(1)
+    attempts += 1
+    logging.warning('Directory %s already exists; retrying (attempt %s/%s)',
+                    str(result_dir), attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)
+  raise RuntimeError('Failed to obtain a unique export directory name after '
+                     f'{MAX_DIRECTORY_CREATION_ATTEMPTS} attempts.')
--- a/official/core/export_base_test.py
+++ b/official/core/export_base_test.py
@@ -121,6 +121,13 @@ class ExportBaseTest(tf.test.TestCase):
    output = module.serve(inputs)
    self.assertAllClose(output['outputs'].numpy(), 1.11)

+  def test_get_timestamped_export_dir(self):
+    export_dir = self.get_temp_dir()
+    timed_dir = export_base.get_timestamped_export_dir(
+        export_dir_base=export_dir)
+    self.assertFalse(tf.io.gfile.exists(timed_dir))
+    self.assertIn(export_dir, str(timed_dir))
+

 if __name__ == '__main__':
  tf.test.main()
--- a/official/core/train_utils.py
+++ b/official/core/train_utils.py
@@ -14,13 +14,13 @@

 """Training utils."""
 import copy
+import dataclasses
 import json
 import os
 import pprint
 from typing import Any, Callable, Dict, List, Optional, Union

 from absl import logging
-import dataclasses
 import gin
 import orbit
 import tensorflow as tf
@@ -244,49 +244,87 @@ class ParseConfigOptions:
    return name in dataclasses.asdict(self)


-def parse_configuration(flags_obj, lock_return=True, print_return=True):
-  """Parses ExperimentConfig from flags."""
+class ExperimentParser:
+  """Constructs the Experiment config from Flags or equivalent object.
+
+  In most cases, users only need to call the `parse()` method:
+  ```
+  builder = ExperimentParser(FLAGS)
+  params = builder.parse()
+  ```

-  if flags_obj.experiment is None:
-    raise ValueError('The flag --experiment must be specified.')
-
-  # 1. Get the default config from the registered experiment.
-  params = exp_factory.get_exp_config(flags_obj.experiment)
-
-  # 2. Get the first level of override from `--config_file`.
-  #    `--config_file` is typically used as a template that specifies the common
-  #    override for a particular experiment.
-  for config_file in flags_obj.config_file or []:
-    params = hyperparams.override_params_dict(
-        params, config_file, is_strict=True)
-
-  # 3. Override the TPU address and tf.data service address.
-  params.override({
-      'runtime': {
-          'tpu': flags_obj.tpu,
-      },
-  })
-  if ('tf_data_service' in flags_obj and flags_obj.tf_data_service and
-      isinstance(params.task, config_definitions.TaskConfig)):
+  Advanced users can modify the flow by calling the parse_*() methods
+  separately.
+  """
+
+  def __init__(self, flags_obj):
+    self._flags_obj = flags_obj
+
+  def parse(self):
+    """Overall process of constructing the Experiment config."""
+    params = self.base_experiment()
+    params = self.parse_config_file(params)
+    params = self.parse_runtime(params)
+    params = self.parse_data_service(params)
+    params = self.parse_params_override(params)
+    return params
+
+  def base_experiment(self):
+    """Get the base experiment config from --experiment field."""
+    if self._flags_obj.experiment is None:
+      raise ValueError('The flag --experiment must be specified.')
+    return exp_factory.get_exp_config(self._flags_obj.experiment)
+
+  def parse_config_file(self, params):
+    """Override the configs of params from the config_file."""
+    for config_file in self._flags_obj.config_file or []:
+      params = hyperparams.override_params_dict(
+          params, config_file, is_strict=True)
+    return params
+
+  def parse_runtime(self, params):
+    """Override the runtime configs of params from flags."""
+    # Override the TPU address.
    params.override({
-        'task': {
-            'train_data': {
-                'tf_data_service_address': flags_obj.tf_data_service,
-            },
-            'validation_data': {
-                'tf_data_service_address': flags_obj.tf_data_service,
-            }
-        }
+        'runtime': {
+            'tpu': self._flags_obj.tpu,
+        },
    })
+    return params
+
+  def parse_data_service(self, params):
+    """Override the data service configs of params from flags."""
+    if ('tf_data_service' in self._flags_obj and
+        self._flags_obj.tf_data_service and
+        isinstance(params.task, config_definitions.TaskConfig)):
+      params.override({
+          'task': {
+              'train_data': {
+                  'tf_data_service_address': self._flags_obj.tf_data_service,
+              },
+              'validation_data': {
+                  'tf_data_service_address': self._flags_obj.tf_data_service,
+              }
+          }
+      })
+    return params
+
+  def parse_params_override(self, params):
+    # Get the second level of override from `--params_override`.
+    # `--params_override` is typically used as a further override over the
+    # template. For example, one may define a particular template for training
+    # ResNet50 on ImageNet in a config file and pass it via `--config_file`,
+    # then define different learning rates and pass it via `--params_override`.
+    if self._flags_obj.params_override:
+      params = hyperparams.override_params_dict(
+          params, self._flags_obj.params_override, is_strict=True)
+    return params
+
+
+def parse_configuration(flags_obj, lock_return=True, print_return=True):
+  """Parses ExperimentConfig from flags."""

-  # 4. Get the second level of override from `--params_override`.
-  #    `--params_override` is typically used as a further override over the
-  #    template. For example, one may define a particular template for training
-  #    ResNet50 on ImageNet in a config file and pass it via `--config_file`,
-  #    then define different learning rates and pass it via `--params_override`.
-  if flags_obj.params_override:
-    params = hyperparams.override_params_dict(
-        params, flags_obj.params_override, is_strict=True)
+  params = ExperimentParser(flags_obj).parse()

  params.validate()
  if lock_return:

--- a/official/core/train_utils_test.py
+++ b/official/core/train_utils_test.py
@@ -13,14 +13,37 @@
 # limitations under the License.

 """Tests for official.core.train_utils."""
-
 import os
+import pprint

 import numpy as np
 import tensorflow as tf

+from official.core import exp_factory
 from official.core import test_utils
 from official.core import train_utils
+from official.modeling import hyperparams
+
+
+@exp_factory.register_config_factory('foo')
+def foo():
+  """Multitask experiment for test."""
+  experiment_config = hyperparams.Config(
+      default_params={
+          'runtime': {
+              'tpu': 'fake',
+          },
+          'task': {
+              'model': {
+                  'model_id': 'bar',
+              },
+          },
+          'trainer': {
+              'train_steps': -1,
+              'validation_steps': -1,
+          },
+      })
+  return experiment_config


 class TrainUtilsTest(tf.test.TestCase):
@@ -93,6 +116,27 @@ class TrainUtilsTest(tf.test.TestCase):
    ]
    self.assertEqual(actual, expected)

+  def test_construct_experiment_from_flags(self):
+    options = train_utils.ParseConfigOptions(
+        experiment='foo',
+        config_file=[],
+        tpu='bar',
+        tf_data_service='',
+        params_override='task.model.model_id=new,'
+        'trainer.train_steps=10,'
+        'trainer.validation_steps=11')
+    builder = train_utils.ExperimentParser(options)
+    params_from_obj = builder.parse()
+    params_from_func = train_utils.parse_configuration(options)
+    pp = pprint.PrettyPrinter()
+    self.assertEqual(
+        pp.pformat(params_from_obj.as_dict()),
+        pp.pformat(params_from_func.as_dict()))
+    self.assertEqual(params_from_obj.runtime.tpu, 'bar')
+    self.assertEqual(params_from_obj.task.model.model_id, 'new')
+    self.assertEqual(params_from_obj.trainer.train_steps, 10)
+    self.assertEqual(params_from_obj.trainer.validation_steps, 11)
+

 if __name__ == '__main__':
  tf.test.main()
--- a/official/legacy/__init__.py
+++ b/official/legacy/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/official/legacy/detection/README.md
+++ b/official/legacy/detection/README.md
+# Object Detection Models on TensorFlow 2
+
+**WARNING**: This repository will be deprecated and replaced by the solid
+implementations inside vision/beta/.
+
+## Prerequisite
+To get started, download the code from TensorFlow models GitHub repository or
+use the pre-installed Google Cloud VM.
+
+```bash
+git clone https://github.com/tensorflow/models.git
+```
+
+Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
+a few packages you need to install to get started:
+
+```bash
+sudo apt-get install -y python-tk && \
+pip3 install -r ~/models/official/requirements.txt
+```
+
+## Train RetinaNet on TPU
+
+### Train a vanilla ResNet-50 based RetinaNet.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --params_override="{ type: retinanet, train: { checkpoint: { path: ${RESNET_CHECKPOINT?}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+
+The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
+
+Note: The ResNet implementation under
+[detection/](https://github.com/tensorflow/models/tree/master/official/legacy/detection)
+is currently different from the one under
+[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
+so the checkpoints are not compatible.
+We will unify the implementation soon.
+
+
+### Train a SpineNet-49 based RetinaNet.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+
+
+### Train a custom RetinaNet using the config file.
+
+First, create a YAML config file, e.g. *my_retinanet.yaml*. This file specifies
+the parameters to be overridden, which should at least include the following
+fields.
+
+```YAML
+# my_retinanet.yaml
+type: 'retinanet'
+train:
+  train_file_pattern: <path to the TFRecord training data>
+eval:
+  eval_file_pattern: <path to the TFRecord validation data>
+  val_json_file: <path to the validation annotation JSON file>
+```
+
+Once the YAML config file is created, you can launch the training using the
+following command.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --config_file="my_retinanet.yaml"
+```
+
+## Train RetinaNet on GPU
+
+Training on GPU is similar to that on TPU. The major change is the strategy
+type (use "[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)" for multiple GPU and
+"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)" for single GPU).
+
+Multi-GPU example (assuming there are 8 GPUs connected to the host):
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=mirrored \
+  --num_gpus=8 \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --config_file="my_retinanet.yaml"
+```
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --config_file="my_retinanet.yaml"
+```
+
+An example with inline configuration (YAML or JSON format):
+
+```
+python3 ~/models/official/legacy/detection/main.py \
+  --model_dir=<model folder> \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --mode=train \
+  --params_override="eval:
+ eval_file_pattern: <Eval TFRecord file pattern>
+ batch_size: 8
+ val_json_file: <COCO format groundtruth JSON file>
+predict:
+ predict_batch_size: 8
+architecture:
+ use_bfloat16: False
+train:
+ total_steps: 1
+ batch_size: 8
+ train_file_pattern: <Eval TFRecord file pattern>
+use_tpu: False
+"
+```
+
+---
+
+## Train Mask R-CNN on TPU
+
+### Train a vanilla ResNet-50 based Mask R-CNN.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
+```
+
+The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
+
+Note: The ResNet implementation under
+[detection/](https://github.com/tensorflow/models/tree/master/official/legacy/detection)
+is currently different from the one under
+[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
+so the checkpoints are not compatible.
+We will unify the implementation soon.
+
+
+### Train a SpineNet-49 based Mask R-CNN.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --model=mask_rcnn \
+  --params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+
+
+### Train a custom Mask R-CNN using the config file.
+
+First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
+This file specifies the parameters to be overridden,
+which should at least include the following fields.
+
+```YAML
+# my_maskrcnn.yaml
+train:
+  train_file_pattern: <path to the TFRecord training data>
+eval:
+  eval_file_pattern: <path to the TFRecord validation data>
+  val_json_file: <path to the validation annotation JSON file>
+```
+
+Once the YAML config file is created, you can launch the training using the
+following command.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --config_file="my_maskrcnn.yaml"
+```
+
+## Train Mask R-CNN on GPU
+
+Training on GPU is similar to that on TPU. The major change is the strategy type
+(use
+"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
+for multiple GPUs and
+"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
+for a single GPU).
+
+Multi-GPU example (assuming there are 8 GPUs connected to the host):
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=mirrored \
+  --num_gpus=8 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --config_file="my_maskrcnn.yaml"
+```
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --config_file="my_maskrcnn.yaml"
+```
+
+An example with inline configuration (YAML or JSON format):
+
+```
+python3 ~/models/official/legacy/detection/main.py \
+  --model_dir=<model folder> \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --mode=train \
+  --model=mask_rcnn \
+  --params_override="eval:
+ eval_file_pattern: <Eval TFRecord file pattern>
+ batch_size: 8
+ val_json_file: <COCO format groundtruth JSON file>
+predict:
+ predict_batch_size: 8
+architecture:
+ use_bfloat16: False
+train:
+ total_steps: 1000
+ batch_size: 8
+ train_file_pattern: <Train TFRecord file pattern>
+use_tpu: False
+"
+```
+
+## Train ShapeMask on TPU
+
+### Train a ResNet-50 based ShapeMask.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+SHAPE_PRIOR_PATH="<path to shape priors>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} }, shapemask_head: {use_category_for_mask: true, shape_prior_path: ${SHAPE_PRIOR_PATH}} }"
+```
+
+The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
+
+The shape priors can be downloaded [here](https://storage.googleapis.com/cloud-tpu-checkpoints/shapemask/kmeans_class_priors_91x20x32x32.npy).
+
+
+### Train a custom ShapeMask using the config file.
+
+First, create a YAML config file, e.g. *my_shapemask.yaml*.
+This file specifies the parameters to be overridden:
+
+```YAML
+# my_shapemask.yaml
+train:
+  train_file_pattern: <path to the TFRecord training data>
+  total_steps: <total steps to train>
+  batch_size: <training batch size>
+eval:
+  eval_file_pattern: <path to the TFRecord validation data>
+  val_json_file: <path to the validation annotation JSON file>
+  batch_size: <evaluation batch size>
+shapemask_head:
+  shape_prior_path: <path to shape priors>
+```
+
+Once the YAML config file is created, you can launch the training using the
+following command.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+## Train ShapeMask on GPU
+
+Training on GPU is similar to that on TPU. The major change is the strategy type
+(use
+"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
+for multiple GPUs and
+"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
+for a single GPU).
+
+Multi-GPU example (assuming there are 8 GPUs connected to the host):
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=mirrored \
+  --num_gpus=8 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+Single-GPU example:
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+
+An example with inline configuration (YAML or JSON format):
+
+```
+python3 ~/models/official/legacy/detection/main.py \
+  --model_dir=<model folder> \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --mode=train \
+  --model=shapemask \
+  --params_override="eval:
+ eval_file_pattern: <Eval TFRecord file pattern>
+ batch_size: 8
+ val_json_file: <COCO format groundtruth JSON file>
+train:
+ total_steps: 1000
+ batch_size: 8
+ train_file_pattern: <Train TFRecord file pattern>
+use_tpu: False
+"
+```
+
+
+### Run the evaluation (after training)
+
+```
+python3 /usr/share/models/official/legacy/detection/main.py \
+   --strategy_type=tpu \
+   --tpu=${TPU_NAME} \
+   --model_dir=${MODEL_DIR} \
+   --mode=eval \
+   --model=shapemask \
+   --params_override="{eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN}, eval_samples: 5000 } }"
+```
+
+`MODEL_DIR` must point to the directory that contains the trained ShapeMask model.
+To run the evaluation on a GPU instead, set `--strategy_type=mirrored` and `--num_gpus=1`.
+
+Note: The JSON groundtruth file is useful for the [COCO dataset](http://cocodataset.org/#home) and can be
+downloaded from the [COCO website](http://cocodataset.org/#download). For a custom dataset, it is unnecessary because the groundtruth can be included in the TFRecord files.
+
+## References
+
+1.  [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
+    Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Dollár. IEEE
+    International Conference on Computer Vision (ICCV), 2017.
--- a/official/legacy/detection/__init__.py
+++ b/official/legacy/detection/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/official/legacy/detection/configs/__init__.py
+++ b/official/legacy/detection/configs/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/official/legacy/detection/configs/base_config.py
+++ b/official/legacy/detection/configs/base_config.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Base config template."""
+
+
+# Names of the available backbone networks. `BASE_CFG` below defaults
+# `architecture.backbone` to 'resnet'.
+# NOTE(review): presumably consumed by the per-model configs to validate the
+# `architecture.backbone` setting -- confirm against the callers.
+BACKBONES = [
+    'resnet',
+    'spinenet',
+]
+
+# Names of the available multilevel feature generators. `BASE_CFG` below
+# defaults `architecture.multilevel_features` to 'fpn'.
+# NOTE(review): presumably consumed by the per-model configs to validate the
+# `architecture.multilevel_features` setting -- confirm against the callers.
+MULTILEVEL_FEATURES = [
+    'fpn',
+    'identity',
+]
+
+# pylint: disable=line-too-long
+# For ResNet, this freezes the variables of the first conv1 and conv2_x
+# layers [1], which leads to higher training speed and slightly better testing
+# accuracy. The intuition is that the low-level architecture (e.g., ResNet-50)
+# is able to capture low-level features such as edges; therefore, it does not
+# need to be fine-tuned for the detection task.
+# Note that the trailing `/` is needed to avoid incorrect matches: without it,
+# the `conv2d_1` alternative would also match a name such as `conv2d_11`.
+# [1]: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L198
+RESNET_FROZEN_VAR_PREFIX = r'(resnet\d+)\/(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'
+# Matches variable names that end in `kernel:0` or `weight:0`. Used below as
+# the default for `BASE_CFG['train']['regularization_variable_regex']`.
+REGULARIZATION_VAR_REGEX = r'.*(kernel|weight):0$'
+
+# Base parameter tree for the legacy detection models. The defaults target TPU
+# training (`use_tpu: True`, `strategy_type: 'tpu'`,
+# `architecture.use_bfloat16: True`); these need to be overridden for GPU runs.
+# NOTE(review): presumably the per-model configs (retinanet, mask_rcnn,
+# olnmask, shapemask) extend this dict -- confirm in those modules.
+BASE_CFG = {
+    'model_dir': '',
+    'use_tpu': True,
+    'strategy_type': 'tpu',
+    'isolate_session_state': False,
+    'train': {
+        'iterations_per_loop': 100,
+        'batch_size': 64,
+        'total_steps': 22500,
+        'num_cores_per_replica': None,
+        'input_partition_dims': None,
+        'optimizer': {
+            'type': 'momentum',
+            'momentum': 0.9,
+            'nesterov': True,  # `False` is better for TPU v3-128.
+        },
+        # NOTE(review): presumably `learning_rate_levels[i]` takes effect at
+        # `learning_rate_steps[i]` -- confirm against the schedule
+        # implementation.
+        'learning_rate': {
+            'type': 'step',
+            'warmup_learning_rate': 0.0067,
+            'warmup_steps': 500,
+            'init_learning_rate': 0.08,
+            'learning_rate_levels': [0.008, 0.0008],
+            'learning_rate_steps': [15000, 20000],
+        },
+        'checkpoint': {
+            'path': '',
+            'prefix': '',
+        },
+        # One can use 'RESNET_FROZEN_VAR_PREFIX' to speed up ResNet training
+        # when loading from the checkpoint.
+        'frozen_variable_prefix': '',
+        'train_file_pattern': '',
+        'train_dataset_type': 'tfrecord',
+        # TODO(b/142174042): Support transpose_input option.
+        'transpose_input': False,
+        'regularization_variable_regex': REGULARIZATION_VAR_REGEX,
+        'l2_weight_decay': 0.0001,
+        'gradient_clip_norm': 0.0,
+        'input_sharding': False,
+    },
+    'eval': {
+        'input_sharding': True,
+        'batch_size': 8,
+        'eval_samples': 5000,
+        'min_eval_interval': 180,
+        'eval_timeout': None,
+        'num_steps_per_eval': 1000,
+        'type': 'box',
+        'use_json_file': True,
+        'val_json_file': '',
+        'eval_file_pattern': '',
+        'eval_dataset_type': 'tfrecord',
+        # When visualizing images, set evaluation batch size to 40 to avoid
+        # potential OOM.
+        'num_images_to_visualize': 0,
+    },
+    'predict': {
+        'batch_size': 8,
+    },
+    'architecture': {
+        'backbone': 'resnet',
+        'min_level': 3,
+        'max_level': 7,
+        'multilevel_features': 'fpn',
+        'use_bfloat16': True,
+        # Note that `num_classes` is the total number of classes including
+        # one background class whose index is 0.
+        'num_classes': 91,
+    },
+    'anchor': {
+        'num_scales': 3,
+        'aspect_ratios': [1.0, 2.0, 0.5],
+        'anchor_size': 4.0,
+    },
+    'norm_activation': {
+        'activation': 'relu',
+        'batch_norm_momentum': 0.997,
+        'batch_norm_epsilon': 1e-4,
+        'batch_norm_trainable': True,
+        'use_sync_bn': False,
+    },
+    'resnet': {
+        'resnet_depth': 50,
+    },
+    'spinenet': {
+        # NOTE(review): the default is the string '49', not an integer --
+        # confirm that overrides are expected to use the same type.
+        'model_id': '49',
+    },
+    'fpn': {
+        'fpn_feat_dims': 256,
+        'use_separable_conv': False,
+        'use_batch_norm': True,
+    },
+    'postprocess': {
+        'use_batched_nms': False,
+        'max_total_size': 100,
+        'nms_iou_threshold': 0.5,
+        'score_threshold': 0.05,
+        'pre_nms_num_boxes': 5000,
+    },
+    'enable_summary': False,
+}
+# pylint: enable=line-too-long
--- a/official/legacy/detection/configs/factory.py
+++ b/official/legacy/detection/configs/factory.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Factory to provide model configs."""
+
+from official.legacy.detection.configs import maskrcnn_config
+from official.legacy.detection.configs import olnmask_config
+from official.legacy.detection.configs import retinanet_config
+from official.legacy.detection.configs import shapemask_config
+from official.modeling.hyperparams import params_dict
+
+
+def config_generator(model):
+  """Model function generator."""
+  if model == 'retinanet':
+    default_config = retinanet_config.RETINANET_CFG
+    restrictions = retinanet_config.RETINANET_RESTRICTIONS
+  elif model == 'mask_rcnn':
+    default_config = maskrcnn_config.MASKRCNN_CFG
+    restrictions = maskrcnn_config.MASKRCNN_RESTRICTIONS
+  elif model == 'olnmask':
+    default_config = olnmask_config.OLNMASK_CFG
+    restrictions = olnmask_config.OLNMASK_RESTRICTIONS
+  elif model == 'shapemask':
+    default_config = shapemask_config.SHAPEMASK_CFG
+    restrictions = shapemask_config.SHAPEMASK_RESTRICTIONS
+  else:
+    raise ValueError('Model %s is not supported.' % model)
+
+  return params_dict.ParamsDict(default_config, restrictions)