# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A script to train sentencepiece model from tensorflow datasets.
Reserved tokens:
pad: 0,
eos: 1,
unk: 2
(bos is not reserved)
"""
import os
import tempfile
from typing import List, Tuple
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds
from sentencepiece import SentencePieceTrainer
FLAGS = flags.FLAGS
flags.DEFINE_string("output_model_path", None,
"Path to save the the sentencepiece model.")
flags.mark_flag_as_required("output_model_path")
flags.DEFINE_string("tfds_dir", None, "Directory of the tfds.")
flags.DEFINE_string("tfds_name", "wmt14_translate/de-en",
"Name of the dataset we generate vacabulay from.")
flags.DEFINE_string("tfds_split", "train", "Split of the dataset.")
flags.DEFINE_integer("vocab_size", 32000, "Size of vocabulary.")
flags.DEFINE_integer(
"max_char", -1,
"Maximum number of characters to use. "
"If a non-positive number is provided, all sentences are used.")
flags.DEFINE_string("model_type", "bpe",
"Model algorithm: unigram, bpe, word or char.")
flags.DEFINE_float("character_coverage", 0.9995,
"Character coverage to determine the minimum symbols")
flags.DEFINE_list(
"data_keys", ["en", "de"],
"Comma-separated list of keys to use for training the vocabulary.")
def dump_chars_to_textfile(dataset: tf.data.Dataset,
data_keys: Tuple[str, ...],
max_char: int = -1):
"""Writes part of a TFDS sentence dataset to lines in a text file.
Args:
dataset: tf.data.Dataset containing string data.
data_keys: keys of the dataset features to dump.
max_char: maximum number of characters to dump to the text file.
Returns:
The name of the temp file containing the dumped dataset bytes.
"""
ds_iter = dataset.as_numpy_iterator()
with tempfile.NamedTemporaryFile(delete=False) as outfp:
char_count = 0
while True:
example = next(ds_iter, None)
if example is None or (
max_char > 0 and char_count > max_char):
break
for k in data_keys:
line = example[k] + b"\n"
char_count += len(line)
outfp.write(line)
return outfp.name
def train_sentencepiece(
file_path: str,
model_path: str,
vocab_size: int,
character_coverage: float,
model_type: str):
"""Train SentencePiece tokenizer from subset of tf dataset.
Args:
file_path: path of data to train sentencepiece.
model_path: path of model file to save vocab model to.
vocab_size: size of vocab tokens to train.
character_coverage: amount of characters covered by the model, good defaults
are 0.9995 for languages with rich character set like Japanese or Chinese
and 1.0 for other languages with small character set.
model_type: type of sentencepiece vocab to train.
Returns:
path to the trained sentencepiece vocabulary model.
"""
argstr = " ".join([
f"--input={file_path}", f"--vocab_size={vocab_size}",
f"--character_coverage={character_coverage}",
f"--model_prefix={model_path}", f"--model_type={model_type}",
"--bos_id=-1", "--pad_id=0", "--eos_id=1", "--unk_id=2"
])
SentencePieceTrainer.Train(argstr)
def main(argv: List[str]):
del argv
builder = tfds.builder(FLAGS.tfds_name, data_dir=FLAGS.tfds_dir)
ds = builder.as_dataset(split=FLAGS.tfds_split)
tmp_filename = dump_chars_to_textfile(ds, FLAGS.data_keys, FLAGS.max_char)
logging.info("Sentencepiece model will be placed here: %s",
FLAGS.output_model_path)
train_sentencepiece(tmp_filename,
FLAGS.output_model_path,
FLAGS.vocab_size,
FLAGS.character_coverage,
FLAGS.model_type)
os.remove(tmp_filename)
if __name__ == "__main__":
app.run(main)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Input pipeline for the transformer model to read, filter, and batch examples.
Batching scheme
Prior to batching, elements in the dataset are grouped by length (max between
'inputs' and 'targets' length). Each group is then batched such that:
group_batch_size * length <= batch_size.
Another way to view batch_size is the maximum number of tokens in each batch.
Once batched, each element in the dataset will have the shape:
{'inputs': [group_batch_size, padded_input_length],
'targets': [group_batch_size, padded_target_length]}
Lengths are padded to the longest 'inputs' or 'targets' sequence in the batch
(padded_input_length and padded_target_length can be different).
This batching scheme decreases the fraction of padding tokens per training
batch, thus improving the training speed significantly.
"""
from typing import Dict, Optional
import dataclasses
import tensorflow as tf
import tensorflow_text as tftxt
from official.core import config_definitions as cfg
from official.core import input_reader
from official.nlp.data import data_loader
from official.nlp.data import data_loader_factory
# Example grouping constants. Defines length boundaries for each group.
# These values are the defaults used in Tensor2Tensor.
_MIN_BOUNDARY = 8
_BOUNDARY_SCALE = 1.1
def _get_example_length(example):
"""Returns the maximum length between the example inputs and targets."""
length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
return length
def _create_min_max_boundaries(max_length,
min_boundary=_MIN_BOUNDARY,
boundary_scale=_BOUNDARY_SCALE):
"""Create min and max boundary lists up to max_length.
For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
returned values will be:
buckets_min = [0, 4, 8, 16]
buckets_max = [4, 8, 16, 25]
Args:
max_length: The maximum length of example in dataset.
min_boundary: Minimum length in boundary.
boundary_scale: Amount to scale consecutive boundaries in the list.
Returns:
min and max boundary lists
"""
# Create bucket boundaries list by scaling the previous boundary or adding 1
# (to ensure increasing boundary sizes).
bucket_boundaries = []
x = min_boundary
while x < max_length:
bucket_boundaries.append(x)
x = max(x + 1, int(x * boundary_scale))
# Create min and max boundary lists from the initial list.
buckets_min = [0] + bucket_boundaries
buckets_max = bucket_boundaries + [max_length + 1]
return buckets_min, buckets_max
def _batch_examples(dataset, batch_size, max_length):
"""Group examples by similar lengths, and return batched dataset.
Each batch of similar-length examples are padded to the same length, and may
have different number of elements in each batch, such that:
group_batch_size * padded_length <= batch_size.
This decreases the number of padding tokens per batch, which improves the
training speed.
Args:
dataset: Dataset of unbatched examples.
batch_size: Max number of tokens per batch of examples.
max_length: Max number of tokens in an example input or target sequence.
Returns:
Dataset of batched examples with similar lengths.
"""
# Get the min and max boundary lists used to bucket examples by length. These
# are used to calculate the `bucket_id`, which is the index at which:
# buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
# Note that using both min and max lists improves the performance.
buckets_min, buckets_max = _create_min_max_boundaries(max_length)
# Create list of batch sizes for each bucket_id, so that
# bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
bucket_batch_sizes = [int(batch_size) // x for x in buckets_max]
# Validates bucket batch sizes.
if any(bucket_batch_size <= 0 for bucket_batch_size in bucket_batch_sizes):
raise ValueError(
'The token budget, global batch size, is too small to yield a non-zero '
'bucket window: %s' % str(bucket_batch_sizes))
# bucket_id will be a tensor, so convert this list to a tensor as well.
bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
def example_to_bucket_id(example):
"""Return int64 bucket id for this example, calculated based on length."""
example_input = example['inputs']
example_target = example['targets']
seq_length = _get_example_length((example_input, example_target))
conditions_c = tf.logical_and(
tf.less_equal(buckets_min, seq_length), tf.less(seq_length,
buckets_max))
bucket_id = tf.reduce_min(tf.where(conditions_c))
return bucket_id
def window_size_fn(bucket_id):
"""Return number of examples to be grouped when given a bucket id."""
return bucket_batch_sizes[bucket_id]
def batching_fn(bucket_id, grouped_dataset):
"""Batch and add padding to a dataset of elements with similar lengths."""
bucket_batch_size = window_size_fn(bucket_id)
# Batch the dataset and add padding so that all input sequences in the
# examples have the same length, and all target sequences have the same
# lengths as well. Resulting lengths of inputs and targets can differ.
padded_shapes = dict([
(name, [None] * len(spec.shape))
for name, spec in grouped_dataset.element_spec.items()
])
return grouped_dataset.padded_batch(bucket_batch_size, padded_shapes)
return dataset.apply(
tf.data.experimental.group_by_window(
key_func=example_to_bucket_id,
reduce_func=batching_fn,
window_size=None,
window_size_func=window_size_fn))
@dataclasses.dataclass
class WMTDataConfig(cfg.DataConfig):
"""Data config for WMT translation."""
max_seq_length: int = 64
static_batch: bool = False
sentencepiece_model_path: str = ''
src_lang: str = ''
tgt_lang: str = ''
transform_and_batch: bool = True
has_unique_id: bool = False
@data_loader_factory.register_data_loader_cls(WMTDataConfig)
class WMTDataLoader(data_loader.DataLoader):
"""A class to load dataset for WMT translation task."""
def __init__(self, params: WMTDataConfig):
self._params = params
self._max_seq_length = params.max_seq_length
self._static_batch = params.static_batch
self._global_batch_size = params.global_batch_size
if self._params.transform_and_batch:
self._tokenizer = tftxt.SentencepieceTokenizer(
model=tf.io.gfile.GFile(params.sentencepiece_model_path, 'rb').read(),
add_eos=True)
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
name_to_features = {
self._params.src_lang: tf.io.FixedLenFeature([], tf.string),
self._params.tgt_lang: tf.io.FixedLenFeature([], tf.string),
}
if self._params.has_unique_id:
name_to_features['unique_id'] = tf.io.FixedLenFeature([], tf.int64)
example = tf.io.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in example:
t = example[name]
if t.dtype == tf.int64:
t = tf.cast(t, tf.int32)
example[name] = t
return example
def _tokenize(self, inputs) -> Dict[str, tf.Tensor]:
tokenized_inputs = {}
for k, v in inputs.items():
if k == self._params.src_lang:
tokenized_inputs['inputs'] = self._tokenizer.tokenize(v)
elif k == self._params.tgt_lang:
tokenized_inputs['targets'] = self._tokenizer.tokenize(v)
else:
tokenized_inputs[k] = v
return tokenized_inputs
def _filter_max_length(self, inputs):
return tf.logical_and(
tf.shape(inputs['inputs'])[0] <= self._max_seq_length,
tf.shape(inputs['targets'])[0] <= self._max_seq_length)
def _maybe_truncate(self, inputs):
truncated_inputs = {}
for k, v in inputs.items():
if k == 'inputs' or k == 'targets':
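# Sequences longer than max_seq_length keep their first max_seq_length - 1
# tokens and get EOS (id 1 in this setup) appended as the final token.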
truncated_inputs[k] = tf.pad(
v[:self._max_seq_length - 1], [[0, 1]],
constant_values=1) if tf.shape(v)[0] > self._max_seq_length else v
else:
truncated_inputs[k] = v
return truncated_inputs
def _tokenize_bucketize_and_batch(
self,
dataset,
input_context: Optional[tf.distribute.InputContext] = None):
dataset = dataset.map(
self._tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self._params.is_training:
dataset = dataset.filter(self._filter_max_length)
else:
dataset = dataset.map(
self._maybe_truncate,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
per_replica_batch_size = input_context.get_per_replica_batch_size(
self._global_batch_size) if input_context else self._global_batch_size
if self._static_batch:
padded_shapes = {}
for name, _ in dataset.element_spec.items():
if name == 'unique_id':
padded_shapes[name] = []
else:
padded_shapes[name] = [self._max_seq_length]
batch_size = per_replica_batch_size
if self._params.is_training:
batch_size = int(batch_size // self._max_seq_length)
dataset = dataset.padded_batch(
batch_size,
padded_shapes,
drop_remainder=True)
else:
# Group and batch such that each batch has examples of similar length.
dataset = _batch_examples(dataset, per_replica_batch_size,
self._max_seq_length)
# Prefetch the next element to improve speed of input pipeline.
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return dataset
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
decoder_fn = None
# Only decode for TFRecords.
if self._params.input_path:
decoder_fn = self._decode
def _identity(
dataset, input_context: Optional[tf.distribute.InputContext] = None):
del input_context
return dataset
transform_and_batch_fn = _identity
if self._params.transform_and_batch:
transform_and_batch_fn = self._tokenize_bucketize_and_batch
reader = input_reader.InputReader(
params=self._params,
decoder_fn=decoder_fn,
transform_and_batch_fn=transform_and_batch_fn)
return reader.read(input_context)
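# Minimal usage sketch (hypothetical paths): construct a config, then load.
#   config = WMTDataConfig(
#       input_path='/path/to/train.tfrecord',
#       global_batch_size=4096, is_training=True,
#       src_lang='en', tgt_lang='de',
#       sentencepiece_model_path='/path/to/sp.model')
#   dataset = WMTDataLoader(config).load()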
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.wmt_dataloader."""
import os
from absl.testing import parameterized
import tensorflow as tf
from sentencepiece import SentencePieceTrainer
from official.nlp.data import wmt_dataloader
def _generate_line_file(filepath, lines):
with tf.io.gfile.GFile(filepath, 'w') as f:
for l in lines:
f.write('{}\n'.format(l))
def _generate_record_file(filepath, src_lines, tgt_lines, unique_id=False):
writer = tf.io.TFRecordWriter(filepath)
for i, (src, tgt) in enumerate(zip(src_lines, tgt_lines)):
features = {
'en': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[src.encode()])),
'reverse_en': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[tgt.encode()])),
}
if unique_id:
features['unique_id'] = tf.train.Feature(
int64_list=tf.train.Int64List(value=[i]))
example = tf.train.Example(
features=tf.train.Features(
feature=features))
writer.write(example.SerializeToString())
writer.close()
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
argstr = ' '.join([
f'--input={input_path}', f'--vocab_size={vocab_size}',
'--character_coverage=0.995',
f'--model_prefix={model_path}', '--model_type=bpe',
'--bos_id=-1', '--pad_id=0', f'--eos_id={eos_id}', '--unk_id=2'
])
SentencePieceTrainer.Train(argstr)
class WMTDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(WMTDataLoaderTest, self).setUp()
self._temp_dir = self.get_temp_dir()
src_lines = [
'abc ede fg',
'bbcd ef a g',
'de f a a g'
]
tgt_lines = [
'dd cc a ef g',
'bcd ef a g',
'gef cd ba'
]
self._record_train_input_path = os.path.join(self._temp_dir, 'train.record')
_generate_record_file(self._record_train_input_path, src_lines, tgt_lines)
self._record_test_input_path = os.path.join(self._temp_dir, 'test.record')
_generate_record_file(self._record_test_input_path, src_lines, tgt_lines,
unique_id=True)
self._sentencepiece_input_path = os.path.join(self._temp_dir, 'inputs.txt')
_generate_line_file(self._sentencepiece_input_path, src_lines + tgt_lines)
sentencepiece_model_prefix = os.path.join(self._temp_dir, 'sp')
_train_sentencepiece(self._sentencepiece_input_path, 20,
sentencepiece_model_prefix)
self._sentencepiece_model_path = '{}.model'.format(
sentencepiece_model_prefix)
@parameterized.named_parameters(
('train_static', True, True, 100, (2, 35)),
('train_non_static', True, False, 100, (12, 7)),
('non_train_static', False, True, 3, (3, 35)),
('non_train_non_static', False, False, 50, (2, 7)),)
def test_load_dataset(
self, is_training, static_batch, batch_size, expected_shape):
data_config = wmt_dataloader.WMTDataConfig(
input_path=self._record_train_input_path
if is_training else self._record_test_input_path,
max_seq_length=35,
global_batch_size=batch_size,
is_training=is_training,
static_batch=static_batch,
src_lang='en',
tgt_lang='reverse_en',
sentencepiece_model_path=self._sentencepiece_model_path)
dataset = wmt_dataloader.WMTDataLoader(data_config).load()
examples = next(iter(dataset))
inputs, targets = examples['inputs'], examples['targets']
self.assertEqual(inputs.shape, expected_shape)
self.assertEqual(targets.shape, expected_shape)
def test_load_dataset_raise_invalid_window(self):
batch_tokens_size = 10 # this is too small to form buckets.
data_config = wmt_dataloader.WMTDataConfig(
input_path=self._record_train_input_path,
max_seq_length=100,
global_batch_size=batch_tokens_size,
is_training=True,
static_batch=False,
src_lang='en',
tgt_lang='reverse_en',
sentencepiece_model_path=self._sentencepiece_model_path)
with self.assertRaisesRegex(
ValueError, 'The token budget, global batch size, is too small.*'):
_ = wmt_dataloader.WMTDataLoader(data_config).load()
if __name__ == '__main__':
tf.test.main()
# Pre-trained Models
We provide a large collection of baselines and checkpoints for NLP pre-trained
models.
## How to Load Pretrained Models
### How to Initialize from Checkpoint
**Note:** TF-HUB/SavedModel is the preferred way to distribute models, as it is
self-contained. Please consider using TF-HUB for fine-tuning tasks first.
If you use the [NLP training library](train.md),
you can specify the checkpoint path directly when launching your job. For
example, to initialize the model from the checkpoint, you can specify
`--params_override=task.init_checkpoint=PATH_TO_INIT_CKPT` as:
```shell
python3 train.py \
--params_override=task.init_checkpoint=PATH_TO_INIT_CKPT
```
### How to load TF-HUB SavedModel
Finetuning tasks such as question answering (SQuAD) and sentence
prediction (GLUE) support loading a model from TF-HUB. These built-in tasks
support a specific `task.hub_module_url` parameter. To set this parameter,
replace `--params_override=task.init_checkpoint=...` with
`--params_override=task.hub_module_url=TF_HUB_URL`, like below:
```shell
python3 train.py \
--params_override=task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
```
## BERT
Public BERT pre-trained models released by the BERT authors.
We released both checkpoints and TF-Hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and converted from the checkpoints
released in the TF 1.x official BERT repository
[google-research/bert](https://github.com/google-research/bert)
in order to stay consistent with the BERT paper.
### Checkpoints
Model | Configuration | Training Data | Checkpoint & Vocabulary | TF-HUB SavedModels
---------------------------------------- | :--------------------------: | ------------: | ----------------------: | ------:
BERT-base uncased English | uncased_L-12_H-768_A-12 | Wiki + Books | [uncased_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/)
BERT-base cased English | cased_L-12_H-768_A-12 | Wiki + Books | [cased_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/cased_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/)
BERT-large uncased English | uncased_L-24_H-1024_A-16 | Wiki + Books | [uncased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/uncased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/)
BERT-large cased English | cased_L-24_H-1024_A-16 | Wiki + Books | [cased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/cased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/)
BERT-large, Uncased (Whole Word Masking) | wwm_uncased_L-24_H-1024_A-16 | Wiki + Books | [wwm_uncased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/wwm_uncased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/)
BERT-large, Cased (Whole Word Masking) | wwm_cased_L-24_H-1024_A-16 | Wiki + Books | [wwm_cased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/wwm_cased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/)
BERT-base MultiLingual | multi_cased_L-12_H-768_A-12 | Wiki + Books | [multi_cased_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/multi_cased_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/)
BERT-base Chinese | chinese_L-12_H-768_A-12 | Wiki + Books | [chinese_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/chinese_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/)
You may explore more in the TF-Hub BERT collection:
https://tfhub.dev/google/collections/bert/1
### BERT variants
We also have pretrained BERT models with variants in both network architecture
and training methodologies. These models achieve higher downstream accuracy
scores.
Model | Configuration | Training Data | TF-HUB SavedModels | Comment
-------------------------------- | :----------------------: | -----------------------: | ------------------------------------------------------------------------------------: | ------:
BERT-base talking heads + ggelu | uncased_L-12_H-768_A-12 | Wiki + Books | [talkheads_ggelu_base](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1) | BERT-base trained with [talking heads attention](https://arxiv.org/abs/2003.02436) and [gated GeLU](https://arxiv.org/abs/2002.05202).
BERT-large talking heads + ggelu | uncased_L-24_H-1024_A-16 | Wiki + Books | [talkheads_ggelu_large](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1) | BERT-large trained with [talking heads attention](https://arxiv.org/abs/2003.02436) and [gated GeLU](https://arxiv.org/abs/2002.05202).
LAMBERT-large uncased English | uncased_L-24_H-1024_A-16 | Wiki + Books | [lambert](https://tfhub.dev/tensorflow/lambert_en_uncased_L-24_H-1024_A-16/1) | BERT trained with LAMB and techniques from RoBERTa.
# Exporting a pre-trained Encoder to TF Hub
## Overview
This doc explains how to use TF-NLP's
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
tool to export pre-trained Transformer encoders to SavedModels suitable for
publication on TF Hub. (For the steps after that, see TF Hub's
[publisher guide](https://www.tensorflow.org/hub/publish).)
For testing purposes, those SavedModels can also be used from their export
locations on the filesystem.
On TF Hub, Transformer encoders for text come as a pair of SavedModels:
* The preprocessing model applies a tokenizer with a fixed vocab plus some
additional logic to turn text into Transformer inputs.
* The encoder model (or "model" for short) applies the pre-trained Transformer
encoder.
TF Hub defines
[Common APIs](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders)
for all SavedModels of those two respective types, encapsulating the particular
choice of preprocessing logic and Encoder architecture.
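For orientation, here is a minimal sketch of using such a pair together (the
handles below are the uncased BERT-Base models referenced elsewhere in this
doc; any pair implementing the Common APIs works the same way):
```python
import tensorflow as tf
import tensorflow_hub as hub

# Load the matching preprocessing and encoder SavedModels.
preprocessor = hub.load(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.load(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3")

# Preprocessing turns raw strings into the inputs the encoder expects.
sentences = tf.constant(["TF Hub encoders come with a preprocessing model."])
encoder_outputs = encoder(preprocessor(sentences))
pooled = encoder_outputs["pooled_output"]  # Shape [batch_size, width]
```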
## Exporting the Encoder
There is a choice between exporting just the encoder, or the encoder plus the
prediction head for the masked language model (MLM) task from pre-training.
Exporting just the encoder suffices for many straightforward applications.
### Exporting the Encoder alone
To export an encoder-only model, you can set `--export_type=model` and run the
tool like this:
```shell
python official/nlp/tools/export_tfhub.py \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model \
--export_path=/tmp/bert_model
```
The flag `--encoder_config_file` refers to a YAML file representing the
[encoders.EncoderConfig](https://github.com/tensorflow/models/search?q=EncoderConfig+path%3Aofficial%2Fnlp%2Fconfigs+filename%3Aencoders.py)
dataclass, which supports multiple encoders (e.g., BERT, ALBERT). Instead of
`--encoder_config_file`, you can set `--bert_config_file` to a legacy
`bert_config.json` file to export a BERT model. If the model definition involves
[GIN](https://github.com/google/gin-config), the flags `--gin_file` and
`--gin_params` must be set accordingly, consistent with pre-training.
The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(encoder=encoder)` for the encoder defined by the config
flags. Legacy checkpoints with `model=` instead of `encoder=` are also supported
for now.
The exported SavedModel expects dict inputs and outputs as follows, implementing
a specialization of the respective
[Common SavedModel API](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders):
```python
encoder = hub.load(...)
encoder_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
)
encoder_outputs = encoder(encoder_inputs)
assert encoder_outputs.keys() == {
"pooled_output", # Shape [batch_size, width], dtype=float32
"default", # Alias for "pooled_output" (aligns with other models)
"sequence_output", # Shape [batch_size, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers
}
```
The encoder's pooler layer is restored from the `--model_checkpoint_path`.
However, unlike classic BERT, `BertPretrainerV2` does not train the pooler layer
of the encoder. You have three options to handle that:
* Set flag `--copy_pooler_dense_to_encoder` to copy the pooling layer from the
`ClassificationHead` passed to `BertPretrainerV2` for the next sentence
prediction task. This mimics classic BERT, but is not recommended for new
models (see next item).
* Leave flag `--copy_pooler_dense_to_encoder` unset and export the untrained,
randomly initialized pooling layer of the encoder. Folklore (as of 2020) has
it that an untrained pooler gets fine-tuned better than a pre-trained
pooler, so this is the default.
* Leave flag `--copy_pooler_dense_to_encoder` unset and perform your own
initialization of the pooling layer before export. For example, Google's
[BERT Experts](https://tfhub.dev/google/collections/experts/bert/1)
published in October 2020 initialize it to the identity map, reporting equal
gains when fine-tuning and more predictable behavior when not.
In any case, at this time, the export tool requires the encoder model to *have*
a `pooled_output`, whether trained or not. (This can be revised in the future.)
The encoder model does not include any preprocessing logic, but for the benefit
of users who take preprocessing into their own hands, the relevant information
is attached from the flags `--vocab_file` or `--sp_model_file`, respectively,
and `--do_lower_case`, which need to be set in exactly the same way as for the
preprocessing model (see below).
The exported SavedModel stores the resulting values as attributes on its root
object:
```python
encoder = hub.load(...)
# Gets the filename of the respective tf.saved_model.Asset object.
if hasattr(encoder, "vocab_file"):
print("Wordpiece vocab at", encoder.vocab_file.asset_path.numpy())
elif hasattr(encoder, "sp_model_file"):
print("SentencePiece model at", encoder.sp_model_file.asset_path.numpy())
# Gets the value of a scalar bool tf.Variable.
print("...using do_lower_case =", encoder.do_lower_case.numpy())
```
New users are encouraged to ignore these attributes and use the preprocessing
model instead. However, there are legacy users, as well as advanced users who
require access to the full vocab.
### Exporting the Encoder with a Masked Language Model head
To export an encoder and the masked language model it was trained with, first
read the preceding section about exporting just the encoder. All the
explanations there on setting the right flags apply here as well, up to the
following differences.
The masked language model is added to the export by changing flag
`--export_type` from `model` to `model_with_mlm`, so the export command looks
like this:
```shell
python official/nlp/tools/export_tfhub.py \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model_with_mlm \
--export_path=/tmp/bert_model
```
The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)` with the encoder
defined by the config flags.
This is a more comprehensive requirement on the checkpoint than for
`--export_type=model`; not all Transformer encoders and not all pre-training
techniques can satisfy it. For example,
[ELECTRA](https://arxiv.org/abs/2003.10555) uses the BERT architecture but is
pre-trained without an MLM task.
The root object of the exported SavedModel is called in the same way as above.
In addition, the SavedModel has an `mlm` subobject that can be called as follows
to output an `mlm_logits` tensor as well:
```python
mlm_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
masked_lm_positions=..., # Shape [batch, num_predictions], dtype=int32
)
mlm_outputs = encoder.mlm(mlm_inputs)
assert mlm_outputs.keys() == {
"pooled_output", # Shape [batch, width], dtype=float32
"sequence_output", # Shape [batch, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers
"mlm_logits" # Shape [batch, num_predictions, vocab_size], dtype=float32
}
```
The extra subobject imposes a moderate size overhead.
### Exporting from a TF1 BERT checkpoint
A BERT model trained with the
[original BERT implementation for TF1](https://github.com/google-research/bert)
can be exported after converting its checkpoint with the
[tf2_encoder_checkpoint_converter](https://github.com/tensorflow/models/blob/master/official/nlp/bert/tf2_encoder_checkpoint_converter.py)
tool.
After that, run
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
per the instructions above on the converted checkpoint. Do not set
`--copy_pooler_dense_to_encoder`, because the pooler layer is part of the
converted encoder. For `--vocab_file` and `--do_lower_case`, the values from TF1
BERT can be used verbatim.
## Exporting the preprocessing model
You can skip this step if TF Hub already has a preprocessing model that does
exactly what your encoder needs (same tokenizer, same vocab, same normalization
settings (`do_lower_case`)). You can inspect its collection of
[Transformer Encoders for Text](https://tfhub.dev/google/collections/transformer_encoders_text/1)
and click through to models with a similar input domain to find their
preprocessing models.
To export the preprocessing model, set `--export_type=preprocessing` and run the
export tool like this:
```shell
python official/nlp/tools/export_tfhub.py \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--do_lower_case=True \
--export_type=preprocessing \
--export_path=/tmp/bert_preprocessing
```
Note: Set flag `--experimental_disable_assert_in_preprocessing` when exporting
for users of the public TensorFlow 2.4.x releases to avoid a fatal op placement
issue when preprocessing is used within Dataset.map() on TPU workers.
This is not an issue with TF 2.3 and TF 2.5+.
Flag `--vocab_file` specifies the vocab file used with
[BertTokenizer](https://github.com/tensorflow/models/search?q=BertTokenizer+filename%3Atext_layers.py).
For models that use the
[SentencepieceTokenizer](https://github.com/tensorflow/models/search?q=SentencepieceTokenizer+filename%3Atext_layers.py),
set flag `--sp_model_file` instead.
The boolean flag `--do_lower_case` controls text normalization (as in the
respective tokenizer classes, so it's a bit more than just smashing case). If
unset, do_lower_case will be enabled if 'uncased' appears in --vocab_file, or
unconditionally if --sp_model_file is set, mimicking the conventions of BERT and
ALBERT, respectively. For programmatic use, or if in doubt, it's best to set
`--do_lower_case` explicitly.
If the definition of preprocessing involved
[GIN](https://github.com/google/gin-config),
the flags `--gin_file` and `--gin_params` would have to be set accordingly,
consistent with pre-training. (At the time of this writing, no such GIN
configurables exist in the code.)
The exported SavedModel can be called in the following way for a single segment
input.
```python
preprocessor = hub.load(...)
text_input = ... # Shape [batch_size], dtype=tf.string
encoder_inputs = preprocessor(text_input, seq_length=seq_length)
assert encoder_inputs.keys() == {
"input_word_ids", # Shape [batch_size, seq_length], dtype=int32
"input_mask", # Shape [batch_size, seq_length], dtype=int32
"input_type_ids" # Shape [batch_size, seq_length], dtype=int32
}
```
Flag `--default_seq_length` controls the value of `seq_length` if that argument
is omitted in the usage example above. The flag defaults to 128, because
multiples of 128 work best for Cloud TPUs, yet the cost of attention computation
grows quadratically with `seq_length`.
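For instance, to pack inputs to 256 tokens instead of the default (same
`preprocessor` and `text_input` as above):
```python
encoder_inputs = preprocessor(text_input, seq_length=256)
```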
Beyond this example, the exported SavedModel implements the full preprocessor
interface for text embeddings with preprocessed inputs and for Transformer
encoders from TF Hub's
[Common APIs for text](https://www.tensorflow.org/hub/common_saved_model_apis/text).
Please see
[tfhub.dev/tensorflow/bert_en_uncased_preprocess](https://tfhub.dev/tensorflow/bert_en_uncased_preprocess)
for the full documentation of one preprocessing model exported with this tool,
especially how custom trimming of inputs can happen between `.tokenize` and
`.bert_pack_inputs`.
Using the `encoder.mlm()` interface requires masking of tokenized inputs in
user code. The necessary information about the vocabulary encapsulated in the
preprocessing model can be obtained like this (uniformly across tokenizers):
```python
special_tokens_dict = preprocessor.tokenize.get_special_tokens_dict()
vocab_size = int(special_tokens_dict["vocab_size"])
padding_id = int(special_tokens_dict["padding_id"]) # [PAD] or <pad>
start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"]) # [CLS]
end_of_segment_id = int(special_tokens_dict["end_of_segment_id"]) # [SEP]
mask_id = int(special_tokens_dict["mask_id"]) # [MASK]
```
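As an illustration only, here is a minimal sketch that masks one position per
sequence and queries the MLM head. It assumes the `encoder`, `encoder_inputs`,
and `mask_id` from the examples above; the actual masking strategy is up to
user code:
```python
import tensorflow as tf

# Mask position 1 (the token right after the start-of-sequence token).
input_word_ids = encoder_inputs["input_word_ids"]  # [batch, seq_length]
batch_size = tf.shape(input_word_ids)[0]
masked_lm_positions = tf.ones([batch_size, 1], dtype=tf.int32)
scatter_indices = tf.stack(
    [tf.range(batch_size), masked_lm_positions[:, 0]], axis=1)
masked_word_ids = tf.tensor_scatter_nd_update(
    input_word_ids, scatter_indices, tf.fill([batch_size], mask_id))

mlm_outputs = encoder.mlm(dict(
    input_word_ids=masked_word_ids,
    input_mask=encoder_inputs["input_mask"],
    input_type_ids=encoder_inputs["input_type_ids"],
    masked_lm_positions=masked_lm_positions))
mlm_logits = mlm_outputs["mlm_logits"]  # [batch, 1, vocab_size]
```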
## Testing the exported models
Please test your SavedModels before publication by fine-tuning them on a
suitable task and comparing performance and accuracy to a baseline experiment
built from equivalent Python code.
The
[trainer doc](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
has instructions for running BERT on MNLI and other tasks from the GLUE
benchmark.
# Model Garden NLP Common Training Driver
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py) is the common training driver that supports multiple
NLP tasks (e.g., pre-training, GLUE and SQuAD fine-tuning) and multiple
models (e.g., BERT, ALBERT, MobileBERT).
## Experiment Configuration
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py) is driven by configs defined by [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py),
including configurations for `task`, `trainer` and `runtime`. The pre-defined
NLP-related [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py) instances can be found in
[configs/experiment_configs.py](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiment_configs.py).
## Experiment Registry
We use an [experiment registry](https://github.com/tensorflow/models/blob/master/official/core/exp_factory.py) to build a mapping
from experiment type to experiment configuration instance. For example,
[configs/finetuning_experiments.py](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py)
registers the `bert/sentence_prediction` and `bert/squad` experiments. Users can
pass the `--experiment` flag to invoke a registered experiment configuration,
e.g., `--experiment=bert/sentence_prediction`.
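For illustration, registering a new experiment looks roughly like the sketch
below (the experiment name and config contents are illustrative placeholders):
```python
from official.core import config_definitions as cfg
from official.core import exp_factory


# Makes 'my_project/my_experiment' selectable via --experiment=... .
@exp_factory.register_config_factory('my_project/my_experiment')
def my_experiment() -> cfg.ExperimentConfig:
  # A real registration would populate the task/trainer/runtime fields here.
  return cfg.ExperimentConfig(
      task=cfg.TaskConfig(),
      trainer=cfg.TrainerConfig(),
      runtime=cfg.RuntimeConfig())
```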
## Overriding Configuration via Yaml and FLAGS
The registered experiment configuration can be overridden by one or
multiple YAML files provided via the `--config_file` flag. For example:
```shell
--config_file=configs/experiments/glue_mnli_matched.yaml \
--config_file=configs/models/bert_en_uncased_base.yaml
```
In addition, the experiment configuration can be further overridden via the
`--params_override` flag. For example:
```shell
--params_override=task.train_data.input_path=/some/path,task.hub_module_url=/some/tfhub
```
## Run on Cloud TPUs
Next, we describe how to run [train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py) on Cloud TPUs.
### Setup
First, you need to create a `tf-nightly` TPU with the
[ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
```shell
export TPU_NAME=YOUR_TPU_NAME
ctpu up -name $TPU_NAME --tf-version=nightly --tpu-size=YOUR_TPU_SIZE --project=YOUR_PROJECT
```
and then install Model Garden and required dependencies:
```shell
git clone https://github.com/tensorflow/models.git
export PYTHONPATH=$PYTHONPATH:/path/to/models
pip3 install --user -r official/requirements.txt
```
### Fine-tuning Sentence Classification with BERT from TF-Hub
This example fine-tunes BERT-base from TF-Hub on the Multi-Genre Natural
Language Inference (MultiNLI) corpus using TPUs.
First, prepare the fine-tuning data using the
[`create_finetuning_data.py`](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_finetuning_data.py) script.
For GLUE tasks, you can (1) download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`, (2) prepare the vocabulary file,
and (3) run the following command:
```shell
export GLUE_DIR=~/glue
export VOCAB_FILE=~/uncased_L-12_H-768_A-12/vocab.txt
export TASK_NAME=MNLI
export OUTPUT_DATA_DIR=gs://some_bucket/datasets
python3 data/create_finetuning_data.py \
--input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
--vocab_file=${VOCAB_FILE} \
--train_data_output_path=${OUTPUT_DATA_DIR}/${TASK_NAME}_train.tf_record \
--eval_data_output_path=${OUTPUT_DATA_DIR}/${TASK_NAME}_eval.tf_record \
--meta_data_file_path=${OUTPUT_DATA_DIR}/${TASK_NAME}_meta_data \
--fine_tuning_task_type=classification --max_seq_length=128 \
--classification_task_name=${TASK_NAME}
```
The resulting training and evaluation datasets in `tf_record` format will later
be passed to [train.py](train.py). Support for reading datasets from
tensorflow_datasets (TFDS) and pre-processing with tf.text is coming soon.
Then you can execute the following commands to start the training and evaluation
job.
```shell
export INPUT_DATA_DIR=gs://some_bucket/datasets
export OUTPUT_DIR=gs://some_bucket/my_output_dir
# See tfhub BERT collection for more tfhub models:
# https://tfhub.dev/google/collections/bert/1
export BERT_HUB_URL=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
# Override the configurations by FLAGS. Alternatively, you can directly edit
# `configs/experiments/glue_mnli_matched.yaml` to specify corresponding fields.
export PARAMS=task.train_data.input_path=$INPUT_DATA_DIR/mnli_train.tf_record
export PARAMS=$PARAMS,task.validation_data.input_path=$INPUT_DATA_DIR/mnli_eval.tf_record
export PARAMS=$PARAMS,task.hub_module_url=$BERT_HUB_URL
export PARAMS=$PARAMS,runtime.distribution_strategy=tpu
python3 train.py \
--experiment=bert/sentence_prediction \
--mode=train_and_eval \
--model_dir=$OUTPUT_DIR \
--config_file=configs/experiments/glue_mnli_matched.yaml \
--tfhub_cache_dir=$OUTPUT_DIR/hub_cache \
--tpu=${TPU_NAME} \
--params_override=$PARAMS
```
You can monitor the training progress in the console and find the output
models in `$OUTPUT_DIR`.
### Fine-tuning SQuAD with a pre-trained BERT checkpoint
This example fine-tunes a pre-trained BERT checkpoint on the
Stanford Question Answering Dataset (SQuAD) using TPUs.
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation. After downloading
the SQuAD datasets and the [pre-trained BERT checkpoints](https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md),
you can run the following command to prepare the `tf_record` files:
```shell
export SQUAD_DIR=~/squad
export BERT_DIR=~/uncased_L-12_H-768_A-12
export OUTPUT_DATA_DIR=gs://some_bucket/datasets
python3 create_finetuning_data.py \
--squad_data_file=${SQUAD_DIR}/train-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DATA_DIR}/train.tf_record \
--meta_data_file_path=${OUTPUT_DATA_DIR}/squad_meta_data \
--fine_tuning_task_type=squad --max_seq_length=384
```
Note: To create fine-tuning data with SQuAD 2.0, you need to add flag `--version_2_with_negative=True`.
Then, you can start the training and evaluation jobs:
```shell
export SQUAD_DIR=~/squad
export INPUT_DATA_DIR=gs://some_bucket/datasets
export OUTPUT_DIR=gs://some_bucket/my_output_dir
# See the following link for more pre-trained checkpoints:
# https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md
export BERT_DIR=~/uncased_L-12_H-768_A-12
# Override the configurations by FLAGS. Alternatively, you can directly edit
# `configs/experiments/squad_v1.1.yaml` to specify corresponding fields.
# Also note that the training data is the pre-processed tf_record file, while
# the validation file is the raw json file.
export PARAMS=task.train_data.input_path=$INPUT_DATA_DIR/train.tf_record
export PARAMS=$PARAMS,task.validation_data.input_path=$SQUAD_DIR/dev-v1.1.json
export PARAMS=$PARAMS,task.validation_data.vocab_file=$BERT_DIR/vocab.txt
export PARAMS=$PARAMS,task.init_checkpoint=$BERT_DIR/bert_model.ckpt
export PARAMS=$PARAMS,runtime.distribution_strategy=tpu
python3 train.py \
--experiment=bert/squad \
--mode=train_and_eval \
--model_dir=$OUTPUT_DIR \
--config_file=configs/experiments/squad_v1.1.yaml \
--tpu=${TPU_NAME} \
--params_override=$PARAMS
```
Note: More examples about pre-training will come soon.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The helper for finetuning binaries."""
import json
import math
import sys
from typing import Any, Dict, List, Optional
from absl import logging
import tensorflow as tf
from official.core import config_definitions as cfg
from official.modeling import hyperparams
from official.nlp.configs import encoders
from official.nlp.data import question_answering_dataloader
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.data import tagging_dataloader
from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging
def override_trainer_cfg(trainer_cfg: cfg.TrainerConfig, learning_rate: float,
num_epoch: int, global_batch_size: int,
warmup_ratio: float, training_data_size: int,
eval_data_size: int, num_eval_per_epoch: int,
best_checkpoint_export_subdir: str,
best_checkpoint_eval_metric: str,
best_checkpoint_metric_comp: str):
"""Overrides a `cfg.TrainerConfig` object."""
steps_per_epoch = training_data_size // global_batch_size
train_steps = steps_per_epoch * num_epoch
# TODO(b/165081095): always set to -1 after the bug is resolved.
if eval_data_size:
eval_steps = int(math.ceil(eval_data_size / global_batch_size))
else:
eval_steps = -1 # exhaust the validation data.
warmup_steps = int(train_steps * warmup_ratio)
validation_interval = steps_per_epoch // num_eval_per_epoch
trainer_cfg.override({
'optimizer_config': {
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'decay_steps': train_steps,
'initial_learning_rate': learning_rate,
'end_learning_rate': 0,
}
},
'optimizer': {
'type': 'adamw',
},
'warmup': {
'polynomial': {
'warmup_steps': warmup_steps,
},
'type': 'polynomial',
},
},
'train_steps': train_steps,
'validation_interval': validation_interval,
'validation_steps': eval_steps,
'best_checkpoint_export_subdir': best_checkpoint_export_subdir,
'best_checkpoint_eval_metric': best_checkpoint_eval_metric,
'best_checkpoint_metric_comp': best_checkpoint_metric_comp,
})
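# Worked example (illustrative numbers): with training_data_size=392702 (MNLI),
# global_batch_size=32 and num_epoch=3, steps_per_epoch = 392702 // 32 = 12271
# and train_steps = 36813; warmup_ratio=0.1 then gives 3681 warmup steps.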
def load_model_config_file(model_config_file: str) -> Dict[str, Any]:
"""Loads bert config json file or `encoders.EncoderConfig` in yaml file."""
if not model_config_file:
# model_config_file may be empty when using tf.hub.
return {}
try:
encoder_config = encoders.EncoderConfig()
encoder_config = hyperparams.override_params_dict(
encoder_config, model_config_file, is_strict=True)
logging.info('Load encoder_config yaml file from %s.', model_config_file)
return encoder_config.as_dict()
except KeyError:
pass
logging.info('Load bert config json file from %s', model_config_file)
with tf.io.gfile.GFile(model_config_file, 'r') as reader:
text = reader.read()
config = json.loads(text)
def get_value(key1, key2):
if key1 in config and key2 in config:
raise ValueError('Unexpected that both %s and %s are in config.' %
(key1, key2))
return config[key1] if key1 in config else config[key2]
def get_value_or_none(key):
return config[key] if key in config else None
# Support both legacy bert_config attributes and the new config attributes.
return {
'bert': {
'attention_dropout_rate':
get_value('attention_dropout_rate',
'attention_probs_dropout_prob'),
'dropout_rate':
get_value('dropout_rate', 'hidden_dropout_prob'),
'hidden_activation':
get_value('hidden_activation', 'hidden_act'),
'hidden_size':
config['hidden_size'],
'embedding_size':
get_value_or_none('embedding_size'),
'initializer_range':
config['initializer_range'],
'intermediate_size':
config['intermediate_size'],
'max_position_embeddings':
config['max_position_embeddings'],
'num_attention_heads':
config['num_attention_heads'],
'num_layers':
get_value('num_layers', 'num_hidden_layers'),
'type_vocab_size':
config['type_vocab_size'],
'vocab_size':
config['vocab_size'],
}
}
def override_sentence_prediction_task_config(
task_cfg: sentence_prediction.SentencePredictionConfig,
model_config_file: str,
init_checkpoint: str,
hub_module_url: str,
global_batch_size: int,
train_input_path: str,
validation_input_path: str,
seq_length: int,
num_classes: int,
metric_type: Optional[str] = 'accuracy',
label_type: Optional[str] = 'int'):
"""Overrides a `SentencePredictionConfig` object."""
task_cfg.override({
'init_checkpoint': init_checkpoint,
'metric_type': metric_type,
'model': {
'num_classes': num_classes,
'encoder': load_model_config_file(model_config_file),
},
'hub_module_url': hub_module_url,
'train_data': {
'drop_remainder': True,
'global_batch_size': global_batch_size,
'input_path': train_input_path,
'is_training': True,
'seq_length': seq_length,
'label_type': label_type,
},
'validation_data': {
'drop_remainder': False,
'global_batch_size': global_batch_size,
'input_path': validation_input_path,
'is_training': False,
'seq_length': seq_length,
'label_type': label_type,
}
})
def override_qa_task_config(
task_cfg: question_answering.QuestionAnsweringConfig,
model_config_file: str, init_checkpoint: str, hub_module_url: str,
global_batch_size: int, train_input_path: str, validation_input_path: str,
seq_length: int, tokenization: str, vocab_file: str, do_lower_case: bool,
version_2_with_negative: bool):
"""Overrides a `QuestionAnsweringConfig` object."""
task_cfg.override({
'init_checkpoint': init_checkpoint,
'model': {
'encoder': load_model_config_file(model_config_file),
},
'hub_module_url': hub_module_url,
'train_data': {
'drop_remainder': True,
'global_batch_size': global_batch_size,
'input_path': train_input_path,
'is_training': True,
'seq_length': seq_length,
},
'validation_data': {
'do_lower_case': do_lower_case,
'drop_remainder': False,
'global_batch_size': global_batch_size,
'input_path': validation_input_path,
'is_training': False,
'seq_length': seq_length,
'tokenization': tokenization,
'version_2_with_negative': version_2_with_negative,
'vocab_file': vocab_file,
}
})
def override_tagging_task_config(task_cfg: tagging.TaggingConfig,
model_config_file: str, init_checkpoint: str,
hub_module_url: str, global_batch_size: int,
train_input_path: str,
validation_input_path: str, seq_length: int,
class_names: List[str]):
"""Overrides a `TaggingConfig` object."""
task_cfg.override({
'init_checkpoint': init_checkpoint,
'model': {
'encoder': load_model_config_file(model_config_file),
},
'hub_module_url': hub_module_url,
'train_data': {
'drop_remainder': True,
'global_batch_size': global_batch_size,
'input_path': train_input_path,
'is_training': True,
'seq_length': seq_length,
},
'validation_data': {
'drop_remainder': False,
'global_batch_size': global_batch_size,
'input_path': validation_input_path,
'is_training': False,
'seq_length': seq_length,
},
'class_names': class_names,
})
def write_glue_classification(task,
model,
input_file,
output_file,
predict_batch_size,
seq_length,
class_names,
label_type='int',
min_float_value=None,
max_float_value=None):
"""Makes classification predictions for glue and writes to output file.
Args:
task: `Task` instance.
model: `keras.Model` instance.
input_file: Input test data file path.
output_file: Output test data file path.
predict_batch_size: Batch size for prediction.
seq_length: Input sequence length.
class_names: List of string class names.
label_type: String denoting label type ('int', 'float'), defaults to 'int'.
min_float_value: If set, predictions will be min-clipped to this value (only
for regression when `label_type` is set to 'float'). Defaults to `None`
(no clipping).
max_float_value: If set, predictions will be max-clipped to this value (only
for regression when `label_type` is set to 'float'). Defaults to `None`
(no clipping).
"""
if label_type not in ('int', 'float'):
raise ValueError('Unsupported `label_type`. Given: %s, expected `int` or '
'`float`.' % label_type)
data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
input_path=input_file,
global_batch_size=predict_batch_size,
is_training=False,
seq_length=seq_length,
label_type=label_type,
drop_remainder=False,
include_example_id=True)
predictions = sentence_prediction.predict(task, data_config, model)
if label_type == 'float':
min_float_value = (-sys.float_info.max
if min_float_value is None else min_float_value)
max_float_value = (
sys.float_info.max if max_float_value is None else max_float_value)
# Clip predictions to range [min_float_value, max_float_value].
predictions = [
min(max(prediction, min_float_value), max_float_value)
for prediction in predictions
]
with tf.io.gfile.GFile(output_file, 'w') as writer:
writer.write('index\tprediction\n')
for index, prediction in enumerate(predictions):
if label_type == 'float':
# Regression.
writer.write('%d\t%.3f\n' % (index, prediction))
else:
# Classification.
writer.write('%d\t%s\n' % (index, class_names[prediction]))
def write_xtreme_classification(task,
model,
input_file,
output_file,
predict_batch_size,
seq_length,
class_names,
translated_input_file=None,
test_time_aug_wgt=0.3):
"""Makes classification predictions for xtreme and writes to output file."""
data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
input_path=input_file,
seq_length=seq_length,
is_training=False,
label_type='int',
global_batch_size=predict_batch_size,
drop_remainder=False,
include_example_id=True)
if translated_input_file is not None:
data_config_aug = (
sentence_prediction_dataloader.SentencePredictionDataConfig(
input_path=translated_input_file,
seq_length=seq_length,
is_training=False,
label_type='int',
global_batch_size=predict_batch_size,
drop_remainder=False,
include_example_id=True))
else:
data_config_aug = None
predictions = sentence_prediction.predict(task, data_config, model,
data_config_aug, test_time_aug_wgt)
with tf.io.gfile.GFile(output_file, 'w') as writer:
for prediction in predictions:
writer.write('%s\n' % class_names[prediction])
def write_question_answering(task,
model,
input_file,
output_file,
predict_batch_size,
seq_length,
tokenization,
vocab_file,
do_lower_case,
version_2_with_negative=False):
"""Makes question answering predictions and writes to output file."""
data_config = question_answering_dataloader.QADataConfig(
do_lower_case=do_lower_case,
doc_stride=128,
drop_remainder=False,
global_batch_size=predict_batch_size,
input_path=input_file,
is_training=False,
query_length=64,
seq_length=seq_length,
tokenization=tokenization,
version_2_with_negative=version_2_with_negative,
vocab_file=vocab_file)
all_predictions, _, _ = question_answering.predict(task, data_config, model)
with tf.io.gfile.GFile(output_file, 'w') as writer:
writer.write(json.dumps(all_predictions, indent=4) + '\n')
def write_tagging(task, model, input_file, output_file, predict_batch_size,
seq_length):
"""Makes tagging predictions and writes to output file."""
data_config = tagging_dataloader.TaggingDataConfig(
input_path=input_file,
is_training=False,
seq_length=seq_length,
global_batch_size=predict_batch_size,
drop_remainder=False,
include_sentence_id=True)
results = tagging.predict(task, data_config, model)
class_names = task.task_config.class_names
last_sentence_id = -1
with tf.io.gfile.GFile(output_file, 'w') as writer:
for sentence_id, _, predict_ids in results:
token_labels = [class_names[x] for x in predict_ids]
assert sentence_id == last_sentence_id or (
sentence_id == last_sentence_id + 1)
if sentence_id != last_sentence_id and last_sentence_id != -1:
writer.write('\n')
writer.write('\n'.join(token_labels))
writer.write('\n')
last_sentence_id = sentence_id
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common flags for GLUE finetuning binary."""
from typing import Callable
from absl import flags
from absl import logging
def define_flags():
"""Defines flags."""
# ===========================================================================
# Glue binary flags.
# ===========================================================================
flags.DEFINE_enum(
'mode', 'train_eval_and_predict',
['train_eval_and_predict', 'train_eval', 'predict'],
      'The mode to run the binary. If `train_eval_and_predict`, '
      'it will (1) train on the training data, (2) evaluate on '
      'the validation data, and (3) generate predictions '
      'on the prediction data; if `train_eval`, it will only '
      'run training and evaluation; if `predict`, it will only '
      'run prediction using the model in `model_dir`.')
flags.DEFINE_enum('task_name', None, [
'AX', 'COLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B',
'WNLI'
], 'The type of GLUE task.')
flags.DEFINE_string('train_input_path', None,
'The file path to the training data.')
flags.DEFINE_string('validation_input_path', None,
'The file path to the evaluation data.')
flags.DEFINE_string('test_input_path', None,
'The file path to the test input data.')
flags.DEFINE_string('test_output_path', None,
'The file path to the test output data.')
flags.DEFINE_string('model_dir', '', 'The model directory containing '
'subdirectories for each task. Only needed for "predict" '
'mode. For all other modes, if not provided, a unique '
'directory will be created automatically for each run.')
flags.DEFINE_string(
'input_meta_data_path', None, 'Path to file that contains '
'metadata about input file. It is output by the `create_finetuning_data` '
'binary. Required for all modes except "predict".')
flags.DEFINE_string('init_checkpoint', '',
'Initial checkpoint from a pre-trained BERT model.')
flags.DEFINE_string(
'model_config_file', '', 'The config file specifying the architecture '
'of the pre-trained model. The file can be either a bert_config.json '
'file or `encoders.EncoderConfig` in yaml file.')
flags.DEFINE_string(
      'hub_module_url', '', 'TF-Hub path/url to a pretrained model. If '
      'specified, the `init_checkpoint` and `model_config_file` flags should '
      'not be used.')
flags.DEFINE_multi_string('gin_file', None,
'List of paths to the gin config files.')
flags.DEFINE_multi_string('gin_params', None,
'Newline separated list of gin parameter bindings.')
flags.DEFINE_multi_string(
      'config_file', None, 'Advanced usage: specifies the '
      '`ExperimentConfig` directly. When set, FLAGS related to the '
      '`ExperimentConfig`, such as `train_input_path`, '
      '`validation_input_path` and the following hparams, are ignored.')
# ===========================================================================
# Tuning hparams.
# ===========================================================================
flags.DEFINE_integer('global_batch_size', 32,
'Global batch size for train/eval/predict.')
flags.DEFINE_float('learning_rate', 3e-5, 'Initial learning rate.')
flags.DEFINE_integer('num_epoch', 3, 'Number of training epochs.')
flags.DEFINE_float('warmup_ratio', 0.1,
'Proportion of learning rate warmup steps.')
flags.DEFINE_integer('num_eval_per_epoch', 2,
'Number of evaluations to run per epoch.')
def validate_flags(flags_obj: flags.FlagValues,
file_exists_fn: Callable[[str], bool]):
"""Raises ValueError if any flags are misconfigured.
Args:
    flags_obj: A `flags.FlagValues` object, usually from `flags.FLAGS`.
file_exists_fn: A callable to decide if a file path exists or not.
"""
def _check_path_exists(flag_path, flag_name):
if not file_exists_fn(flag_path):
raise ValueError('Flag `%s` at %s does not exist.' %
(flag_name, flag_path))
def _validate_path(flag_path, flag_name):
if not flag_path:
raise ValueError('Flag `%s` must be provided in mode %s.' %
(flag_name, flags_obj.mode))
_check_path_exists(flag_path, flag_name)
if 'train' in flags_obj.mode:
_validate_path(flags_obj.train_input_path, 'train_input_path')
_validate_path(flags_obj.input_meta_data_path, 'input_meta_data_path')
if flags_obj.gin_file:
for gin_file in flags_obj.gin_file:
_check_path_exists(gin_file, 'gin_file')
if flags_obj.config_file:
for config_file in flags_obj.config_file:
_check_path_exists(config_file, 'config_file')
if 'eval' in flags_obj.mode:
_validate_path(flags_obj.validation_input_path, 'validation_input_path')
if flags_obj.mode == 'predict':
# model_dir is only needed strictly in 'predict' mode.
_validate_path(flags_obj.model_dir, 'model_dir')
if 'predict' in flags_obj.mode:
_validate_path(flags_obj.test_input_path, 'test_input_path')
if not flags_obj.config_file and flags_obj.mode != 'predict':
if flags_obj.hub_module_url:
if flags_obj.init_checkpoint or flags_obj.model_config_file:
raise ValueError(
'When `hub_module_url` is specified, `init_checkpoint` and '
'`model_config_file` should be empty.')
      logging.info(
          'Using the pretrained TF-Hub module from %s',
          flags_obj.hub_module_url)
else:
if not (flags_obj.init_checkpoint and flags_obj.model_config_file):
raise ValueError('Both `init_checkpoint` and `model_config_file` '
'should be specified if `config_file` is not '
'specified.')
_validate_path(flags_obj.model_config_file, 'model_config_file')
logging.info(
'Using the pretrained checkpoint from %s and model_config_file from '
'%s.', flags_obj.init_checkpoint, flags_obj.model_config_file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs prediction to generate submission files for GLUE tasks."""
import functools
import json
import os
import pprint
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
from official.common import distribute_utils
# Imports registered experiment configs.
from official.common import registry_imports # pylint: disable=unused-import
from official.core import exp_factory
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling.hyperparams import params_dict
from official.nlp.finetuning import binary_helper
from official.nlp.finetuning.glue import flags as glue_flags
# Device configs.
flags.DEFINE_string('distribution_strategy', 'tpu',
'The Distribution Strategy to use for training.')
flags.DEFINE_string(
'tpu', '',
'The Cloud TPU to use for training. This should be either the name '
    'used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 URL.')
flags.DEFINE_integer('num_gpus', 1, 'The number of GPUs to use at each worker.')
FLAGS = flags.FLAGS
EXPERIMENT_TYPE = 'bert/sentence_prediction'
BEST_CHECKPOINT_EXPORT_SUBDIR = 'best_ckpt'
EVAL_METRIC_MAP = {
'AX': 'matthews_corrcoef',
'COLA': 'matthews_corrcoef',
'MNLI': 'cls_accuracy',
'MRPC': 'cls_accuracy',
'QNLI': 'cls_accuracy',
'QQP': 'cls_accuracy',
'RTE': 'cls_accuracy',
'SST-2': 'cls_accuracy',
'STS-B': 'pearson_spearman_corr',
'WNLI': 'cls_accuracy',
}
AX_CLASS_NAMES = ['contradiction', 'entailment', 'neutral']
COLA_CLASS_NAMES = ['0', '1']
MNLI_CLASS_NAMES = ['contradiction', 'entailment', 'neutral']
MRPC_CLASS_NAMES = ['0', '1']
QNLI_CLASS_NAMES = ['entailment', 'not_entailment']
QQP_CLASS_NAMES = ['0', '1']
RTE_CLASS_NAMES = ['entailment', 'not_entailment']
SST_2_CLASS_NAMES = ['0', '1']
WNLI_CLASS_NAMES = ['0', '1']
def _override_exp_config_by_file(exp_config, exp_config_files):
"""Overrides an `ExperimentConfig` object by files."""
for exp_config_file in exp_config_files:
if not tf.io.gfile.exists(exp_config_file):
raise ValueError('%s does not exist.' % exp_config_file)
params_dict.override_params_dict(
exp_config, exp_config_file, is_strict=True)
return exp_config
def _override_exp_config_by_flags(exp_config, input_meta_data):
"""Overrides an `ExperimentConfig` object by flags."""
if FLAGS.task_name in ('AX', 'COLA',):
override_task_cfg_fn = functools.partial(
binary_helper.override_sentence_prediction_task_config,
num_classes=input_meta_data['num_labels'],
metric_type='matthews_corrcoef')
elif FLAGS.task_name in ('MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2',
'WNLI'):
override_task_cfg_fn = functools.partial(
binary_helper.override_sentence_prediction_task_config,
num_classes=input_meta_data['num_labels'])
elif FLAGS.task_name in ('STS-B',):
override_task_cfg_fn = functools.partial(
binary_helper.override_sentence_prediction_task_config,
num_classes=1,
metric_type='pearson_spearman_corr',
label_type='float')
else:
raise ValueError('Task %s not supported.' % FLAGS.task_name)
binary_helper.override_trainer_cfg(
exp_config.trainer,
learning_rate=FLAGS.learning_rate,
num_epoch=FLAGS.num_epoch,
global_batch_size=FLAGS.global_batch_size,
warmup_ratio=FLAGS.warmup_ratio,
training_data_size=input_meta_data['train_data_size'],
eval_data_size=input_meta_data['eval_data_size'],
num_eval_per_epoch=FLAGS.num_eval_per_epoch,
best_checkpoint_export_subdir=BEST_CHECKPOINT_EXPORT_SUBDIR,
best_checkpoint_eval_metric=EVAL_METRIC_MAP[FLAGS.task_name],
best_checkpoint_metric_comp='higher')
override_task_cfg_fn(
exp_config.task,
model_config_file=FLAGS.model_config_file,
init_checkpoint=FLAGS.init_checkpoint,
hub_module_url=FLAGS.hub_module_url,
global_batch_size=FLAGS.global_batch_size,
train_input_path=FLAGS.train_input_path,
validation_input_path=FLAGS.validation_input_path,
seq_length=input_meta_data['max_seq_length'])
return exp_config
def _get_exp_config(input_meta_data, exp_config_files):
"""Gets an `ExperimentConfig` object."""
exp_config = exp_factory.get_exp_config(EXPERIMENT_TYPE)
if exp_config_files:
logging.info(
'Loading `ExperimentConfig` from file, and flags will be ignored.')
exp_config = _override_exp_config_by_file(exp_config, exp_config_files)
else:
logging.info('Loading `ExperimentConfig` from flags.')
exp_config = _override_exp_config_by_flags(exp_config, input_meta_data)
exp_config.validate()
exp_config.lock()
pp = pprint.PrettyPrinter()
logging.info('Final experiment parameters: %s',
pp.pformat(exp_config.as_dict()))
return exp_config
def _write_submission_file(task, seq_length):
"""Writes submission files that can be uploaded to the leaderboard."""
tf.io.gfile.makedirs(os.path.dirname(FLAGS.test_output_path))
model = task.build_model()
ckpt_file = tf.train.latest_checkpoint(
os.path.join(FLAGS.model_dir, BEST_CHECKPOINT_EXPORT_SUBDIR))
logging.info('Restoring checkpoints from %s', ckpt_file)
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.read(ckpt_file).expect_partial()
write_fn = binary_helper.write_glue_classification
write_fn_map = {
'AX':
functools.partial(
write_fn, class_names=AX_CLASS_NAMES),
'COLA':
functools.partial(
write_fn, class_names=COLA_CLASS_NAMES),
'MNLI':
functools.partial(
write_fn, class_names=MNLI_CLASS_NAMES),
'MRPC':
functools.partial(
write_fn, class_names=MRPC_CLASS_NAMES),
'QNLI':
functools.partial(
write_fn, class_names=QNLI_CLASS_NAMES),
'QQP':
functools.partial(
write_fn, class_names=QQP_CLASS_NAMES),
'RTE':
functools.partial(
write_fn, class_names=RTE_CLASS_NAMES),
'SST-2':
functools.partial(
write_fn, class_names=SST_2_CLASS_NAMES),
'STS-B':
          # No class_names (regression); clip predictions to [0.0, 5.0] per
          # the GLUE benchmark grader.
functools.partial(
write_fn, class_names=None, label_type='float',
min_float_value=0.0, max_float_value=5.0),
'WNLI':
functools.partial(
write_fn, class_names=WNLI_CLASS_NAMES),
}
logging.info('Predicting %s', FLAGS.test_input_path)
write_fn_map[FLAGS.task_name](
task=task,
model=model,
input_file=FLAGS.test_input_path,
output_file=FLAGS.test_output_path,
predict_batch_size=(
task.task_config.train_data.global_batch_size),
seq_length=seq_length)
def main(argv):
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
glue_flags.validate_flags(FLAGS, file_exists_fn=tf.io.gfile.exists)
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=FLAGS.distribution_strategy,
num_gpus=FLAGS.num_gpus,
tpu_address=FLAGS.tpu)
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
with distribution_strategy.scope():
task = None
if 'train_eval' in FLAGS.mode:
logging.info('Starting training and eval...')
logging.info('Model dir: %s', FLAGS.model_dir)
exp_config = _get_exp_config(
input_meta_data=input_meta_data,
exp_config_files=FLAGS.config_file)
train_utils.serialize_config(exp_config, FLAGS.model_dir)
task = task_factory.get_task(exp_config.task, logging_dir=FLAGS.model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode='train_and_eval',
params=exp_config,
model_dir=FLAGS.model_dir)
if 'predict' in FLAGS.mode:
logging.info('Starting predict...')
# When mode is `predict`, `task` will be None.
if task is None:
exp_config = _get_exp_config(
input_meta_data=input_meta_data,
exp_config_files=[os.path.join(FLAGS.model_dir, 'params.yaml')])
task = task_factory.get_task(
exp_config.task, logging_dir=FLAGS.model_dir)
_write_submission_file(task, input_meta_data['max_seq_length'])
if __name__ == '__main__':
glue_flags.define_flags()
flags.mark_flag_as_required('mode')
flags.mark_flag_as_required('task_name')
app.run(main)
# keras-nlp
## Layers
Layers are the fundamental building blocks for NLP models. They can be used to
assemble new layers, networks, or models.
* [TransformerEncoderBlock](layers/transformer_encoder_block.py) implements
an optionally masked transformer as described in
["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
* [OnDeviceEmbedding](layers/on_device_embedding.py) implements efficient
embedding lookups designed for TPU-based models.
* [PositionalEmbedding](layers/position_embedding.py) creates a positional
embedding as described in ["BERT: Pre-training of Deep Bidirectional
Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805).
* [SelfAttentionMask](layers/self_attention_mask.py) creates a 3D attention
mask from a 2D tensor mask.
* [MaskedLM](layers/masked_lm.py) implements a masked language model. It
assumes the embedding table variable is passed to it.
## Encoders
Encoders are combinations of layers (and possibly other encoders). They are
sub-units of models that would not be trained alone. An encoder encapsulates
common network structures, such as a classification head or a transformer
encoder, in an easily handled object with a standardized configuration.
* [BertEncoder](encoders/bert_encoder.py) implements a bi-directional
Transformer-based encoder as described in
["BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding
lookups, transformer layers and pooling layer.
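As a quick sanity check, a minimal sketch of invoking `BertEncoder` end to end
(hyperparameters are illustrative):
```python
import numpy as np
from official.nlp.keras_nlp.encoders import BertEncoder

encoder = BertEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=3)
batch_size, seq_length = 2, 16
word_ids = np.random.randint(100, size=(batch_size, seq_length))
mask = np.ones((batch_size, seq_length), dtype=np.int32)
type_ids = np.zeros((batch_size, seq_length), dtype=np.int32)

outputs = encoder([word_ids, mask, type_ids])
# outputs is a dict with 'sequence_output' (batch, seq_length, hidden),
# 'pooled_output' (batch, hidden), and per-layer 'encoder_outputs'.
```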
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-NLP package definition."""
# pylint: disable=wildcard-import
from official.nlp.keras_nlp import encoders
from official.nlp.keras_nlp import layers
## Contributing to KerasNLP
Patches to KerasNLP are welcome!
The source-of-truth repository lives under
[TF Model Garden NLP](https://github.com/tensorflow/models/tree/master/official/nlp/keras_nlp),
and is mirrored as a read-only repository under
[keras-team/keras-nlp](https://github.com/keras-team/keras-nlp).
Contributions should be made as PRs to the TF Model Garden repository.
This is to ensure the codebase is rigorously tested with state-of-the-art models
on different accelerators.
In the long run, we will move development to the `keras-team/keras-nlp`
repository.
## :heavy_check_mark: Contributor checklist
1. Ensure you have signed the [Contributor License Agreement](https://cla.developers.google.com/about/google-individual?csw=1).
* All code contributors are required to sign a Contributor License Agreement.
* Please read this [troubleshooting guide](Contributor-License-Agreements#troubleshooting-clas)
if you encounter an issue.
2. Please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
3. Check if your changes are consistent with the [TensorFlow coding style](https://www.tensorflow.org/community/contribute/code_style).
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-NLP layers package definition."""
from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Bert encoder network."""
# pylint: disable=g-classes-have-attributes
import collections
from absl import logging
import tensorflow as tf
from official.nlp.keras_nlp import layers
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class BertEncoder(tf.keras.Model):
"""Bi-directional Transformer-based encoder network.
This network implements a bi-directional Transformer-based encoder as
described in "BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
embedding lookups and transformer layers, but not the masked language model
or classification task networks.
The default values for this object are taken from the BERT-Base implementation
in "BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding".
  *Note* that the network is constructed with the
  [Keras Functional API](https://keras.io/guides/functional_api/).
Args:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
    max_sequence_length: The maximum sequence length that this encoder can
      consume. If `None`, the input sequence length is used. This determines
      the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers
within the transformer layers.
    initializer: The initializer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to
generate embeddings for the input word IDs.
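  Example: a minimal usage sketch (hyperparameters are illustrative):
  ```python
  encoder = BertEncoder(vocab_size=100, hidden_size=32,
                        num_attention_heads=2, num_layers=3)
  outputs = encoder([word_ids, input_mask, type_ids])
  sequence_output = outputs['sequence_output']  # (batch, seq_length, hidden).
  pooled_output = outputs['pooled_output']      # (batch, hidden).
  ```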
"""
def __init__(
self,
vocab_size,
hidden_size=768,
num_layers=12,
num_attention_heads=12,
max_sequence_length=512,
type_vocab_size=16,
inner_dim=3072,
inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
output_dropout=0.1,
attention_dropout=0.1,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
output_range=None,
embedding_width=None,
embedding_layer=None,
**kwargs):
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_word_ids')
mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_mask')
type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_type_ids')
if embedding_width is None:
embedding_width = hidden_size
if embedding_layer is None:
embedding_layer_inst = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
embedding_layer_inst = embedding_layer
word_embeddings = embedding_layer_inst(word_ids)
# Always uses dynamic slicing for simplicity.
position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
position_embeddings = position_embedding_layer(word_embeddings)
type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
type_embeddings = type_embedding_layer(type_ids)
embeddings = tf.keras.layers.Add()(
[word_embeddings, position_embeddings, type_embeddings])
embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
embeddings = embedding_norm_layer(embeddings)
embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
if embedding_width != hidden_size:
embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
embeddings = embedding_projection(embeddings)
else:
embedding_projection = None
transformer_layers = []
data = embeddings
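    # Broadcast the 2D padding mask to a 3D (batch, seq_length, seq_length)
    # attention mask that each transformer block consumes.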
attention_mask = layers.SelfAttentionMask()(data, mask)
encoder_outputs = []
for i in range(num_layers):
if i == num_layers - 1 and output_range is not None:
transformer_output_range = output_range
else:
transformer_output_range = None
layer = layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
transformer_layers.append(layer)
data = layer([data, attention_mask])
encoder_outputs.append(data)
last_encoder_output = encoder_outputs[-1]
# Applying a tf.slice op (through subscript notation) to a Keras tensor
# like this will create a SliceOpLambda layer. This is better than a Lambda
# layer with Python code, because that is fundamentally less portable.
first_token_tensor = last_encoder_output[:, 0, :]
pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
cls_output = pooler_layer(first_token_tensor)
outputs = dict(
sequence_output=encoder_outputs[-1],
pooled_output=cls_output,
encoder_outputs=encoder_outputs,
)
# Once we've created the network using the Functional API, we call
# super().__init__ as though we were invoking the Functional API Model
# constructor, resulting in this object having all the properties of a model
# created using the Functional API. Once super().__init__ is called, we
# can assign attributes to `self` - note that all `self` assignments are
# below this line.
super(BertEncoder, self).__init__(
inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
config_dict = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
}
# We are storing the config dict as a namedtuple here to ensure checkpoint
# compatibility with an earlier version of this model which did not track
# the config dict attribute. TF does not track immutable attrs which
# do not contain Trackables, so by creating a config namedtuple instead of
# a dict we avoid tracking it.
config_cls = collections.namedtuple('Config', config_dict.keys())
self._config = config_cls(**config_dict)
self._pooler_layer = pooler_layer
self._transformer_layers = transformer_layers
self._embedding_norm_layer = embedding_norm_layer
self._embedding_layer = embedding_layer_inst
self._position_embedding_layer = position_embedding_layer
self._type_embedding_layer = type_embedding_layer
if embedding_projection is not None:
self._embedding_projection = embedding_projection
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return dict(self._config._asdict())
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
if 'embedding_layer' in config and config['embedding_layer'] is not None:
      warn_string = (
          'You are reloading a model that was saved with a '
          'potentially-shared embedding layer object. If you continue to '
          'train this model, the embedding layer will no longer be shared. '
          'To work around this, load the model outside of the Keras API.')
      print('WARNING: ' + warn_string)
      logging.warning(warn_string)
return cls(**config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer-based bert encoder network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.encoders import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(BertEncoderTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
    # When the policy is mixed_float16, the sequence output is float32 (from
    # the final layer norm) and the pooled output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence", None, 21),
("output_range", 1, 1),
)
def test_network_invocation(self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
vocab_size=100,
hidden_size=32,
num_layers=3,
num_attention_heads=2,
max_sequence_length=21,
type_vocab_size=12,
inner_dim=1223,
inner_activation="relu",
output_dropout=0.05,
attention_dropout=0.22,
initializer="glorot_uniform",
output_range=-1,
embedding_width=16,
embedding_layer=None)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
tf.keras.activations.get(expected_config["inner_activation"]))
expected_config["initializer"] = tf.keras.initializers.serialize(
tf.keras.initializers.get(expected_config["initializer"]))
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = bert_encoder.BertEncoder.from_config(network.get_config())
# Validate that the config can be forced to JSON.
_ = network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
# Tests model saving/loading.
model_path = self.get_temp_dir() + "/model"
network.save(model_path)
_ = tf.keras.models.load_model(model_path)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-NLP layers package definition."""
from official.nlp.keras_nlp.layers.masked_lm import MaskedLM
from official.nlp.keras_nlp.layers.on_device_embedding import OnDeviceEmbedding
from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding
from official.nlp.keras_nlp.layers.self_attention_mask import SelfAttentionMask
from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Masked language model network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class MaskedLM(tf.keras.layers.Layer):
"""Masked language model network head for BERT modeling.
This layer implements a masked language model based on the provided
transformer based encoder. It assumes that the encoder network being passed
has a "get_embedding_table()" method.
Example:
```python
encoder=keras_nlp.BertEncoder(...)
lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table())
```
Args:
embedding_table: The embedding table from encoder network.
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
uniform initializer.
output: The output style for this layer. Can be either 'logits' or
'predictions'.
"""
def __init__(self,
embedding_table,
activation=None,
initializer='glorot_uniform',
output='logits',
name=None,
**kwargs):
super(MaskedLM, self).__init__(name=name, **kwargs)
self.embedding_table = embedding_table
self.activation = activation
self.initializer = tf.keras.initializers.get(initializer)
if output not in ('predictions', 'logits'):
raise ValueError(
('Unknown `output` value "%s". `output` can be either "logits" or '
'"predictions"') % output)
self._output_type = output
def build(self, input_shape):
self._vocab_size, hidden_size = self.embedding_table.shape
self.dense = tf.keras.layers.Dense(
hidden_size,
activation=self.activation,
kernel_initializer=self.initializer,
name='transform/dense')
self.layer_norm = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='transform/LayerNorm')
self.bias = self.add_weight(
'output_bias/bias',
shape=(self._vocab_size,),
initializer='zeros',
trainable=True)
super(MaskedLM, self).build(input_shape)
def call(self, sequence_data, masked_positions):
masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
lm_data = self.dense(masked_lm_input)
lm_data = self.layer_norm(lm_data)
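    # Project back to vocabulary logits by reusing the (tied) embedding table
    # as the output projection, then add a per-token output bias.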
lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
logits = tf.nn.bias_add(lm_data, self.bias)
masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape(
masked_positions)[1]
logits = tf.reshape(logits,
[-1, masked_positions_length, self._vocab_size])
if self._output_type == 'logits':
return logits
return tf.nn.log_softmax(logits)
def get_config(self):
raise NotImplementedError('MaskedLM cannot be directly serialized because '
'it has variable sharing logic.')
def _gather_indexes(self, sequence_tensor, positions):
"""Gathers the vectors at the specific positions, for performance.
Args:
sequence_tensor: Sequence output of shape
(`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
hidden units.
positions: Positions ids of tokens in sequence to mask for pretraining
of with dimension (batch_size, num_predictions) where
`num_predictions` is maximum number of tokens to mask out and predict
per each sequence.
Returns:
Masked out sequence tensor of shape (batch_size * num_predictions,
num_hidden).
"""
sequence_shape = tf.shape(sequence_tensor)
batch_size, seq_length = sequence_shape[0], sequence_shape[1]
width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]
flat_offsets = tf.reshape(
tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence_tensor = tf.reshape(sequence_tensor,
[batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based one-hot embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class OnDeviceEmbedding(tf.keras.layers.Layer):
"""Performs an embedding lookup suitable for accelerator devices.
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
Args:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
    use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
      lookup. Defaults to False (that is, using tf.gather). Setting this
      option to True may improve performance, especially on small vocabulary
      sizes, but will generally require more memory.
    scale_factor: Whether to scale the output embeddings. Defaults to None
      (that is, no scaling). Setting this option to a float multiplies the
      output embeddings by scale_factor.
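  Example: a minimal usage sketch (shapes are illustrative):
  ```python
  layer = OnDeviceEmbedding(vocab_size=100, embedding_width=16)
  ids = tf.constant([[4, 10, 2]])  # (batch, sequence) integer ids.
  vectors = layer(ids)             # (1, 3, 16) float embeddings.
  ```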
"""
def __init__(self,
vocab_size,
embedding_width,
initializer="glorot_uniform",
use_one_hot=False,
scale_factor=None,
**kwargs):
super(OnDeviceEmbedding, self).__init__(**kwargs)
self._vocab_size = vocab_size
self._embedding_width = embedding_width
self._initializer = initializer
self._use_one_hot = use_one_hot
self._scale_factor = scale_factor
def get_config(self):
config = {
"vocab_size": self._vocab_size,
"embedding_width": self._embedding_width,
"initializer": self._initializer,
"use_one_hot": self._use_one_hot,
"scale_factor": self._scale_factor,
}
base_config = super(OnDeviceEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self.embeddings = self.add_weight(
"embeddings",
shape=[self._vocab_size, self._embedding_width],
initializer=self._initializer,
dtype=tf.float32)
super(OnDeviceEmbedding, self).build(input_shape)
def call(self, inputs):
flat_inputs = tf.reshape(inputs, [-1])
if self._use_one_hot:
dtype = self._compute_dtype
if not tf.dtypes.as_dtype(dtype).is_floating:
# TensorFlow 1 compatibility. In TF1, self._compute_dtype is int32
# instead of a floating-point dtype, as the dtype is inferred from the
        # dtype of the inputs.
dtype = tf.float32
one_hot_data = tf.one_hot(
flat_inputs, depth=self._vocab_size, dtype=dtype)
embeddings = tf.matmul(one_hot_data, self.embeddings)
else:
embeddings = tf.gather(self.embeddings, flat_inputs)
embeddings = tf.reshape(
embeddings,
# Work around b/142213824: prefer concat to shape over a Python list.
tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
if self._scale_factor:
embeddings *= self._scale_factor
return embeddings
@property
def vocab_size(self):
return self._vocab_size
@property
def embedding_width(self):
return self._embedding_width
@@ -1,4 +1,4 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,18 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Tests for Keras-based one-hot embedding layer."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Tests for Keras-based one-hot embedding layer."""
 import numpy as np
 import tensorflow as tf
 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
-from official.nlp.modeling.layers import on_device_embedding
+from official.nlp.keras_nlp.layers import on_device_embedding
 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
@@ -49,9 +45,9 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_layer_creation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
-        vocab_size=vocab_size, embedding_width=embedding_width, dtype=policy)
+        vocab_size=vocab_size, embedding_width=embedding_width,
+        dtype="mixed_float16")
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
     input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
@@ -87,10 +83,9 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_layer_invocation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
         vocab_size=vocab_size, embedding_width=embedding_width,
-        dtype=policy)
+        dtype="mixed_float16")
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
     input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
@@ -128,11 +123,10 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_one_hot_layer_creation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
         vocab_size=vocab_size,
         embedding_width=embedding_width,
-        dtype=policy,
+        dtype="mixed_float16",
         use_one_hot=True)
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
@@ -171,11 +165,10 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_one_hot_layer_invocation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
         vocab_size=vocab_size,
         embedding_width=embedding_width,
-        dtype=policy,
+        dtype="mixed_float16",
         use_one_hot=True)
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
@@ -193,6 +186,28 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
     output = model.predict(input_data)
     self.assertEqual(tf.float16, output.dtype)
+  def test_use_scale_layer_invocation(self):
+    vocab_size = 31
+    embedding_width = 27
+    test_layer = on_device_embedding.OnDeviceEmbedding(
+        vocab_size=vocab_size, embedding_width=embedding_width,
+        scale_factor=embedding_width**0.5)
+    # Create a 2-dimensional input (the first dimension is implicit).
+    sequence_length = 23
+    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
+    output_tensor = test_layer(input_tensor)
+    # Create a model from the test layer.
+    model = tf.keras.Model(input_tensor, output_tensor)
+    # Invoke the model on test data. We can't validate the output data itself
+    # (the NN is too complex) but this will rule out structural runtime errors.
+    batch_size = 3
+    input_data = np.random.randint(
+        vocab_size, size=(batch_size, sequence_length))
+    output = model.predict(input_data)
+    self.assertEqual(tf.float32, output.dtype)
 if __name__ == "__main__":
   tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based positional embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class PositionEmbedding(tf.keras.layers.Layer):
"""Creates a positional embedding.
Example:
```python
position_embedding = PositionEmbedding(max_length=100)
inputs = tf.keras.Input((100, 32), dtype=tf.float32)
outputs = position_embedding(inputs)
```
Args:
max_length: The maximum size of the dynamic sequence.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
Reference: This layer creates a positional embedding as described in
[BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805).
"""
def __init__(self,
max_length,
initializer="glorot_uniform",
**kwargs):
super(PositionEmbedding, self).__init__(**kwargs)
if max_length is None:
      raise ValueError(
          "`max_length` must be an integer, not `None`."
      )
self._max_length = max_length
self._initializer = tf.keras.initializers.get(initializer)
def get_config(self):
config = {
"max_length": self._max_length,
"initializer": tf.keras.initializers.serialize(self._initializer),
}
base_config = super(PositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
dimension_list = input_shape.as_list()
if len(dimension_list) != 3:
raise ValueError("PositionEmbedding expects a 3-dimensional input tensor "
"of shape [batch, sequence, width], got "
"{}".format(input_shape))
seq_length = dimension_list[1]
width = dimension_list[2]
if self._max_length is not None:
weight_sequence_length = self._max_length
else:
weight_sequence_length = seq_length
self._position_embeddings = self.add_weight(
"embeddings",
shape=[weight_sequence_length, width],
initializer=self._initializer)
super(PositionEmbedding, self).build(input_shape)
def call(self, inputs):
input_shape = tf.shape(inputs)
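    # Slice the stored (max_length, width) embedding table down to the actual
    # input length, then broadcast it across the batch dimension.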
position_embeddings = self._position_embeddings[:input_shape[1], :]
return tf.broadcast_to(position_embeddings, input_shape)