Commit b0ccdb11 authored by Shixin Luo

resolve conflict with master

parents e61588cd 1611a8c5
@@ -25,8 +25,8 @@ import tempfile
 from absl import logging
 import tensorflow as tf
 from tensorflow.python.util import deprecation
+from official.common import distribute_utils
 from official.staging.training import grad_utils
-from official.utils.misc import distribution_utils
 _SUMMARY_TXT = 'training_summary.txt'
 _MIN_SUMMARY_STEPS = 10
@@ -164,9 +164,9 @@ def run_customized_training_loop(
 evaluation is skipped.
 eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
 is not none.
-metric_fn: A metrics function that returns a Keras Metric object to record
-evaluation result using evaluation dataset or with training dataset
-after every epoch.
+metric_fn: A metrics function that returns either a Keras Metric object or
+a list of Keras Metric objects to record evaluation result using
+evaluation dataset or with training dataset after every epoch.
 init_checkpoint: Optional checkpoint to load to `sub_model` returned by
 `model_fn`.
 custom_callbacks: A list of Keras Callbacks objects to run during
@@ -266,7 +266,7 @@ def run_customized_training_loop(
 train_iterator = _get_input_iterator(train_input_fn, strategy)
 eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-with distribution_utils.get_strategy_scope(strategy):
+with distribute_utils.get_strategy_scope(strategy):
 # To correctly place the model weights on accelerators,
 # model and optimizer should be created in scope.
 model, sub_model = model_fn()
@@ -291,7 +291,9 @@ def run_customized_training_loop(
 logging.info('Loading from checkpoint file completed')
 train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-eval_metrics = [metric_fn()] if metric_fn else []
+eval_metrics = metric_fn() if metric_fn else []
+if not isinstance(eval_metrics, list):
+  eval_metrics = [eval_metrics]
 # If evaluation is required, make a copy of metric as it will be used by
 # both train and evaluation.
 train_metrics = [
...
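The hunk above relaxes `metric_fn` so it may return either a single Keras metric or a list of metrics. A minimal sketch of both forms, using illustrative metric choices that are not part of this commit:

```python
import tensorflow as tf

def single_metric_fn():
  # A single metric object: wrapped into a one-element list by the loop code above.
  return tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)

def multi_metric_fn():
  # A list of metrics: passed through unchanged.
  return [
      tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32),
      tf.keras.metrics.Mean('masked_lm_loss', dtype=tf.float32),
  ]

# Mirrors the normalization now done in run_customized_training_loop.
for metric_fn in (single_metric_fn, multi_metric_fn):
  eval_metrics = metric_fn() if metric_fn else []
  if not isinstance(eval_metrics, list):
    eval_metrics = [eval_metrics]
  assert all(isinstance(m, tf.keras.metrics.Metric) for m in eval_metrics)
```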
@@ -28,6 +28,7 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
@@ -35,7 +36,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_saving_utils
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
 flags.DEFINE_enum(
@@ -447,7 +447,7 @@ def custom_main(custom_callbacks=None, custom_metrics=None):
 FLAGS.model_dir)
 return
-strategy = distribution_utils.get_distribution_strategy(
+strategy = distribute_utils.get_distribution_strategy(
 distribution_strategy=FLAGS.distribution_strategy,
 num_gpus=FLAGS.num_gpus,
 tpu_address=FLAGS.tpu)
...
@@ -23,6 +23,7 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
@@ -30,7 +31,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_training_utils
-from official.utils.misc import distribution_utils
 flags.DEFINE_string('input_files', None,
@@ -205,9 +205,8 @@ def main(_):
 FLAGS.model_dir = '/tmp/bert20/'
 # Configures cluster spec for multi-worker distribution strategy.
 if FLAGS.num_gpus > 0:
-_ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                         FLAGS.task_index)
-strategy = distribution_utils.get_distribution_strategy(
+_ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+strategy = distribute_utils.get_distribution_strategy(
 distribution_strategy=FLAGS.distribution_strategy,
 num_gpus=FLAGS.num_gpus,
 all_reduce_alg=FLAGS.all_reduce_alg,
...
@@ -28,12 +28,11 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import run_squad_helper
 from official.nlp.bert import tokenization
 from official.nlp.data import squad_lib as squad_lib_wp
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
@@ -105,9 +104,8 @@ def main(_):
 # Configures cluster spec for multi-worker distribution strategy.
 if FLAGS.num_gpus > 0:
-_ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                         FLAGS.task_index)
-strategy = distribution_utils.get_distribution_strategy(
+_ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+strategy = distribute_utils.get_distribution_strategy(
 distribution_strategy=FLAGS.distribution_strategy,
 num_gpus=FLAGS.num_gpus,
 all_reduce_alg=FLAGS.all_reduce_alg,
...
@@ -15,7 +15,8 @@
 """A converter from a V1 BERT encoder checkpoint to a V2 encoder checkpoint.
 The conversion will yield an object-oriented checkpoint that can be used
-to restore a TransformerEncoder object.
+to restore a BertEncoder or BertPretrainerV2 object (see the `converted_model`
+FLAG below).
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -27,9 +28,10 @@ from absl import app
 from absl import flags
 import tensorflow as tf
-from official.modeling import activations
+from official.modeling import tf_utils
 from official.nlp.bert import configs
 from official.nlp.bert import tf1_checkpoint_converter_lib
+from official.nlp.modeling import models
 from official.nlp.modeling import networks
 FLAGS = flags.FLAGS
@@ -46,6 +48,10 @@ flags.DEFINE_string("checkpoint_model_name", "encoder",
 "The name of the model when saving the checkpoint, i.e., "
 "the checkpoint will be saved using: "
 "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).")
+flags.DEFINE_enum(
+    "converted_model", "encoder", ["encoder", "pretrainer"],
+    "Whether to convert the checkpoint to a `BertEncoder` model or a "
+    "`BertPretrainerV2` model (with mlm but without classification heads).")
 def _create_bert_model(cfg):
@@ -55,7 +61,7 @@ def _create_bert_model(cfg):
 cfg: A `BertConfig` to create the core model.
 Returns:
-A TransformerEncoder netowork.
+A BertEncoder network.
 """
 bert_encoder = networks.BertEncoder(
 vocab_size=cfg.vocab_size,
@@ -63,7 +69,7 @@ def _create_bert_model(cfg):
 num_layers=cfg.num_hidden_layers,
 num_attention_heads=cfg.num_attention_heads,
 intermediate_size=cfg.intermediate_size,
-activation=activations.gelu,
+activation=tf_utils.get_activation(cfg.hidden_act),
 dropout_rate=cfg.hidden_dropout_prob,
 attention_dropout_rate=cfg.attention_probs_dropout_prob,
 max_sequence_length=cfg.max_position_embeddings,
@@ -75,8 +81,29 @@ def _create_bert_model(cfg):
 return bert_encoder
-def convert_checkpoint(bert_config, output_path, v1_checkpoint,
-                       checkpoint_model_name="model"):
+def _create_bert_pretrainer_model(cfg):
+  """Creates a BERT keras core model from BERT configuration.
+  Args:
+    cfg: A `BertConfig` to create the core model.
+  Returns:
+    A BertPretrainerV2 model.
+  """
+  bert_encoder = _create_bert_model(cfg)
+  pretrainer = models.BertPretrainerV2(
+      encoder_network=bert_encoder,
+      mlm_activation=tf_utils.get_activation(cfg.hidden_act),
+      mlm_initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=cfg.initializer_range))
+  return pretrainer
+def convert_checkpoint(bert_config,
+                       output_path,
+                       v1_checkpoint,
+                       checkpoint_model_name="model",
+                       converted_model="encoder"):
 """Converts a V1 checkpoint into an OO V2 checkpoint."""
 output_dir, _ = os.path.split(output_path)
 tf.io.gfile.makedirs(output_dir)
@@ -84,6 +111,7 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
 # Create a temporary V1 name-converted checkpoint in the output directory.
 temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1")
 temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt")
 tf1_checkpoint_converter_lib.convert(
 checkpoint_from_path=v1_checkpoint,
 checkpoint_to_path=temporary_checkpoint,
@@ -92,8 +120,14 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
 permutations=tf1_checkpoint_converter_lib.BERT_V2_PERMUTATIONS,
 exclude_patterns=["adam", "Adam"])
-# Create a V2 checkpoint from the temporary checkpoint.
-model = _create_bert_model(bert_config)
+if converted_model == "encoder":
+  model = _create_bert_model(bert_config)
+elif converted_model == "pretrainer":
+  model = _create_bert_pretrainer_model(bert_config)
+else:
+  raise ValueError("Unsupported converted_model: %s" % converted_model)
+# Create a V2 checkpoint from the temporary checkpoint.
 tf1_checkpoint_converter_lib.create_v2_checkpoint(model, temporary_checkpoint,
                                                   output_path,
                                                   checkpoint_model_name)
@@ -106,13 +140,21 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
 pass
-def main(_):
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
 output_path = FLAGS.converted_checkpoint_path
 v1_checkpoint = FLAGS.checkpoint_to_convert
 checkpoint_model_name = FLAGS.checkpoint_model_name
+converted_model = FLAGS.converted_model
 bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
-convert_checkpoint(bert_config, output_path, v1_checkpoint,
-                   checkpoint_model_name)
+convert_checkpoint(
+    bert_config=bert_config,
+    output_path=output_path,
+    v1_checkpoint=v1_checkpoint,
+    checkpoint_model_name=checkpoint_model_name,
+    converted_model=converted_model)
 if __name__ == "__main__":
...
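A hedged sketch of driving the converter with the new `converted_model` option. The checkpoint and config paths below are placeholders, and the module path `official.nlp.bert.tf2_encoder_checkpoint_converter` is an assumption, not something stated in this commit:

```python
from official.nlp.bert import configs
from official.nlp.bert import tf2_encoder_checkpoint_converter as converter  # assumed path

bert_config = configs.BertConfig.from_json_file('/path/to/bert_config.json')
converter.convert_checkpoint(
    bert_config=bert_config,
    output_path='/path/to/converted/bert_v2.ckpt',
    v1_checkpoint='/path/to/v1/bert_model.ckpt',
    checkpoint_model_name='model',
    # 'pretrainer' wraps the encoder in BertPretrainerV2 (with the MLM head);
    # 'encoder' converts only the BertEncoder weights.
    converted_model='pretrainer')
```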
@@ -421,7 +421,7 @@ def preprocess_text(inputs, remove_space=True, lower=False):
 """Preprocesses data by removing extra space and normalize data.
 This method is used together with sentence piece tokenizer and is forked from:
-https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py
 Args:
 inputs: The input text.
@@ -454,7 +454,7 @@ def encode_pieces(sp_model, text, sample=False):
 """Segements text into pieces.
 This method is used together with sentence piece tokenizer and is forked from:
-https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py
 Args:
@@ -496,7 +496,7 @@ def encode_ids(sp_model, text, sample=False):
 """Segments text and return token ids.
 This method is used together with sentence piece tokenizer and is forked from:
-https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py
 Args:
 sp_model: A spm.SentencePieceProcessor object.
...
@@ -26,8 +26,9 @@ import tensorflow as tf
 from official.modeling import hyperparams
 from official.modeling import tf_utils
-from official.nlp.modeling import layers
+from official.nlp import keras_nlp
 from official.nlp.modeling import networks
+from official.nlp.projects.bigbird import encoder as bigbird_encoder
 @dataclasses.dataclass
@@ -60,18 +61,18 @@ class MobileBertEncoderConfig(hyperparams.Config):
 num_blocks: number of transformer block in the encoder model.
 hidden_size: the hidden size for the transformer block.
 num_attention_heads: number of attention heads in the transformer block.
-intermediate_size: the size of the "intermediate" (a.k.a., feed
-forward) layer.
-intermediate_act_fn: the non-linear activation function to apply
-to the output of the intermediate/feed-forward layer.
+intermediate_size: the size of the "intermediate" (a.k.a., feed forward)
+layer.
+intermediate_act_fn: the non-linear activation function to apply to the
+output of the intermediate/feed-forward layer.
 hidden_dropout_prob: dropout probability for the hidden layers.
 attention_probs_dropout_prob: dropout probability of the attention
 probabilities.
 intra_bottleneck_size: the size of bottleneck.
 initializer_range: The stddev of the truncated_normal_initializer for
 initializing all weight matrices.
-key_query_shared_bottleneck: whether to share linear transformation for
-keys and queries.
+key_query_shared_bottleneck: whether to share linear transformation for keys
+and queries.
 num_feedforward_networks: number of stacked feed-forward networks.
 normalization_type: the type of normalization_type, only 'no_norm' and
 'layer_norm' are supported. 'no_norm' represents the element-wise linear
@@ -79,8 +80,6 @@ class MobileBertEncoderConfig(hyperparams.Config):
 MobileBERT paper. 'layer_norm' is used for the teacher model.
 classifier_activation: if using the tanh activation for the final
 representation of the [CLS] token in fine-tuning.
-return_all_layers: if return all layer outputs.
-return_attention_score: if return attention scores for each layer.
 """
 word_vocab_size: int = 30522
 word_embed_size: int = 128
@@ -99,8 +98,6 @@ class MobileBertEncoderConfig(hyperparams.Config):
 num_feedforward_networks: int = 1
 normalization_type: str = "layer_norm"
 classifier_activation: bool = True
-return_all_layers: bool = False
-return_attention_score: bool = False
 @dataclasses.dataclass
@@ -120,25 +117,47 @@ class AlbertEncoderConfig(hyperparams.Config):
 initializer_range: float = 0.02
+@dataclasses.dataclass
+class BigBirdEncoderConfig(hyperparams.Config):
+  """BigBird encoder configuration."""
+  vocab_size: int = 50358
+  hidden_size: int = 768
+  num_layers: int = 12
+  num_attention_heads: int = 12
+  hidden_activation: str = "gelu"
+  intermediate_size: int = 3072
+  dropout_rate: float = 0.1
+  attention_dropout_rate: float = 0.1
+  max_position_embeddings: int = 4096
+  num_rand_blocks: int = 3
+  block_size: int = 64
+  type_vocab_size: int = 16
+  initializer_range: float = 0.02
+  embedding_size: Optional[int] = None
 @dataclasses.dataclass
 class EncoderConfig(hyperparams.OneOfConfig):
 """Encoder configuration."""
 type: Optional[str] = "bert"
 albert: AlbertEncoderConfig = AlbertEncoderConfig()
 bert: BertEncoderConfig = BertEncoderConfig()
+bigbird: BigBirdEncoderConfig = BigBirdEncoderConfig()
 mobilebert: MobileBertEncoderConfig = MobileBertEncoderConfig()
 ENCODER_CLS = {
 "bert": networks.BertEncoder,
 "mobilebert": networks.MobileBERTEncoder,
-"albert": networks.AlbertTransformerEncoder,
+"albert": networks.AlbertEncoder,
+"bigbird": bigbird_encoder.BigBirdEncoder,
 }
 @gin.configurable
-def build_encoder(config: EncoderConfig,
-                  embedding_layer: Optional[layers.OnDeviceEmbedding] = None,
+def build_encoder(
+    config: EncoderConfig,
+    embedding_layer: Optional[keras_nlp.layers.OnDeviceEmbedding] = None,
 encoder_cls=None,
 bypass_config: bool = False):
 """Instantiate a Transformer encoder network from EncoderConfig.
@@ -188,7 +207,8 @@ def build_encoder(config: EncoderConfig,
 pooled_output_dim=encoder_cfg.hidden_size,
 pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
 stddev=encoder_cfg.initializer_range),
-return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs)
+return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs,
+dict_outputs=True)
 return encoder_cls(**kwargs)
 if encoder_type == "mobilebert":
@@ -205,12 +225,11 @@ def build_encoder(config: EncoderConfig,
 hidden_dropout_prob=encoder_cfg.hidden_dropout_prob,
 attention_probs_dropout_prob=encoder_cfg.attention_probs_dropout_prob,
 intra_bottleneck_size=encoder_cfg.intra_bottleneck_size,
+initializer_range=encoder_cfg.initializer_range,
 key_query_shared_bottleneck=encoder_cfg.key_query_shared_bottleneck,
 num_feedforward_networks=encoder_cfg.num_feedforward_networks,
 normalization_type=encoder_cfg.normalization_type,
-classifier_activation=encoder_cfg.classifier_activation,
-return_all_layers=encoder_cfg.return_all_layers,
-return_attention_score=encoder_cfg.return_attention_score)
+classifier_activation=encoder_cfg.classifier_activation)
 if encoder_type == "albert":
 return encoder_cls(
@@ -226,7 +245,26 @@ def build_encoder(config: EncoderConfig,
 dropout_rate=encoder_cfg.dropout_rate,
 attention_dropout_rate=encoder_cfg.attention_dropout_rate,
 initializer=tf.keras.initializers.TruncatedNormal(
-stddev=encoder_cfg.initializer_range))
+stddev=encoder_cfg.initializer_range),
+dict_outputs=True)
+if encoder_type == "bigbird":
+  return encoder_cls(
+      vocab_size=encoder_cfg.vocab_size,
+      hidden_size=encoder_cfg.hidden_size,
+      num_layers=encoder_cfg.num_layers,
+      num_attention_heads=encoder_cfg.num_attention_heads,
+      intermediate_size=encoder_cfg.intermediate_size,
+      activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
+      dropout_rate=encoder_cfg.dropout_rate,
+      attention_dropout_rate=encoder_cfg.attention_dropout_rate,
+      num_rand_blocks=encoder_cfg.num_rand_blocks,
+      block_size=encoder_cfg.block_size,
+      max_sequence_length=encoder_cfg.max_position_embeddings,
+      type_vocab_size=encoder_cfg.type_vocab_size,
+      initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=encoder_cfg.initializer_range),
+      embedding_width=encoder_cfg.embedding_size)
 # Uses the default BERTEncoder configuration schema to create the encoder.
 # If it does not match, please add a switch branch by the encoder type.
@@ -245,4 +283,5 @@ def build_encoder(config: EncoderConfig,
 stddev=encoder_cfg.initializer_range),
 embedding_width=encoder_cfg.embedding_size,
 embedding_layer=embedding_layer,
-return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs)
+return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
+dict_outputs=True)
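A minimal sketch of selecting the new BigBird branch through the `OneOfConfig`-based `EncoderConfig`. The module path (`official.nlp.configs.encoders`) and the hyperparameter values are assumptions for illustration, not values taken from this commit:

```python
from official.nlp.configs import encoders  # assumed module path for this config file

config = encoders.EncoderConfig(
    type='bigbird',
    bigbird=encoders.BigBirdEncoderConfig(
        vocab_size=50358, num_layers=2, num_attention_heads=4, block_size=64))
encoder = encoders.build_encoder(config)  # a bigbird_encoder.BigBirdEncoder instance
```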
# keras-nlp
## Layers
Layers are the fundamental building blocks for NLP models. They can be used to
assemble new layers, networks, or models; a short usage sketch follows the list
below.
* [TransformerEncoderBlock](layers/transformer_encoder_block.py) implements
an optionally masked transformer as described in
["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
* [OnDeviceEmbedding](layers/on_device_embedding.py) implements efficient
embedding lookups designed for TPU-based models.
* [PositionEmbedding](layers/position_embedding.py) creates a positional
embedding as described in ["BERT: Pre-training of Deep Bidirectional
Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805).
* [SelfAttentionMask](layers/self_attention_mask.py) creates a 3D attention
mask from a 2D tensor mask.
* [MaskedLM](layers/masked_lm.py) implements a masked language model. It
assumes the embedding table variable is passed to it.
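A minimal sketch of composing these layers into a tiny encoder block; the sizes are illustrative only:

```python
import tensorflow as tf
from official.nlp.keras_nlp import layers

word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32)
mask = tf.keras.Input(shape=(None,), dtype=tf.int32)

# Token + position embeddings, followed by one transformer block.
embeddings = layers.OnDeviceEmbedding(vocab_size=100, embedding_width=32)(word_ids)
embeddings = embeddings + layers.PositionEmbedding(max_length=64)(embeddings)
attention_mask = layers.SelfAttentionMask()(embeddings, mask)
outputs = layers.TransformerEncoderBlock(
    num_attention_heads=2, inner_dim=128, inner_activation='relu')(
        [embeddings, attention_mask])

model = tf.keras.Model([word_ids, mask], outputs)
```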
## Encoders
Encoders are combinations of layers (and possibly other encoders). They are
sub-units of models that would not be trained alone. They encapsulate common
network structures, such as a classification head or a transformer encoder,
into easily handled objects with a standardized configuration. A usage sketch
follows the list below.
* [BertEncoder](encoders/bert_encoder.py) implements a bi-directional
Transformer-based encoder as described in
["BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding
lookups, transformer layers and pooling layer.
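A hedged usage sketch of `BertEncoder` with small test-sized hyperparameters (not the BERT-Base defaults):

```python
import tensorflow as tf
from official.nlp.keras_nlp.encoders import bert_encoder

encoder = bert_encoder.BertEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=3)

word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32)
mask = tf.keras.Input(shape=(16,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(16,), dtype=tf.int32)

outputs = encoder([word_ids, mask, type_ids])
sequence_output = outputs['sequence_output']  # [batch, seq_len, hidden_size]
pooled_output = outputs['pooled_output']      # [batch, hidden_size]
all_layers = outputs['encoder_outputs']       # list of per-layer activations
```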
@@ -14,4 +14,5 @@
 # ==============================================================================
 """Keras-NLP package definition."""
 # pylint: disable=wildcard-import
-from official.nlp.keras_nlp.layers import *
+from official.nlp.keras_nlp import encoders
+from official.nlp.keras_nlp import layers
## Contributing to KerasNLP
Patches to KerasNLP are welcome!
The source-of-truth repository lives under
[TF Model Garden NLP](https://github.com/tensorflow/models/tree/master/official/nlp/keras_nlp),
and is mirrored as a read-only repository under
[keras-team/keras-nlp](https://github.com/keras-team/keras-nlp).
Contributions should be made as PRs to the TF Model Garden repository.
This ensures the codebase is rigorously tested with state-of-the-art models
on different accelerators.
In the long run, we will move development to the `keras-team/keras-nlp` repository.
## :heavy_check_mark: Contributor checklist
1. Ensure you have signed the [Contributor License Agreement](https://cla.developers.google.com/about/google-individual?csw=1).
* All code contributors are required to sign a Contributor License Agreement.
* Please read this [troubleshooting guide](Contributor-License-Agreements#troubleshooting-clas)
if you encounter an issue.
2. Please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
3. Check if your changes are consistent with the [TensorFlow coding style](https://www.tensorflow.org/community/contribute/code_style).
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-NLP layers package definition."""
from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bert encoder network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
from official.nlp.keras_nlp import layers
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class BertEncoder(tf.keras.Model):
"""Bi-directional Transformer-based encoder network.
This network implements a bi-directional Transformer-based encoder as
described in "BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
embedding lookups and transformer layers, but not the masked language model
or classification task networks.
The default values for this object are taken from the BERT-Base implementation
in "BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding".
*Note* that the network is constructed by
[Keras Functional API](https://keras.io/guides/functional_api/).
Arguments:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers
within the transformer layers.
initializer: The initializer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
"""
def __init__(
self,
vocab_size,
hidden_size=768,
num_layers=12,
num_attention_heads=12,
max_sequence_length=512,
type_vocab_size=16,
inner_dim=3072,
inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
output_dropout=0.1,
attention_dropout=0.1,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
output_range=None,
embedding_width=None,
**kwargs):
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
self._self_setattr_tracking = False
self._config_dict = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
}
word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_word_ids')
mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_mask')
type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_type_ids')
if embedding_width is None:
embedding_width = hidden_size
self._embedding_layer = self._build_embedding_layer()
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
self._position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
self._type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
type_embeddings = self._type_embedding_layer(type_ids)
embeddings = tf.keras.layers.Add()(
[word_embeddings, position_embeddings, type_embeddings])
self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
embeddings = self._embedding_norm_layer(embeddings)
embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
if embedding_width != hidden_size:
self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
embeddings = self._embedding_projection(embeddings)
self._transformer_layers = []
data = embeddings
attention_mask = layers.SelfAttentionMask()(data, mask)
encoder_outputs = []
for i in range(num_layers):
if i == num_layers - 1 and output_range is not None:
transformer_output_range = output_range
else:
transformer_output_range = None
layer = layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
self._transformer_layers.append(layer)
data = layer([data, attention_mask])
encoder_outputs.append(data)
first_token_tensor = (
tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
encoder_outputs[-1]))
self._pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
cls_output = self._pooler_layer(first_token_tensor)
outputs = dict(
sequence_output=encoder_outputs[-1],
pooled_output=cls_output,
encoder_outputs=encoder_outputs,
)
super(BertEncoder, self).__init__(
inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
def get_embedding_table(self):
return self._embedding_layer.embeddings
def _build_embedding_layer(self):
embedding_width = self._config_dict[
'embedding_width'] or self._config_dict['hidden_size']
return layers.OnDeviceEmbedding(
vocab_size=self._config_dict['vocab_size'],
embedding_width=embedding_width,
initializer=self._config_dict['initializer'],
name='word_embeddings')
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return self._config_dict
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for transformer-based bert encoder network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.encoders import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(BertEncoderTest, self).tearDown()
tf.keras.mixed_precision.experimental.set_policy("float32")
def test_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# If float_dtype is set to float16, the data output is float32 (from a layer
# norm) and pool output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence", None, 21),
("output_range", 1, 1),
)
def test_network_invocation(self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
vocab_size=100,
hidden_size=32,
num_layers=3,
num_attention_heads=2,
max_sequence_length=21,
type_vocab_size=12,
inner_dim=1223,
inner_activation="relu",
output_dropout=0.05,
attention_dropout=0.22,
initializer="glorot_uniform",
output_range=-1,
embedding_width=16)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
tf.keras.activations.get(expected_config["inner_activation"]))
expected_config["initializer"] = tf.keras.initializers.serialize(
tf.keras.initializers.get(expected_config["initializer"]))
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = bert_encoder.BertEncoder.from_config(network.get_config())
# Validate that the config can be forced to JSON.
_ = network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
# Tests model saving/loading.
model_path = self.get_temp_dir() + "/model"
network.save(model_path)
_ = tf.keras.models.load_model(model_path)
if __name__ == "__main__":
tf.test.main()
@@ -13,6 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 """Keras-NLP layers package definition."""
+from official.nlp.keras_nlp.layers.masked_lm import MaskedLM
+from official.nlp.keras_nlp.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding
 from official.nlp.keras_nlp.layers.self_attention_mask import SelfAttentionMask
 from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Masked language model network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class MaskedLM(tf.keras.layers.Layer):
"""Masked language model network head for BERT modeling.
This layer implements a masked language model based on the provided
transformer-based encoder. It assumes that the encoder network being passed
has a "get_embedding_table()" method.
Example:
```python
encoder=keras_nlp.BertEncoder(...)
lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table())
```
Arguments:
embedding_table: The embedding table from encoder network.
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
uniform initializer.
output: The output style for this layer. Can be either 'logits' or
'predictions'.
"""
def __init__(self,
embedding_table,
activation=None,
initializer='glorot_uniform',
output='logits',
name=None,
**kwargs):
super(MaskedLM, self).__init__(name=name, **kwargs)
self.embedding_table = embedding_table
self.activation = activation
self.initializer = tf.keras.initializers.get(initializer)
if output not in ('predictions', 'logits'):
raise ValueError(
('Unknown `output` value "%s". `output` can be either "logits" or '
'"predictions"') % output)
self._output_type = output
def build(self, input_shape):
self._vocab_size, hidden_size = self.embedding_table.shape
self.dense = tf.keras.layers.Dense(
hidden_size,
activation=self.activation,
kernel_initializer=self.initializer,
name='transform/dense')
self.layer_norm = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='transform/LayerNorm')
self.bias = self.add_weight(
'output_bias/bias',
shape=(self._vocab_size,),
initializer='zeros',
trainable=True)
super(MaskedLM, self).build(input_shape)
def call(self, sequence_data, masked_positions):
masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
lm_data = self.dense(masked_lm_input)
lm_data = self.layer_norm(lm_data)
lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
logits = tf.nn.bias_add(lm_data, self.bias)
masked_positions_length = masked_positions.shape[1] or tf.shape(
masked_positions)[1]
logits = tf.reshape(logits,
[-1, masked_positions_length, self._vocab_size])
if self._output_type == 'logits':
return logits
return tf.nn.log_softmax(logits)
def get_config(self):
raise NotImplementedError('MaskedLM cannot be directly serialized because '
'it has variable sharing logic.')
def _gather_indexes(self, sequence_tensor, positions):
"""Gathers the vectors at the specific positions.
Args:
sequence_tensor: Sequence output of `BertModel` layer of shape
(`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
hidden units of `BertModel` layer.
positions: Position ids of tokens in the sequence to mask for pretraining,
with dimension (batch_size, num_predictions), where `num_predictions` is
the maximum number of tokens to mask out and predict per sequence.
Returns:
Masked out sequence tensor of shape (batch_size * num_predictions,
num_hidden).
"""
sequence_shape = tf.shape(sequence_tensor)
batch_size, seq_length = sequence_shape[0], sequence_shape[1]
width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]
flat_offsets = tf.reshape(
tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence_tensor = tf.reshape(sequence_tensor,
[batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor
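An illustrative sketch (not part of the commit) wiring `MaskedLM` to an encoder's embedding table and calling it with sequence output and masked positions; the sizes are arbitrary:

```python
import tensorflow as tf
from official.nlp.keras_nlp.encoders import bert_encoder
from official.nlp.keras_nlp.layers import masked_lm

encoder = bert_encoder.BertEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=2)
lm_layer = masked_lm.MaskedLM(
    embedding_table=encoder.get_embedding_table(), output='logits')

batch_size, seq_length = 2, 16
word_ids = tf.random.uniform((batch_size, seq_length), maxval=100, dtype=tf.int32)
mask = tf.ones((batch_size, seq_length), dtype=tf.int32)
type_ids = tf.zeros((batch_size, seq_length), dtype=tf.int32)

sequence_output = encoder([word_ids, mask, type_ids])['sequence_output']
masked_positions = tf.constant([[1, 3, 5, 7], [0, 2, 4, 6]], dtype=tf.int32)
logits = lm_layer(sequence_output, masked_positions)  # [batch, num_predictions, vocab_size]
```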
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based one-hot embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class OnDeviceEmbedding(tf.keras.layers.Layer):
"""Performs an embedding lookup suitable for accelerator devices.
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
Arguments:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
lookup. Defaults to False (that is, using tf.gather). Setting this option
to True may improve performance, especially on small vocabulary sizes, but
will generally require more memory.
scale_factor: Whether to scale the output embeddings. Defaults to None (that
is, no scaling). Setting this option to a float multiplies the output
embeddings by scale_factor.
"""
def __init__(self,
vocab_size,
embedding_width,
initializer="glorot_uniform",
use_one_hot=False,
scale_factor=None,
**kwargs):
super(OnDeviceEmbedding, self).__init__(**kwargs)
self._vocab_size = vocab_size
self._embedding_width = embedding_width
self._initializer = initializer
self._use_one_hot = use_one_hot
self._scale_factor = scale_factor
def get_config(self):
config = {
"vocab_size": self._vocab_size,
"embedding_width": self._embedding_width,
"initializer": self._initializer,
"use_one_hot": self._use_one_hot,
"scale_factor": self._scale_factor,
}
base_config = super(OnDeviceEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self.embeddings = self.add_weight(
"embeddings",
shape=[self._vocab_size, self._embedding_width],
initializer=self._initializer,
dtype=tf.float32)
super(OnDeviceEmbedding, self).build(input_shape)
def call(self, inputs):
flat_inputs = tf.reshape(inputs, [-1])
if self._use_one_hot:
one_hot_data = tf.one_hot(
flat_inputs, depth=self._vocab_size, dtype=self.embeddings.dtype)
embeddings = tf.matmul(one_hot_data, self.embeddings)
else:
embeddings = tf.gather(self.embeddings, flat_inputs)
embeddings = tf.reshape(
embeddings,
# Work around b/142213824: prefer concat to shape over a Python list.
tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
if self._scale_factor:
embeddings *= self._scale_factor
return embeddings
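An illustrative use of the new `scale_factor` option (values arbitrary); this mirrors the sqrt(d)-style scaling exercised in the test change that follows:

```python
import tensorflow as tf
from official.nlp.keras_nlp.layers import on_device_embedding

embedding_width = 27
layer = on_device_embedding.OnDeviceEmbedding(
    vocab_size=31,
    embedding_width=embedding_width,
    scale_factor=embedding_width**0.5)  # Transformer-style sqrt(d) scaling

token_ids = tf.constant([[1, 2, 3], [4, 5, 6]])
embeddings = layer(token_ids)  # shape [2, 3, 27]; values multiplied by sqrt(27)
```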
@@ -18,7 +18,7 @@ import numpy as np
 import tensorflow as tf
 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
-from official.nlp.modeling.layers import on_device_embedding
+from official.nlp.keras_nlp.layers import on_device_embedding
 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
@@ -192,7 +192,8 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
 vocab_size = 31
 embedding_width = 27
 test_layer = on_device_embedding.OnDeviceEmbedding(
-vocab_size=vocab_size, embedding_width=embedding_width, use_scale=True)
+vocab_size=vocab_size, embedding_width=embedding_width,
+scale_factor=embedding_width**0.5)
 # Create a 2-dimensional input (the first dimension is implicit).
 sequence_length = 23
 input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
...
@@ -17,7 +17,7 @@
 import tensorflow as tf
-@tf.keras.utils.register_keras_serializable(package="Text")
+@tf.keras.utils.register_keras_serializable(package="keras_nlp")
 class PositionEmbedding(tf.keras.layers.Layer):
 """Creates a positional embedding.
...
@@ -14,11 +14,10 @@
 # ==============================================================================
 """Keras-based TransformerEncoder block layer."""
-# Import libraries
 import tensorflow as tf
-@tf.keras.utils.register_keras_serializable(package="Text")
+@tf.keras.utils.register_keras_serializable(package="keras_nlp")
 class TransformerEncoderBlock(tf.keras.layers.Layer):
 """TransformerEncoderBlock layer.
@@ -241,6 +240,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
 input_tensor, attention_mask = (inputs, None)
 if self._output_range:
+  if self._norm_first:
+    source_tensor = input_tensor[:, 0:self._output_range, :]
+    input_tensor = self._attention_layer_norm(input_tensor)
 target_tensor = input_tensor[:, 0:self._output_range, :]
 attention_mask = attention_mask[:, 0:self._output_range, :]
 else:
...
@@ -14,11 +14,6 @@
 # ==============================================================================
 """Tests for Keras-based transformer block layer."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-# Import libraries
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf
@@ -142,6 +137,34 @@ class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):
 self.assertAllClose(
 new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
+def test_layer_output_range_with_pre_norm(self, transformer_cls):
+  test_layer = transformer_cls(
+      num_attention_heads=10, inner_dim=2048,
+      inner_activation='relu', norm_first=True)
+  sequence_length = 21
+  width = 80
+  batch_size = 6
+  input_data = 10 * np.random.random_sample(
+      (batch_size, sequence_length, width))
+  mask_data = np.random.randint(
+      2, size=(batch_size, sequence_length, sequence_length))
+  output_tensor = test_layer([input_data, mask_data])
+  # The layer only attends to the first token and outputs the first token
+  # embedding.
+  new_layer = transformer_cls(
+      num_attention_heads=10,
+      inner_dim=2048,
+      inner_activation='relu',
+      output_range=1,
+      norm_first=True)
+  _ = new_layer([input_data, mask_data])
+  new_layer.set_weights(test_layer.get_weights())
+  new_output_tensor = new_layer([input_data, mask_data])
+  self.assertAllClose(
+      new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
 def test_layer_invocation_with_float16_dtype(self, transformer_cls):
 tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
 test_layer = transformer_cls(
...