Commit 2e9bb539 authored by stephenwu

Merge branch 'master' of https://github.com/tensorflow/models into RTESuperGLUE

parents 7bae5317 8fba84f8
......@@ -145,6 +145,11 @@ def run_continuous_finetune(
min_interval_secs=10,
timeout=params.trainer.continuous_eval_timeout,
timeout_fn=timeout_fn):
# If there are checkpoints, they might be the finetune checkpoint of a
# different pretrained checkpoint. So we just remove all checkpoints.
train_utils.remove_ckpts(model_dir)
with distribution_strategy.scope():
global_step = train_utils.read_global_step_from_checkpoint(pretrain_ckpt)
# Replaces params.task.init_checkpoint to make sure that we load
......
......@@ -90,6 +90,9 @@ class ContinuousFinetuneTest(tf.test.TestCase, parameterized.TestCase):
pretrain_steps=pretrain_steps)
self.assertIn('best_acc', eval_metrics)
self.assertFalse(
tf.io.gfile.exists(os.path.join(FLAGS.model_dir, 'checkpoint')))
if __name__ == '__main__':
tf.test.main()
......@@ -1307,7 +1307,7 @@ class AXgProcessor(DataProcessor):
"""Creates examples for the training/dev/test sets."""
examples = []
for line in lines:
guid = "%s-%s" % (set_type, self.process_text_fn(str(line['idx'])))
guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"])))
text_a = self.process_text_fn(line["premise"])
text_b = self.process_text_fn(line["hypothesis"])
label = self.process_text_fn(line["label"])
......@@ -1315,7 +1315,8 @@ class AXgProcessor(DataProcessor):
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class RTESuperGLUEProcessor(DataProcessor):
class SuperGLUERTEProcessor(DataProcessor):
"""Processor for the RTE dataset (SuperGLUE version)."""
def get_train_examples(self, data_dir):
......@@ -1349,16 +1350,17 @@ class RTESuperGLUEProcessor(DataProcessor):
examples = []
for i, line in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = self.process_text_fn(line['premise'])
text_b = self.process_text_fn(line['hypothesis'])
text_a = self.process_text_fn(line["premise"])
text_b = self.process_text_fn(line["hypothesis"])
if set_type == "test":
label = "entailment"
else:
label = self.process_text_fn(line['label'])
label = self.process_text_fn(line["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def file_based_convert_examples_to_features(examples,
label_list,
max_seq_length,
......
......@@ -49,8 +49,8 @@ flags.DEFINE_string(
flags.DEFINE_enum(
"classification_task_name", "MNLI", [
"AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
"SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X", "AX-g",
"RTE-SuperGLUE"
"SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
"AX-g", "SUPERGLUE-RTE"
], "The name of the task to train BERT classifier. The "
"difference between XTREME-XNLI and XNLI is: 1. the format "
"of input tsv files; 2. the dev set for XTREME is english "
......@@ -242,8 +242,8 @@ def generate_classifier_dataset():
only_use_en_dev=FLAGS.only_use_en_dev),
"ax-g":
classifier_data_lib.AXgProcessor,
"rte-superglue":
classifier_data_lib.RTESuperGLUEProcessor
"superglue-rte":
classifier_data_lib.SuperGLUERTEProcessor
}
task_name = FLAGS.classification_task_name.lower()
if task_name not in processors:
......
......@@ -164,6 +164,7 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
input_path='' if use_tfds else tf_record_path,
tfds_name='glue/mrpc' if use_tfds else '',
tfds_split='train' if use_tfds else '',
tfds_download=True,
text_fields=text_fields,
global_batch_size=batch_size,
seq_length=seq_length,
......@@ -195,6 +196,7 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
input_path='' if use_tfds else tf_record_path,
tfds_name='glue/mrpc' if use_tfds else '',
tfds_split='train' if use_tfds else '',
tfds_download=True,
text_fields=text_fields,
global_batch_size=batch_size,
seq_length=seq_length,
......@@ -228,6 +230,7 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
input_path='' if use_tfds else tf_record_path,
tfds_name='glue/mrpc' if use_tfds else '',
tfds_split='train' if use_tfds else '',
tfds_download=True,
text_fields=text_fields,
global_batch_size=batch_size,
seq_length=seq_length,
......
......@@ -14,7 +14,7 @@
# ==============================================================================
"""Input pipeline for the transformer model to read, filter, and batch examples.
1. Batching scheme
Batching scheme
Prior to batching, elements in the dataset are grouped by length (max between
'inputs' and 'targets' length). Each group is then batched such that:
......@@ -60,8 +60,8 @@ def _create_min_max_boundaries(max_length,
For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
returned values will be:
buckets_min = [0, 4, 8, 16, 24]
buckets_max = [4, 8, 16, 24, 25]
buckets_min = [0, 4, 8, 16]
buckets_max = [4, 8, 16, 25]
Args:
max_length: The maximum length of example in dataset.
......
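For reference, the corrected example values follow from the doubling rule described in the docstring above. Here is a hypothetical standalone sketch of that rule (not the module's actual code), which reproduces the boundaries for max_length=24:

```python
def create_min_max_boundaries(max_length, min_boundary=4, boundary_scale=2):
  """Sketch of the bucketing rule: boundaries grow by boundary_scale
  until they reach max_length; the final bucket absorbs max_length."""
  boundaries = []
  x = min_boundary
  while x < max_length:
    boundaries.append(x)
    x = max(x + 1, int(x * boundary_scale))
  buckets_min = [0] + boundaries               # inclusive lower bound per bucket
  buckets_max = boundaries + [max_length + 1]  # exclusive upper bound per bucket
  return buckets_min, buckets_max

print(create_min_max_boundaries(24))  # ([0, 4, 8, 16], [4, 8, 16, 25])
```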
# Exporting a pre-trained Encoder to TF Hub
## Overview
This doc explains how to use TF-NLP's
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
tool to export pre-trained Transformer encoders to SavedModels suitable for
publication on TF Hub. (For the steps after that, see TF Hub's
[publisher guide](https://www.tensorflow.org/hub/publish).)
For testing purposes, those SavedModels can also be used from their export
locations on the filesystem.
On TF Hub, Transformer encoders for text come as a pair of SavedModels:
* The preprocessing model applies a tokenizer with a fixed vocab plus some
additional logic to turn text into Transformer inputs.
* The encoder model (or "model" for short) applies the pre-trained Transformer
encoder.
TF Hub defines
[Common APIs](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders)
for all SavedModels of those two respective types, encapsulating the particular
choice of preprocessing logic and Encoder architecture.
## Exporting the Encoder
There is a choice between exporting just the encoder and exporting the encoder
plus the prediction head for the masked language model (MLM) task from
pre-training.
Exporting just the encoder suffices for many straightforward applications.
### Exporting the Encoder alone
To export an encoder-only model, you can set `--export_type=model` and run the
tool like this:
```shell
python official/nlp/tools/export_tfhub.py \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model \
--export_path=/tmp/bert_model
```
The flag `--encoder_config_file` refers to a YAML file representing the
[encoders.EncoderConfig](https://github.com/tensorflow/models/search?q=EncoderConfig+path%3Aofficial%2Fnlp%2Fconfigs+filename%3Aencoders.py)
dataclass, which supports multiple encoders (e.g., BERT, ALBERT). Instead of
`--encoder_config_file`, you can set `--bert_config_file` to a legacy
`bert_config.json` file to export a BERT model. If the model definition involves
[GIN](https://github.com/google/gin-config), the flags `--gin_file` and
`--gin_params` must be set accordingly, consistent with pre-training.
The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(encoder=encoder)` for the encoder defined by the config
flags. Legacy checkpoints with `model=` instead of `encoder=` are also supported
for now.
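As a sketch of what a compatible checkpoint looks like, the following writes one that restores to `tf.train.Checkpoint(encoder=encoder)`. It assumes `encoders.build_encoder` from official/nlp/configs/encoders.py and uses a hypothetical output path; in practice the config must match what the flags describe:

```python
import tensorflow as tf
from official.nlp.configs import encoders

# Build an encoder; in practice this config should match the YAML passed
# to --encoder_config_file (or the legacy --bert_config_file).
encoder = encoders.build_encoder(encoders.EncoderConfig())

# Write a checkpoint that export_tfhub can restore as
# tf.train.Checkpoint(encoder=encoder).
tf.train.Checkpoint(encoder=encoder).write("/tmp/bert_dir/bert_model.ckpt")
```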
The exported SavedModel expects dict inputs and outputs as follows, implementing
a specialization of the respective
[Common SavedModel API](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders):
```python
encoder = hub.load(...)
encoder_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
)
encoder_outputs = encoder(encoder_inputs)
assert encoder_outputs.keys() == {
"pooled_output", # Shape [batch_size, width], dtype=float32
"default", # Alias for "pooled_output" (aligns with other models)
"sequence_output", # Shape [batch_size, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers
}
```
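For instance, pairing the exported encoder with a matching preprocessing model (exported as described below) might look like the following sketch, which reuses the placeholder export paths from this doc:

```python
import tensorflow as tf
import tensorflow_hub as hub

# Placeholder paths from the export commands in this doc.
preprocessor = hub.load("/tmp/bert_preprocessing")
encoder = hub.load("/tmp/bert_model")

sentences = tf.constant(["The quick brown fox.", "Hello world."])
encoder_inputs = preprocessor(sentences)     # uses --default_seq_length
encoder_outputs = encoder(encoder_inputs)
pooled = encoder_outputs["pooled_output"]    # shape [2, width]
```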
The encoder's pooler layer is restored from the `--model_checkpoint_path`.
However, unlike classic BERT, `BertPretrainerV2` does not train the pooler layer
of the encoder. You have three options to handle that:
* Set flag `--copy_pooler_dense_to_encoder` to copy the pooling layer from the
`ClassificationHead` passed to `BertPretrainerV2` for the next sentence
prediction task. This mimics classic BERT, but is not recommended for new
models (see next item).
* Leave flag `--copy_pooler_dense_to_encoder` unset and export the untrained,
randomly initialized pooling layer of the encoder. Folklore (as of 2020) has
it that an untrained pooler gets fine-tuned better than a pre-trained
pooler, so this is the default.
* Leave flag `--copy_pooler_dense_to_encoder` unset and perform your own
initialization of the pooling layer before export. For example, Google's
[BERT Experts](https://tfhub.dev/google/collections/experts/bert/1)
published in October 2020 initialize it to the identity map, reporting equal
gains when fine-tuning, and more predictable behavior when not.
In any case, at this time, the export tool requires the encoder model to *have*
a `pooled_output`, whether trained or not. (This can be revised in the future.)
The encoder model does not include any preprocessing logic, but for the benefit
of users who take preprocessing into their own hands, the relevant information
is attached via the flags `--vocab_file` or `--sp_model_file` (whichever
applies) and `--do_lower_case`, which must be set in exactly the same way as for
the preprocessing model (see below).
The exported SavedModel stores the resulting values as attributes on its root
object:
```python
encoder = hub.load(...)
# Gets the filename of the respective tf.saved_model.Asset object.
if hasattr(encoder, "vocab_file"):
print("Wordpiece vocab at", encoder.vocab_file.asset_path.numpy())
elif hasattr(encoder, "sp_model_file"):
print("SentencePiece model at", encoder.sp_model_file.asset_path.numpy())
# Gets the value of a scalar bool tf.Variable.
print("...using do_lower_case =", encoder.do_lower_case.numpy())
```
New users are encouraged to ignore these attributes and use the preprocessing
model instead. However, there are legacy users, as well as advanced users who
require access to the full vocab.
### Exporting the Encoder with a Masked Language Model head
To export an encoder and the masked language model it was trained with, first
read the preceding section about exporting just the encoder. All the
explanations there on setting the right flags apply here as well, up to the
following differences.
The masked language model is added to the export by changing flag
`--export_type` from `model` to `model_with_mlm`, so the export command looks
like this:
```shell
python official/nlp/tools/export_tfhub.py \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model_with_mlm \
--export_path=/tmp/bert_model
```
The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)` with the encoder
defined by the config flags.
This is a more comprehensive requirement on the checkpoint than for
`--export_type=model`; not all Transformer encoders and not all pre-training
techniques can satisfy it. For example,
[ELECTRA](https://arxiv.org/abs/2003.10555) uses the BERT architecture but is
pre-trained without an MLM task.
The root object of the exported SavedModel is called in the same way as above.
In addition, the SavedModel has an `mlm` subobject that can be called as follows
to output an `mlm_logits` tensor as well:
```python
mlm_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
masked_lm_positions=..., # Shape [batch, num_predictions], dtype=int32
)
mlm_outputs = encoder.mlm(mlm_inputs)
assert mlm_outputs.keys() == {
"pooled_output", # Shape [batch, width], dtype=float32
"sequence_output", # Shape [batch, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers
"mlm_logits" # Shape [batch, num_predictions, vocab_size], dtype=float32
}
```
The extra subobject imposes a moderate size overhead.
### Exporting from a TF1 BERT checkpoint
A BERT model trained with the
[original BERT implementation for TF1](https://github.com/google-research/bert)
can be exported after converting its checkpoint with the
[tf2_encoder_checkpoint_converter](https://github.com/tensorflow/models/blob/master/official/nlp/bert/tf2_encoder_checkpoint_converter.py)
tool.
After that, run
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
per the instructions above on the converted checkpoint. Do not set
`--copy_pooler_dense_to_encoder`, because the pooler layer is part of the
converted encoder. For `--vocab_file` and `--do_lower_case`, the values from TF1
BERT can be used verbatim.
## Exporting the preprocessing model
You can skip this step if TF Hub already has a preprocessing model that does
exactly what your encoder needs (same tokenizer, same vocab, same
`do_lower_case` normalization setting). You can inspect its collection of
[Transformer Encoders for Text](https://tfhub.dev/google/collections/transformer_encoders_text/1)
and click through to models with a similar input domain to find their
preprocessing models.
To export the preprocessing model, set `--export_type=preprocessing` and run the
export tool like this:
```shell
python official/nlp/tools/export_tfhub.py \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--do_lower_case=True \
--export_type=preprocessing \
--export_path=/tmp/bert_preprocessing
```
Note: When exporting for users of the public TensorFlow 2.4.x releases, set flag
`--experimental_disable_assert_in_preprocessing` to avoid a fatal ops placement
issue when preprocessing is used within `Dataset.map()` on TPU workers.
This is not an issue with TF 2.3 and TF 2.5+.
Flag `--vocab_file` specifies the vocab file used with
[BertTokenizer](https://github.com/tensorflow/models/search?q=BertTokenizer+filename%3Atext_layers.py).
For models that use the
[SentencepieceTokenizer](https://github.com/tensorflow/models/search?q=SentencepieceTokenizer+filename%3Atext_layers.py),
set flag `--sp_model_file` instead.
The boolean flag `--do_lower_case` controls text normalization (as in the
respective tokenizer classes, so it's a bit more than just smashing case). If
unset, `do_lower_case` is enabled if `uncased` appears in `--vocab_file`, or
unconditionally if `--sp_model_file` is set, mimicking the conventions of BERT
and ALBERT, respectively. For programmatic use, or if in doubt, it's best to set
`--do_lower_case` explicitly.
If the definition of preprocessing involved
[GIN](https://github.com/google/gin-config),
the flags `--gin_file` and `--gin_params` would have to be set accordingly,
consistent with pre-training. (At the time of this writing, no such GIN
configurables exist in the code.)
The exported SavedModel can be called in the following way for a single-segment
input.
```python
preprocessor = hub.load(...)
text_input = ... # Shape [batch_size], dtype=tf.string
encoder_inputs = preprocessor(text_input, seq_length=seq_length)
assert encoder_inputs.keys() == {
"input_word_ids", # Shape [batch_size, seq_length], dtype=int32
"input_mask", # Shape [batch_size, seq_length], dtype=int32
"input_type_ids" # Shape [batch_size, seq_length], dtype=int32
}
```
Flag `--default_seq_length` controls the value of `seq_length` if that argument
is omitted in the usage example above. The flag defaults to 128, because
multiples of 128 work best for Cloud TPUs, yet the cost of attention computation
grows quadratically with `seq_length`.
Beyond this example, the exported SavedModel implements the full preprocessor
interface for text embeddings with preprocessed inputs and for Transformer
encoders, as defined in TF Hub's
[Common APIs for text](https://www.tensorflow.org/hub/common_saved_model_apis/text).
Please see
[tfhub.dev/tensorflow/bert_en_uncased_preprocess](https://tfhub.dev/tensorflow/bert_en_uncased_preprocess)
for the full documentation of one preprocessing model exported with this tool,
especially how custom trimming of inputs can happen between `.tokenize` and
`.bert_pack_inputs`.
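As a sketch of what such custom trimming might look like (the segment texts, token budget, and export path here are hypothetical; see the linked documentation for the exact `.tokenize`/`.bert_pack_inputs` contract):

```python
import tensorflow as tf
import tensorflow_hub as hub

preprocessor = hub.load("/tmp/bert_preprocessing")  # placeholder path
premises = tf.constant(["The first segment."])
hypotheses = tf.constant(["The second segment."])

# Tokenize each segment separately into RaggedTensors of wordpiece ids.
tokenized = [preprocessor.tokenize(s) for s in (premises, hypotheses)]
# Hypothetical custom trimming: keep at most the first 40 items of the
# ragged token dimension per segment, instead of the packer's default.
trimmed = [t[:, :40] for t in tokenized]
# Pack the trimmed segments into fixed-size encoder inputs.
encoder_inputs = preprocessor.bert_pack_inputs(trimmed, seq_length=128)
```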
Using the `encoder.mlm()` interface requires masking of tokenized inputs by user
code. The necessary information on the vocabulary encapsulated in the
preprocessing model can be obtained like this (uniformly across tokenizers):
```python
special_tokens_dict = preprocessor.tokenize.get_special_tokens_dict()
vocab_size = int(special_tokens_dict["vocab_size"])
padding_id = int(special_tokens_dict["padding_id"]) # [PAD] or <pad>
start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"]) # [CLS]
end_of_segment_id = int(special_tokens_dict["end_of_segment_id"]) # [SEP]
mask_id = int(special_tokens_dict["mask_id"]) # [MASK]
```
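A minimal sketch of such user-side masking follows. The choice of masked positions is hypothetical (real code must skip special tokens), and `preprocessor`/`encoder` are assumed to be loaded from the placeholder export paths used above:

```python
import tensorflow as tf
import tensorflow_hub as hub

preprocessor = hub.load("/tmp/bert_preprocessing")  # placeholder paths
encoder = hub.load("/tmp/bert_model")  # exported with --export_type=model_with_mlm

packed = preprocessor(tf.constant(["hello world"]))
special = preprocessor.tokenize.get_special_tokens_dict()

# Hypothetical choice of positions to mask (must avoid [CLS]/[SEP]/[PAD]).
masked_lm_positions = tf.constant([[1, 2]], dtype=tf.int32)

# Overwrite the chosen positions with the [MASK] id.
word_ids = packed["input_word_ids"]
batch_idx = tf.zeros_like(masked_lm_positions)
indices = tf.stack([batch_idx, masked_lm_positions], axis=-1)
mask_id = tf.cast(special["mask_id"], word_ids.dtype)
updates = tf.fill(tf.shape(masked_lm_positions), mask_id)
masked_word_ids = tf.tensor_scatter_nd_update(word_ids, indices, updates)

mlm_outputs = encoder.mlm(dict(
    input_word_ids=masked_word_ids,
    input_mask=packed["input_mask"],
    input_type_ids=packed["input_type_ids"],
    masked_lm_positions=masked_lm_positions))
logits = mlm_outputs["mlm_logits"]  # shape [1, 2, vocab_size]
```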
## Testing the exported models
Please test your SavedModels before publication by fine-tuning them on a
suitable task and comparing performance and accuracy to a baseline experiment
built from equivalent Python code.
The
[trainer doc](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
has instructions on how to run BERT on MNLI and other tasks from the GLUE
benchmark.
......@@ -27,7 +27,7 @@ MultiHeadAttention = tf.keras.layers.MultiHeadAttention
class CachedAttention(tf.keras.layers.MultiHeadAttention):
"""Attention layer with cache used for auto-agressive decoding.
Arguments are the same as `MultiHeadAttention` layer.
Arguments are the same as `tf.keras.layers.MultiHeadAttention` layer.
"""
def _update_cache(self, key, value, cache, decode_loop_step):
......
......@@ -24,7 +24,7 @@ _CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"]
@tf.keras.utils.register_keras_serializable(package="Text")
class DenseEinsum(tf.keras.layers.Layer):
"""A densely connected layer that uses tf.einsum as the backing computation.
"""A densely connected layer that uses `tf.einsum` as the backing computation.
This layer can perform einsum calculations of arbitrary dimensionality.
......
......@@ -33,8 +33,8 @@ class GatedFeedforward(tf.keras.layers.Layer):
intermediate_activation: Activation for the intermediate layer.
dropout: Dropout probability for the output dropout.
use_gate: Whether to use gated linear units. If True, assuming `GELU` as the
activation and omitting bias, will apply `GEGLU(x, W, V, W_2) = (GEGLU(xW)
* xV)W2`; if False, will follow
activation and omitting bias, will apply
`GEGLU(x, W, V, W_2) = (GEGLU(xW) * xV)W2`; if False, will follow
"Attention Is All You Need" (https://arxiv.org/abs/1706.03762) paper and
apply `FFN(x, W, W_2) = GELU(xW_1)W_2.`
num_blocks: The number of feedforward blocks to stack. Each block contains a
......@@ -43,8 +43,8 @@ class GatedFeedforward(tf.keras.layers.Layer):
dropout_position: Where to apply the dropout, the value can be either
`before_residual` or `after_residual`. If `before_residual`, will apply
`layer_output = layer_norm(dropout(layer_output) + layer_input)`; if
`after residual`, will apply `layer_output =
dropout(layer_norm(layer_output + layer_input))`.
`after residual`, will apply
`layer_output = dropout(layer_norm(layer_output + layer_input))`.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
......
......@@ -22,7 +22,7 @@ def _large_compatible_negative(tensor_type):
"""Large negative number as Tensor.
This function is necessary because the standard value for epsilon
in this module (-1e9) cannot be represented using tf.float16
in this module (-1e9) cannot be represented using `tf.float16`.
Args:
tensor_type: a dtype to determine the type.
......
......@@ -75,7 +75,7 @@ class MultiHeadRelativeAttention(tf.keras.layers.MultiHeadAttention):
"""A multi-head attention layer with relative attention + position encoding.
This layer shares the same input/output projections as the common
MultiHeadAttention layer.
`tf.keras.layers.MultiHeadAttention` layer.
When it calculates attention logits, position encoding is projected to form
relative keys. The logits are composed by shifted relative logits and content
......@@ -333,8 +333,9 @@ class TwoStreamRelativeAttention(MultiHeadRelativeAttention):
The query stream only has access to contextual information and the position,
but not the content.
This layer shares the same build signature as `MultiHeadRelativeAttention` but
has different input/output projections.
This layer shares the same build signature as
`tf.keras.layers.MultiHeadAttention` but has different input/output
projections.
**Note: This layer is currently experimental.
......
......@@ -23,7 +23,7 @@ from official.nlp.keras_nlp import layers
class SelfAttentionMask(layers.SelfAttentionMask):
"""Create 3D attention mask from a 2D tensor mask.
**Warning: Please use the keras_nlp.layers.SelfAttentionMask.**
**Warning: Please use the `keras_nlp.layers.SelfAttentionMask`.**
inputs[0]: from_tensor: 2D or 3D Tensor of shape
[batch_size, from_seq_length, ...].
inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
......
......@@ -33,7 +33,7 @@ class TalkingHeadsAttention(tf.keras.layers.MultiHeadAttention):
multi-head attention by including linear projections across the attention-heads
dimension, immediately before and after the softmax operation.
See the base class `MultiHeadAttention` for more details.
See the base class `tf.keras.layers.MultiHeadAttention` for more details.
Args:
num_heads: Number of attention heads.
......
......@@ -97,8 +97,9 @@ class BertTokenizer(tf.keras.layers.Layer):
"""Wraps BertTokenizer with pre-defined vocab as a Keras Layer.
Attributes:
tokenize_with_offsets: If true, calls BertTokenizer.tokenize_with_offsets()
instead of plain .tokenize() and outputs a triple of
tokenize_with_offsets: If true, calls
`text.BertTokenizer.tokenize_with_offsets()` instead of plain
`text.BertTokenizer.tokenize()` and outputs a triple of
(tokens, start_offsets, limit_offsets).
raw_table_access: An object with methods .lookup(keys) and .size()
that operate on the raw lookup table of tokens. It can be used to
......@@ -110,25 +111,26 @@ class BertTokenizer(tf.keras.layers.Layer):
lower_case: bool,
tokenize_with_offsets: bool = False,
**kwargs):
"""Initialize a BertTokenizer layer.
"""Initialize a `BertTokenizer` layer.
Args:
vocab_file: A Python string with the path of the vocabulary file.
This is a text file with newline-separated wordpiece tokens.
This layer initializes a lookup table from it that gets used with
text.BertTokenizer.
lower_case: A Python boolean forwarded to text.BertTokenizer.
`text.BertTokenizer`.
lower_case: A Python boolean forwarded to `text.BertTokenizer`.
If true, input text is converted to lower case (where applicable)
before tokenization. This must be set to match the way in which
the vocab_file was created.
tokenize_with_offsets: A Python boolean. If true, this layer calls
BertTokenizer.tokenize_with_offsets() instead of plain .tokenize()
and outputs a triple of (tokens, start_offsets, limit_offsets)
`text.BertTokenizer.tokenize_with_offsets()` instead of plain
`text.BertTokenizer.tokenize()` and outputs a triple of
(tokens, start_offsets, limit_offsets)
instead of just tokens.
**kwargs: standard arguments to Layer().
Raises:
ImportError: if importing tensorflow_text failed.
ImportError: if importing `tensorflow_text` failed.
"""
_check_if_tf_text_installed()
......@@ -162,18 +164,18 @@ class BertTokenizer(tf.keras.layers.Layer):
return vocab_table, vocab_initializer
def call(self, inputs: tf.Tensor):
"""Calls text.BertTokenizer on inputs.
"""Calls `text.BertTokenizer` on inputs.
Args:
inputs: A string Tensor of shape [batch_size].
Returns:
One or three of RaggedTensors if tokenize_with_offsets is False or True,
respectively. These are
tokens: A RaggedTensor of shape [batch_size, (words), (pieces_per_word)]
One or three of `RaggedTensors` if `tokenize_with_offsets` is False or
True, respectively. These are
tokens: A `RaggedTensor` of shape [batch_size, (words), (pieces_per_word)]
and type int32. tokens[i,j,k] contains the k-th wordpiece of the
j-th word in the i-th input.
start_offsets, limit_offsets: If tokenize_with_offsets is True,
start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
RaggedTensors of type int64 with the same indices as tokens.
Element [i,j,k] contains the byte offset at the start, or past the
end, resp., for the k-th wordpiece of the j-th word in the i-th input.
......
......@@ -202,7 +202,7 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, self.num_attention_heads))
self.attention_head_size = int(hidden_size / self.num_attention_heads)
self.attention_head_size = int(hidden_size) // self.num_attention_heads
common_kwargs = dict(
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
......
......@@ -30,7 +30,7 @@ from official.nlp.modeling import networks
class BertPretrainer(tf.keras.Model):
"""BERT pretraining model.
[Note] Please use the new BertPretrainerV2 for your projects.
[Note] Please use the new `BertPretrainerV2` for your projects.
The BertPretrainer allows a user to pass in a transformer stack, and
instantiates the masked language model and classification networks that are
......
......@@ -37,7 +37,7 @@ class ElectraPretrainer(tf.keras.Model):
that are used to create the training objectives.
*Note* that the model is constructed by Keras Subclass API, where layers are
defined inside __init__ and call() implements the computation.
defined inside `__init__` and `call()` implements the computation.
Args:
generator_network: A transformer network for generator, this network should
......
......@@ -591,5 +591,6 @@ class TransformerDecoder(tf.keras.layers.Layer):
def attention_initializer(hidden_size):
"""Initializer for attention layers in Seq2SeqTransformer."""
hidden_size = int(hidden_size)
limit = math.sqrt(6.0 / (hidden_size + hidden_size))
return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
......@@ -243,7 +243,6 @@ class EncoderScaffold(tf.keras.Model):
self._position_embedding_layer = position_embedding_layer
self._type_embedding_layer = type_embedding_layer
self._embedding_norm_layer = embedding_norm_layer
self._embedding_network = embedding_network
self._hidden_layers = hidden_layers
if self._layer_norm_before_pooling:
self._output_layer_norm = output_layer_norm
......